using Microsoft.Extensions.Logging; using Microsoft.Extensions.Logging.Abstractions; using ZB.MOM.WW.MxGateway.Client; using ZB.MOM.WW.MxGateway.Contracts.Proto; using ZB.MOM.WW.OtOpcUa.Core.Abstractions; using ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Browse; using ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Config; using ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Health; using ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Runtime; namespace ZB.MOM.WW.OtOpcUa.Driver.Galaxy; /// /// In-process .NET 10 Galaxy driver — the only Galaxy backend since PR 7.2 retired /// the legacy Galaxy.Host / Galaxy.Proxy / Galaxy.Shared /// projects and the OtOpcUaGalaxyHost Windows service. Implements the full /// capability surface: , , /// , , , /// , and . Galaxy /// access flows through the in-process driver over gRPC to the separately /// installed mxaccessgw gateway (sibling repo), which owns the MXAccess /// COM apartment server-side. /// /// /// /// Registered as a Tier A in-process driver alongside Modbus / S7 / etc. via /// under driver-type name /// "GalaxyMxGateway". /// /// /// Tests inject capability seams (, /// , , /// , , /// ) via the internal ctor so capability flow /// can be exercised without a real gw round-trip. /// /// public sealed class GalaxyDriver : IDriver, ITagDiscovery, IReadable, IWritable, ISubscribable, IRediscoverable, IHostConnectivityProbe, IAlarmSource, IDisposable, IAsyncDisposable { private readonly string _driverInstanceId; private readonly GalaxyDriverOptions _options; private readonly ILogger _logger; // PR 4.1 — IGalaxyHierarchySource is the test seam for browse. When null, the driver // lazily builds a GatewayGalaxyHierarchySource around a GalaxyRepositoryClient on // first DiscoverAsync. Tests inject a fake source via the internal ctor to exercise // GalaxyDiscoverer's translation logic without a real gRPC channel. private IGalaxyHierarchySource? _hierarchySource; private GalaxyRepositoryClient? _ownedRepositoryClient; // PR 4.2 — IGalaxyDataReader is the test seam for IReadable. 
// PR 4.4 supplies the production read path: with no injected reader, ReadAsync
    // synthesises the read through the subscriber + event pump and only throws
    // NotSupportedException when no subscriber is available either.
    private IGalaxyDataReader? _dataReader;

    // PR 4.3 — write-side test seam (IWritable). The production runtime build wraps the
    // GalaxyMxSession in a GatewayGalaxyDataWriter (Write / WriteSecured routing). The
    // SecurityClassification map is filled per tag during ITagDiscovery and looked up
    // again at write time.
    // NOTE(review): generic type arguments look stripped throughout this file's text
    // (ConcurrentDictionary / List / EventHandler below); declarations are kept as found.
    private IGalaxyDataWriter? _dataWriter;
    private readonly System.Collections.Concurrent.ConcurrentDictionary _securityByFullRef = new(StringComparer.OrdinalIgnoreCase);

    // PR 4.4 — subscription lifecycle. The event pump drains the gw event stream and
    // raises OnDataChange for every driver subscription registered against a handle
    // (via the registry's reverse map). The subscriber is the test seam; production
    // uses GatewayGalaxySubscriber on top of a connected GalaxyMxSession.
    private IGalaxySubscriber? _subscriber;
    private readonly SubscriptionRegistry _subscriptions = new();
    private EventPump? _eventPump;
    private readonly Lock _pumpLock = new();

    // IAlarmSource plumbing. In production, acknowledgements go through
    // GatewayGalaxyAlarmAcknowledger (session-less MxGatewayClient.AcknowledgeAlarmAsync)
    // and transitions arrive through GatewayGalaxyAlarmFeed (session-less StreamAlarms).
    // Tests hand in IGalaxyAlarmAcknowledger + IGalaxyAlarmFeed through the internal ctor
    // so the wiring can be exercised without a gateway. The driver forwards the feed's
    // OnAlarmTransition onto IAlarmSource.OnAlarmEvent.
    private IGalaxyAlarmAcknowledger? _alarmAcknowledger;
    private IGalaxyAlarmFeed? _alarmFeed;
    private readonly Lock _alarmHandlersLock = new();
    private readonly Lock _alarmFeedLock = new();
    private bool _alarmFeedWired;

    // A List keeps insertion order, so OnAlarmFeedTransition can always choose the
    // earliest-registered handle; that choice stays stable as handles come and go
    // (Driver.Galaxy-006: HashSet.First() was not stable).
    private readonly List _alarmSubscriptions = new();

    // PR 4.W — production runtime objects created by InitializeAsync when it opens a
    // real gw session. Tests that inject seams through the internal ctor never set these.
    private GalaxyMxSession? _ownedMxSession;
    private MxGatewayClient? _ownedMxClient;

    // PR 4.5 — reconnect supervisor; GetHealth reports Degraded while it is degraded.
    private ReconnectSupervisor? _supervisor;

    // PR 4.6 — IRediscoverable plumbing.
    private DeployWatcher? _deployWatcher;

    // PR 4.7 — IHostConnectivityProbe plumbing. The aggregator holds the merged
    // transport + per-platform view, the forwarder is driven from supervisor state
    // transitions, and the probe watcher subscribes ScanState for each discovered
    // platform and pushes value changes into the aggregator.
    private readonly HostStatusAggregator _hostStatuses = new();
    private HostConnectivityForwarder? _transportForwarder;
    private PerPlatformProbeWatcher? _probeWatcher;

    private DriverHealth _health = new(DriverState.Unknown, null, null);
    private bool _disposed;

    /// <summary>
    /// Server-pushed data-change notification, raised from the event pump's background
    /// loop. Keep handlers cheap (or hand off to another thread) so the gw event stream
    /// is not held up.
    /// </summary>
    public event EventHandler? OnDataChange;

    /// <summary>Raised when the gateway signals a deploy-time change (PR 4.6 DeployWatcher).</summary>
    public event EventHandler? OnRediscoveryNeeded;

    /// <summary>Raised when a host transitions Running ↔ Stopped (PR 4.7 HostStatusAggregator).</summary>
    public event EventHandler? OnHostStatusChanged;

    /// <summary>Alarm event surface of IAlarmSource.</summary>
    public event EventHandler? OnAlarmEvent;

    /// <summary>
    /// Production constructor: no seams are injected, so InitializeAsync builds the
    /// gateway-backed runtime.
    /// </summary>
    public GalaxyDriver(
        string driverInstanceId,
        GalaxyDriverOptions options,
        ILogger? logger = null)
        : this(
            driverInstanceId,
            options,
            hierarchySource: null,
            dataReader: null,
            dataWriter: null,
            subscriber: null,
            alarmAcknowledger: null,
            alarmFeed: null,
            logger: logger)
    {
    }

    /// <summary>
    /// Test-visible constructor. Any seam passed here is used as-is, which lets discovery,
    /// read, write and subscribe flows run against canned data instead of real gRPC channels.
    /// </summary>
    /// <exception cref="ArgumentException"><paramref name="driverInstanceId"/> is null, empty or whitespace.</exception>
    /// <exception cref="ArgumentNullException"><paramref name="options"/> is null.</exception>
    internal GalaxyDriver(
        string driverInstanceId,
        GalaxyDriverOptions options,
        IGalaxyHierarchySource? hierarchySource,
        IGalaxyDataReader? dataReader = null,
        IGalaxyDataWriter? dataWriter = null,
        IGalaxySubscriber? subscriber = null,
        IGalaxyAlarmAcknowledger? alarmAcknowledger = null,
        IGalaxyAlarmFeed? alarmFeed = null,
        ILogger? logger = null)
    {
        // Validate in the same order as before: instance id first, then options.
        if (string.IsNullOrWhiteSpace(driverInstanceId))
        {
            throw new ArgumentException("Driver instance id required.", nameof(driverInstanceId));
        }

        ArgumentNullException.ThrowIfNull(options);

        _driverInstanceId = driverInstanceId;
        _options = options;
        _logger = logger ?? NullLogger.Instance;

        _hierarchySource = hierarchySource;
        _dataReader = dataReader;
        _dataWriter = dataWriter;
        _subscriber = subscriber;
        _alarmAcknowledger = alarmAcknowledger;
        _alarmFeed = alarmFeed;

        // Re-raise the aggregator's transitions with this driver as the sender.
        _hostStatuses.OnHostStatusChanged += (_, e) => OnHostStatusChanged?.Invoke(this, e);
    }

    /// <summary>Instance id supplied at construction.</summary>
    public string DriverInstanceId => _driverInstanceId;

    /// <summary>Driver-type name, taken from <c>GalaxyDriverFactoryExtensions.DriverTypeName</c>.</summary>
    public string DriverType => GalaxyDriverFactoryExtensions.DriverTypeName;

    /// <summary>Test-visible options snapshot.</summary>
    internal GalaxyDriverOptions Options => _options;

    ///
    /// Test-visible entry into ReplayAsync. In production the reconnect supervisor
    /// drives the replay callback on a background task; tests prefer to invoke it
    /// directly so the ReplayOnSessionLost branch can be
    /// asserted deterministically (Driver.Galaxy-013).
///
    internal Task InvokeReplayForTestAsync(CancellationToken cancellationToken)
    {
        return ReplayAsync(cancellationToken);
    }

    /// <summary>
    /// Brings the driver up. With no seams injected the gateway-backed runtime is built;
    /// otherwise the injected seams are left alone. The deploy watcher is then started
    /// and health is set to Healthy.
    /// </summary>
    /// <exception cref="ObjectDisposedException">The driver has been disposed.</exception>
    public async Task InitializeAsync(string driverConfigJson, CancellationToken cancellationToken)
    {
        ObjectDisposedException.ThrowIf(_disposed, this);

        // Any of these three seams being present means a test wired the driver through
        // the internal ctor, so no real gw round-trip should be attempted.
        var seamsInjected =
            _subscriber is not null || _dataWriter is not null || _hierarchySource is not null;

        if (seamsInjected)
        {
            _logger.LogDebug(
                "GalaxyDriver {InstanceId} initializing with pre-injected seams — production runtime build skipped",
                _driverInstanceId);
        }
        else
        {
            await BuildProductionRuntimeAsync(cancellationToken).ConfigureAwait(false);
        }

        StartDeployWatcher();

        _logger.LogInformation(
            "GalaxyDriver {InstanceId} initialized — endpoint={Endpoint} clientName={ClientName}",
            _driverInstanceId,
            _options.Gateway.Endpoint,
            _options.MxAccess.ClientName);

        _health = new DriverHealth(DriverState.Healthy, DateTime.UtcNow, null);
    }

    /// <summary>
    /// Creates the gw client and session from <c>_options</c>, connects the session, and
    /// wires the per-capability components on top of it. The reconnect supervisor is
    /// given <c>ReopenAsync</c> / <c>ReplayAsync</c> as its reopen and replay callbacks.
    /// </summary>
    private async Task BuildProductionRuntimeAsync(CancellationToken cancellationToken)
    {
        var mxAccess = _options.MxAccess;
        var reconnect = _options.Reconnect;
        var clientName = mxAccess.ClientName;

        var clientOptions = BuildClientOptions(_options.Gateway);

        // Fields are assigned before ConnectAsync runs, exactly as before, so they are
        // already populated if the connect attempt throws.
        var client = MxGatewayClient.Create(clientOptions);
        _ownedMxClient = client;

        var session = new GalaxyMxSession(mxAccess, _logger);
        _ownedMxSession = session;

        await session.ConnectAsync(clientOptions, cancellationToken).ConfigureAwait(false);

        // PR 6.1 — the gw-facing seams are wrapped in tracing decorators so Subscribe /
        // Unsubscribe / Write / StreamEvents calls emit spans on the
        // "ZB.MOM.WW.OtOpcUa.Driver.Galaxy" ActivitySource, without the driver referencing
        // the OpenTelemetry packages itself.
        IGalaxySubscriber subscriber = new TracedGalaxySubscriber(
            new GatewayGalaxySubscriber(session),
            clientName);
        _subscriber = subscriber;

        _dataWriter = new TracedGalaxyDataWriter(
            new GatewayGalaxyDataWriter(session, mxAccess.WriteUserId, _logger),
            clientName);

        var reconnectOptions = new ReconnectOptions(
            InitialBackoffOverride: TimeSpan.FromMilliseconds(reconnect.InitialBackoffMs),
            MaxBackoffOverride: TimeSpan.FromMilliseconds(reconnect.MaxBackoffMs));

        var supervisor = new ReconnectSupervisor(
            reopen: ReopenAsync,
            replay: ReplayAsync,
            options: reconnectOptions,
            logger: _logger);
        _supervisor = supervisor;

        var forwarder = new HostConnectivityForwarder(clientName, _hostStatuses, _logger);
        _transportForwarder = forwarder;
        forwarder.SetTransport(HostState.Running); // the session has just connected

        supervisor.StateChanged += OnSupervisorStateChanged;

        _probeWatcher = new PerPlatformProbeWatcher(
            subscriber,
            _hostStatuses,
            _logger,
            bufferedUpdateIntervalMs: mxAccess.PublishingIntervalMs);

        // Alarm acknowledger + feed are session-less, so they are built on the owned
        // MxGatewayClient rather than the worker session. Pre-injected instances win.
        _alarmAcknowledger ??= new GatewayGalaxyAlarmAcknowledger(client, _logger);
        _alarmFeed ??= new GatewayGalaxyAlarmFeed(client.StreamAlarmsAsync, _logger, clientName);
    }

    ///
    /// Reopen callback for the ReconnectSupervisor: re-Register the gw session.
    /// If the session never connected, this is a fresh ConnectAsync; otherwise it's a
    /// reconnect against the existing client.
///
    private async Task ReopenAsync(CancellationToken cancellationToken)
    {
        var session = _ownedMxSession;
        if (session is null)
        {
            return; // no owned session (seams injected) — nothing to reopen
        }

        var clientOptions = BuildClientOptions(_options.Gateway);
        await session.ConnectAsync(clientOptions, cancellationToken).ConfigureAwait(false);
    }

    /// <summary>
    /// Replay callback. Recreates the event pump, then re-issues SubscribeBulk for every
    /// tracked subscription and rebinds the registry to the item handles the gateway
    /// returns (the pre-reconnect handles no longer apply once the session reopened).
    /// </summary>
    /// <remarks>
    /// The pump is restarted before anything else, including when no subscriptions are
    /// tracked. Previously the zero-subscription early return skipped the restart, so a
    /// pump that had been started earlier and then faulted stayed dead until the next
    /// reconnect that happened to have subscriptions (Driver.Galaxy-008 covers the same
    /// failure for the non-empty case). When <c>ReplayOnSessionLost</c> is false the
    /// per-tag SubscribeBulk fan-out is skipped (Driver.Galaxy-013).
    /// </remarks>
    private async Task ReplayAsync(CancellationToken cancellationToken)
    {
        var subscriber = _subscriber;
        if (subscriber is null)
        {
            return;
        }

        // The stream fault left the old pump's loop exited, and EventPump.Start() is a
        // no-op on a completed loop, so the pump has to be recreated. This is a no-op
        // when no pump was ever started.
        RestartEventPumpForReplay();

        var entries = _subscriptions.SnapshotEntries();
        if (entries.Count == 0)
        {
            return;
        }

        if (!_options.Reconnect.ReplayOnSessionLost)
        {
            _logger.LogInformation(
                "GalaxyDriver {InstanceId} reconnect replay skipped — ReplayOnSessionLost=false ({SubCount} subscriptions tracked)",
                _driverInstanceId, entries.Count);
            return;
        }

        var tagCount = 0;
        foreach (var (subscriptionId, oldBindings) in entries)
        {
            var refs = oldBindings
                .Select(b => b.FullReference)
                .Distinct(StringComparer.OrdinalIgnoreCase)
                .ToArray();
            if (refs.Length == 0)
            {
                continue;
            }

            var results = await subscriber
                .SubscribeBulkAsync(refs, _options.MxAccess.PublishingIntervalMs, cancellationToken)
                .ConfigureAwait(false);

            var byAddress = BuildResultIndex(results);
            var newBindings = new List<TagBinding>(refs.Length);
            foreach (var fullRef in refs)
            {
                // Handle 0 marks a reference the gateway did not (successfully) re-subscribe.
                var itemHandle = byAddress.TryGetValue(fullRef, out var match) && match.WasSuccessful
                    ? match.ItemHandle
                    : 0;
                newBindings.Add(new TagBinding(fullRef, itemHandle));
            }

            // Point the registry at the post-reconnect handles so events on them are
            // matched to this subscription.
            _subscriptions.Rebind(subscriptionId, newBindings);
            tagCount += refs.Length;
        }

        _logger.LogInformation(
            "GalaxyDriver {InstanceId} replay completed — {SubCount} subscriptions, {TagCount} tags re-subscribed",
            _driverInstanceId, entries.Count, tagCount);
    }

    /// <summary>
    /// Indexes a SubscribeBulk result list by tag address (OrdinalIgnoreCase) so callers
    /// can correlate references with a dictionary lookup. Results with a null/empty
    /// address are skipped; a duplicate address overwrites the earlier entry.
    /// </summary>
    // NOTE(review): the generic type arguments on this signature are missing from the
    // file's text and the result element type is not visible here — left as found.
    private static Dictionary BuildResultIndex(IReadOnlyList results)
    {
        var index = new Dictionary(results.Count, StringComparer.OrdinalIgnoreCase);
        foreach (var result in results)
        {
            if (!string.IsNullOrEmpty(result.TagAddress))
            {
                index[result.TagAddress] = result;
            }
        }

        return index;
    }

    /// <summary>
    /// Detaches and disposes the current event pump, then asks
    /// <c>EnsureEventPumpStarted</c> for a replacement. Does nothing when no pump exists,
    /// and does not create a replacement once the driver is disposed.
    /// </summary>
    private void RestartEventPumpForReplay()
    {
        EventPump? old;
        lock (_pumpLock)
        {
            old = _eventPump;
            if (old is null)
            {
                return; // pump never started — nothing to restart
            }

            _eventPump = null;
        }

        // Detach + dispose outside the lock so a slow shutdown cannot block a concurrent
        // EnsureEventPumpStarted.
        old.OnDataChange -= OnPumpDataChange;
        try
        {
            // NOTE(review): this blocks on an async dispose from the async replay path.
            // The method is kept synchronous because callers outside this view may exist.
            old.DisposeAsync().AsTask().GetAwaiter().GetResult();
        }
        catch (Exception ex)
        {
            _logger.LogWarning(ex,
                "GalaxyDriver {InstanceId} faulted EventPump dispose failed during replay — continuing.",
                _driverInstanceId);
        }

        // A pump created after disposal would outlive the driver.
        if (_disposed)
        {
            return;
        }

        EnsureEventPumpStarted();
    }

    /// <summary>
    /// Mirrors a supervisor state transition into <c>_health</c> and, when present, the
    /// transport forwarder: Healthy maps to Healthy / Running, anything else to
    /// Degraded / Stopped.
    /// </summary>
    private void OnSupervisorStateChanged(object? sender, StateTransition transition)
    {
        var isHealthy = transition.Next == ReconnectSupervisor.State.Healthy;

        // The Degraded branch carries the previous LastSuccessfulRead forward.
        _health = isHealthy
            ? new DriverHealth(DriverState.Healthy, DateTime.UtcNow, null)
            : new DriverHealth(DriverState.Degraded, _health.LastSuccessfulRead, transition.Cause);

        _transportForwarder?.SetTransport(isHealthy ? HostState.Running : HostState.Stopped);
    }

    ///
    /// Resolves Gateway.ApiKeySecretRef to the actual API-key bytes. Four
    /// forms supported, evaluated in order:
    ///
    /// env:NAME — reads Environment.GetEnvironmentVariable(NAME).
    /// Throws when the variable is unset, so a misconfigured deployment fails
    /// fast at InitializeAsync rather than silently sending an empty key.
    /// file:PATH — reads UTF-8 text from PATH, trimming
    /// whitespace. Lets operators stash the key in an ACL'd file outside the
    /// repo (the same pattern as the legacy .local/galaxy-host-secret.txt).
    /// dev:KEY — explicit cleartext literal.
/// The dev: prefix
    /// is a deliberate opt-in signal (dev box, parity rig) so the resolver
    /// doesn't emit a warning; production should never use this arm. A dev: prefix
    /// with no key after it is rejected rather than resolved to an empty key.
    /// Anything else — used as the literal API key for back-compat with
    /// configs that pre-date this resolver. When a logger is supplied the
    /// resolver emits a startup warning so an operator who accidentally
    /// committed a cleartext key sees it (Driver.Galaxy-010).
    ///
    /// A future PR can swap any of these arms for a DPAPI-backed lookup without
    /// changing the call site.
    ///
    internal static string ResolveApiKey(string secretRef) => ResolveApiKey(secretRef, logger: null);

    /// <summary>
    /// Logger-aware overload. Logs a warning only when <paramref name="secretRef"/>
    /// falls through to the unprefixed literal arm; <c>env:</c>, <c>file:</c> and
    /// <c>dev:</c> never warn.
    /// </summary>
    /// <param name="secretRef">Secret reference; prefixes are matched case-insensitively.</param>
    /// <param name="logger">Optional sink for the cleartext-literal warning.</param>
    /// <returns>The resolved, non-empty API key.</returns>
    /// <exception cref="ArgumentException"><paramref name="secretRef"/> is null or empty.</exception>
    /// <exception cref="InvalidOperationException">
    /// The env var is unset/empty, the file is missing or empty after trimming, or a
    /// <c>dev:</c> prefix carries no key.
    /// </exception>
    internal static string ResolveApiKey(string secretRef, ILogger? logger)
    {
        ArgumentException.ThrowIfNullOrEmpty(secretRef);

        if (secretRef.StartsWith("env:", StringComparison.OrdinalIgnoreCase))
        {
            var name = secretRef[4..];
            var value = Environment.GetEnvironmentVariable(name);
            return !string.IsNullOrEmpty(value)
                ? value
                : throw new InvalidOperationException(
                    $"Galaxy.Gateway.ApiKeySecretRef='{secretRef}' resolves to env var '{name}', but it is unset.");
        }

        if (secretRef.StartsWith("file:", StringComparison.OrdinalIgnoreCase))
        {
            var path = secretRef[5..];
            if (!File.Exists(path))
            {
                throw new InvalidOperationException(
                    $"Galaxy.Gateway.ApiKeySecretRef='{secretRef}' points at '{path}', which doesn't exist.");
            }

            var contents = File.ReadAllText(path).Trim();
            return !string.IsNullOrEmpty(contents)
                ? contents
                : throw new InvalidOperationException(
                    $"Galaxy.Gateway.ApiKeySecretRef='{secretRef}' file '{path}' is empty.");
        }

        if (secretRef.StartsWith("dev:", StringComparison.OrdinalIgnoreCase))
        {
            // Explicit dev opt-in — no warning. Fix: a bare "dev:" used to resolve to an
            // empty key and be sent as-is; fail fast like the env:/file: arms instead.
            var literal = secretRef[4..];
            return !string.IsNullOrEmpty(literal)
                ? literal
                : throw new InvalidOperationException(
                    "Galaxy.Gateway.ApiKeySecretRef uses the 'dev:' prefix but no key follows it.");
        }

        // Back-compat literal arm: the whole string is the key. Warn so a cleartext key
        // committed by accident is visible at startup; dev: is the way to silence this.
        logger?.LogWarning(
            "Galaxy.Gateway.ApiKeySecretRef is being treated as a literal cleartext API key. " +
            "Prefer env:NAME, file:PATH, or the explicit dev:KEY prefix for dev rigs — " +
            "a literal key in DriverConfig JSON is stored in cleartext in the central config DB.");
        return secretRef;
    }

    /// <summary>
    /// Maps the gateway options onto <c>MxGatewayClientOptions</c>, resolving the API key
    /// through <see cref="ResolveApiKey(string, ILogger?)"/>. A non-positive stream
    /// timeout is passed on as <c>null</c>.
    /// </summary>
    private MxGatewayClientOptions BuildClientOptions(GalaxyGatewayOptions gw) => new()
    {
        Endpoint = new Uri(gw.Endpoint, UriKind.Absolute),
        // Driver.Galaxy-010: the logger is passed so the literal-arm fallback warns.
        ApiKey = ResolveApiKey(gw.ApiKeySecretRef, _logger),
        UseTls = gw.UseTls,
        CaCertificatePath = gw.CaCertificatePath,
        ConnectTimeout = TimeSpan.FromSeconds(gw.ConnectTimeoutSeconds),
        DefaultCallTimeout = TimeSpan.FromSeconds(gw.DefaultCallTimeoutSeconds),
        StreamTimeout = gw.StreamTimeoutSeconds > 0
            ? TimeSpan.FromSeconds(gw.StreamTimeoutSeconds)
            : null,
    };

    /// <summary>
    /// Starts the deploy watcher when <c>Repository.WatchDeployEvents</c> is set, and
    /// forwards its <c>OnRediscoveryNeeded</c> through this driver's event.
    /// </summary>
    private void StartDeployWatcher()
    {
        if (!_options.Repository.WatchDeployEvents)
        {
            return;
        }

        // NOTE(review): this guard returns when BOTH the repository client and the
        // hierarchy source are null — which looks like the state of a production driver
        // at InitializeAsync time, before the first DiscoverAsync. The comment below
        // ("build the client here") suggests the opposite intent; confirm whether the
        // guard was meant to skip only the test-injected-source case. Behaviour unchanged.
        if (_ownedRepositoryClient is null && _hierarchySource is null)
        {
            return;
        }

        // Reuse the repository client if one already exists; otherwise build it here so
        // the watcher has a target. Driver.Galaxy-009: ??= keeps a later
        // BuildDefaultHierarchySource from overwriting (and leaking) this instance.
        _ownedRepositoryClient ??= MxGateway.Client.GalaxyRepositoryClient.Create(
            BuildClientOptions(_options.Gateway));

        var source = new GatewayGalaxyDeployWatchSource(_ownedRepositoryClient);
        _deployWatcher = new DeployWatcher(source, _logger);
        _deployWatcher.OnRediscoveryNeeded += (_, args) => OnRediscoveryNeeded?.Invoke(this, args);

        // Driver.Galaxy-009: the returned task is observed rather than discarded, so a
        // fault captured in an already-completed task is rethrown here.
        var startTask = _deployWatcher.StartAsync(CancellationToken.None);
        if (startTask.IsFaulted)
        {
            startTask.GetAwaiter().GetResult();
        }
    }

    /// <summary>
    /// In-place config reapply. Hot-swapping options is not supported: an empty/blank
    /// payload or a payload equivalent to the live options just refreshes health, and
    /// anything else is refused so the caller knows nothing was applied
    /// (Driver.Galaxy-013: the payload used to be ignored silently).
    /// </summary>
    /// <exception cref="ObjectDisposedException">The driver has been disposed.</exception>
    /// <exception cref="NotSupportedException">
    /// The payload could not be parsed, or it differs from the live options.
    /// </exception>
    public Task ReinitializeAsync(string driverConfigJson, CancellationToken cancellationToken)
    {
        ObjectDisposedException.ThrowIf(_disposed, this);

        if (!string.IsNullOrWhiteSpace(driverConfigJson))
        {
            GalaxyDriverOptions incoming;
            try
            {
                // Parse through the factory so the diagnostics match driver creation.
                // The transient instance never starts a runtime; `using` guarantees it
                // is disposed even if reading its options were to throw.
                using (var transient = GalaxyDriverFactoryExtensions.CreateInstance(_driverInstanceId, driverConfigJson))
                {
                    incoming = transient.Options;
                }
            }
            catch (Exception ex) when (ex is not NotSupportedException and not ObjectDisposedException)
            {
                throw new NotSupportedException(
                    $"GalaxyDriver.ReinitializeAsync could not parse the incoming DriverConfig JSON for '{_driverInstanceId}': {ex.Message}",
                    ex);
            }

            if (!OptionsAreEquivalent(_options, incoming))
            {
                throw new NotSupportedException(
                    "GalaxyDriver.ReinitializeAsync does not support hot-swapping driver options at runtime " +
                    "(gateway endpoint, MxAccess client name, reconnect policy, etc.). Restart the DriverInstance " +
                    "through the host stack to apply a config change.");
            }
        }

        _health = new DriverHealth(DriverState.Healthy, DateTime.UtcNow, null);
        return Task.CompletedTask;
    }

    /// <summary>
    /// Runtime-equivalence check for two option sets; delegates to <c>==</c>.
    /// </summary>
    // NOTE(review): this relies on GalaxyDriverOptions (and everything nested in it)
    // having value equality — presumably records; confirm no collection-typed members.
    private static bool OptionsAreEquivalent(GalaxyDriverOptions a, GalaxyDriverOptions b) => a == b;

    /// <summary>
    /// Logs the shutdown and resets health to Unknown, keeping the last successful read
    /// time. Returns immediately when the driver is already disposed.
    /// </summary>
    public Task ShutdownAsync(CancellationToken cancellationToken)
    {
        if (_disposed)
        {
            return Task.CompletedTask;
        }

        _logger.LogInformation("GalaxyDriver {InstanceId} shutting down", _driverInstanceId);
        _health = new DriverHealth(DriverState.Unknown, _health.LastSuccessfulRead, null);
        return Task.CompletedTask;
    }

    /// <summary>
    /// Current health. A degraded reconnect supervisor takes precedence over the cached
    /// value and is reported as Degraded with the supervisor's last error.
    /// </summary>
    public DriverHealth GetHealth()
    {
        // Read the field once so the null check and the LastError read see the same instance.
        var supervisor = _supervisor;
        if (supervisor is { IsDegraded: true })
        {
            return new DriverHealth(DriverState.Degraded, _health.LastSuccessfulRead, supervisor.LastError);
        }

        return _health;
    }

    // ===== IHostConnectivityProbe (PR 4.7 wire-up) =====

    /// <summary>Snapshot of the aggregator's current host statuses.</summary>
    public IReadOnlyList GetHostStatuses() => _hostStatuses.Snapshot();

    /// <summary>
    /// Estimated footprint: 64 bytes per tracked item handle plus 256 bytes per tracked
    /// driver subscription; 0 when nothing is tracked. The constants are deliberately
    /// conservative estimates (Driver.Galaxy-011).
    /// </summary>
    public long GetMemoryFootprint()
    {
        const long BytesPerItemHandle = 64L;    // TagBinding + reverse-map entry
        const long BytesPerSubscription = 256L; // SubscriptionEntry overhead

        var handleBytes = _subscriptions.TrackedItemHandleCount * BytesPerItemHandle;
        var subscriptionBytes = _subscriptions.TrackedSubscriptionCount * BytesPerSubscription;
        return handleBytes + subscriptionBytes;
    }

    /// <summary>No optional caches are held; completes immediately.</summary>
    public Task FlushOptionalCachesAsync(CancellationToken cancellationToken) => Task.CompletedTask;

    // ===== ITagDiscovery (PR 4.1) =====

    /// <summary>
    /// Runs discovery into <paramref name="builder"/>, capturing each attribute's
    /// SecurityClassification on the way (PR 4.3). When a probe watcher exists, its
    /// platform membership is then re-synced from the hierarchy (PR 4.W).
    /// </summary>
    /// <exception cref="ObjectDisposedException">The driver has been disposed.</exception>
    /// <exception cref="ArgumentNullException"><paramref name="builder"/> is null.</exception>
    public async Task DiscoverAsync(IAddressSpaceBuilder builder, CancellationToken cancellationToken)
    {
        ObjectDisposedException.ThrowIf(_disposed, this);
        ArgumentNullException.ThrowIfNull(builder);

        var capturingBuilder = new SecurityCapturingBuilder(builder, _securityByFullRef);
        var source = _hierarchySource ??= BuildDefaultHierarchySource();
        var discoverer = new GalaxyDiscoverer(source);
        await discoverer.DiscoverAsync(capturingBuilder, cancellationToken).ConfigureAwait(false);

        var probeWatcher = _probeWatcher;
        if (probeWatcher is null)
        {
            return;
        }

        var hierarchy = await source.GetHierarchyAsync(cancellationToken).ConfigureAwait(false);

        // Materialised once so the watcher never re-runs the query on re-enumeration.
        var platforms = hierarchy
            .Where(o => o.TemplateChain.Any(IsPlatformTemplate))
            .Select(o => o.TagName)
            .Where(name => !string.IsNullOrEmpty(name))
            .ToArray();

        await probeWatcher.SyncPlatformsAsync(platforms, cancellationToken).ConfigureAwait(false);

        // Objects derived from $WinPlatform or $AppEngine are the ones whose ScanState is probed.
        static bool IsPlatformTemplate(string template) =>
            string.Equals(template, "$WinPlatform", StringComparison.OrdinalIgnoreCase)
            || string.Equals(template, "$AppEngine", StringComparison.OrdinalIgnoreCase);
    }

    /// <summary>
    /// Security classification captured for a reference during discovery, or
    /// <c>FreeAccess</c> when none was captured.
    /// </summary>
    private SecurityClassification ResolveSecurity(string fullReference) =>
        _securityByFullRef.TryGetValue(fullReference, out var sec)
            ? sec
            : SecurityClassification.FreeAccess;

    // ===== IReadable =====

    /// <summary>
    /// Reads the given references. An injected reader is used when present; otherwise
    /// the read is synthesised through the subscriber. An empty request returns an
    /// empty result without touching either.
    /// </summary>
    /// <exception cref="ObjectDisposedException">The driver has been disposed.</exception>
    /// <exception cref="ArgumentNullException"><paramref name="fullReferences"/> is null.</exception>
    /// <exception cref="NotSupportedException">Neither a reader nor a subscriber is available.</exception>
    // NOTE(review): generic type arguments on this signature are missing from the
    // file's text; the signature and the FromResult call are left as found.
    public Task> ReadAsync(
        IReadOnlyList fullReferences,
        CancellationToken cancellationToken)
    {
        ObjectDisposedException.ThrowIf(_disposed, this);
        ArgumentNullException.ThrowIfNull(fullReferences);

        if (fullReferences.Count == 0)
        {
            return Task.FromResult>([]);
        }

        if (_dataReader is not null)
        {
            // Test-only path — a canned reader injected through the internal ctor.
            return _dataReader.ReadAsync(fullReferences, cancellationToken);
        }

        if (_subscriber is null)
        {
            throw new NotSupportedException(
                "GalaxyDriver.ReadAsync requires a connected GalaxyMxSession (production runtime not built). " +
                "Either inject a test seam via the internal ctor or call InitializeAsync against a real gateway.");
        }

        return ReadViaSubscribeOnceAsync(fullReferences, cancellationToken);
    }

    ///
    /// Production read path. MxAccess has no one-shot Read RPC — every value comes
    /// through the event stream. We synthesise a Read by:
    ///
    /// Subscribing the requested tags through the existing
    /// IGalaxySubscriber + EventPump.
/// Waiting for the first OnDataChange per item handle (the gateway
    /// pushes the current value as the initial event after a SubscribeBulk).
    /// Unsubscribing.
    ///
    /// Tags the gw rejects at SubscribeBulk time, or that never publish before the
    /// caller's cancellation token fires, return a Bad-status snapshot in input order
    /// so the caller still sees one snapshot per requested reference.
    ///
    private async Task<IReadOnlyList<DataValueSnapshot>> ReadViaSubscribeOnceAsync(
        IReadOnlyList<string> fullReferences, CancellationToken cancellationToken)
    {
        var pump = EnsureEventPumpStarted();
        var subscriptionId = _subscriptions.NextSubscriptionId();

        // Pre-allocate one TaskCompletionSource per full-reference so the OnDataChange
        // handler can complete them out-of-order as events arrive. Wired BEFORE the
        // SubscribeBulk call so we don't race with the first event the gw pushes.
        var pendingByRef = new Dictionary<string, TaskCompletionSource<DataValueSnapshot>>(
            StringComparer.OrdinalIgnoreCase);
        foreach (var fullRef in fullReferences.Distinct(StringComparer.OrdinalIgnoreCase))
        {
            pendingByRef[fullRef] = new TaskCompletionSource<DataValueSnapshot>(
                TaskCreationOptions.RunContinuationsAsynchronously);
        }

        EventHandler<DataChangeEventArgs> handler = (_, args) =>
        {
            // Filter to OUR subscription — the pump's OnDataChange fans out across all
            // subscriptions on the driver, and we don't want a parallel ISubscribable
            // caller's events to leak into our read.
            if (args.SubscriptionHandle is GalaxySubscriptionHandle gsh
                && gsh.SubscriptionId == subscriptionId
                && pendingByRef.TryGetValue(args.FullReference, out var tcs))
            {
                tcs.TrySetResult(args.Snapshot);
            }
        };

        // Declared outside the try so the finally can unsubscribe whatever handles were
        // bound before a failure. It stays empty when SubscribeBulk itself throws.
        var bindings = new List<TagBinding>(fullReferences.Count);

        pump.OnDataChange += handler;
        try
        {
            // Everything after the handler is attached runs inside this try, so a throw
            // from SubscribeBulk, result indexing, or registry registration can no longer
            // leak the handler or leave gateway handles subscribed.
            var results = await _subscriber!
                .SubscribeBulkAsync(fullReferences, _options.MxAccess.PublishingIntervalMs, cancellationToken)
                .ConfigureAwait(false);

            // Register bindings so the pump knows to dispatch events for these handles.
            // Driver.Galaxy-012: index the SubscribeBulk results once and correlate to
            // references in O(1) instead of FirstOrDefault per element (O(n²) over the
            // batch). On the 50k-tag soak path this turns a 2.5G-comparison loop into
            // a single Dictionary build + linear scan.
            var resultIndex = BuildResultIndex(results);
            for (var i = 0; i < fullReferences.Count; i++)
            {
                var fullRef = fullReferences[i];
                var itemHandle = resultIndex.TryGetValue(fullRef, out var match) && match is { WasSuccessful: true }
                    ? match.ItemHandle
                    : 0;
                bindings.Add(new TagBinding(fullRef, itemHandle));

                // Tags the gw rejected up front — complete with Bad status now so the
                // wait below doesn't time out on them.
                if (itemHandle <= 0 && pendingByRef.TryGetValue(fullRef, out var rejectedTcs))
                {
                    rejectedTcs.TrySetResult(BadSnapshot(0x80000000u)); // Bad
                }
            }

            _subscriptions.Register(subscriptionId, bindings);

            // Wait for every pending TCS to complete or the caller's CT to fire. When the
            // CT fires before all values arrive, fill the still-pending entries with a
            // Bad-status snapshot rather than throwing — Read semantics let callers see
            // partial results.
            using var registration = cancellationToken.Register(() =>
            {
                foreach (var tcs in pendingByRef.Values)
                {
                    tcs.TrySetResult(BadSnapshot(0x800B0000u)); // BadTimeout
                }
            });

            var snapshots = new DataValueSnapshot[fullReferences.Count];
            for (var i = 0; i < fullReferences.Count; i++)
            {
                snapshots[i] = await pendingByRef[fullReferences[i]].Task.ConfigureAwait(false);
            }
            return snapshots;
        }
        finally
        {
            pump.OnDataChange -= handler;

            // Drop the bindings + unsubscribe the live handles. UnsubscribeBulkAsync's
            // failure isn't fatal — the registry is already cleared, so any straggling
            // event from the gw would be a no-op fan-out. Remove is also reached when
            // the id was never registered; UnsubscribeAsync relies on the same
            // "never registered => null" behaviour.
            _subscriptions.Remove(subscriptionId);
            var liveHandles = bindings.Where(b => b.ItemHandle > 0).Select(b => b.ItemHandle).ToArray();
            if (liveHandles.Length > 0)
            {
                try
                {
                    await _subscriber!.UnsubscribeBulkAsync(liveHandles, CancellationToken.None)
                        .ConfigureAwait(false);
                }
                catch (Exception ex)
                {
                    _logger.LogWarning(ex,
                        "GalaxyDriver.ReadViaSubscribeOnceAsync UnsubscribeBulk failed for {Count} handle(s) — registry already cleared.",
                        liveHandles.Length);
                }
            }
        }

        // Value-less snapshot carrying only a status code, stamped with the current UTC time.
        static DataValueSnapshot BadSnapshot(uint statusCode) => new DataValueSnapshot(
            Value: null,
            StatusCode: statusCode,
            SourceTimestampUtc: null,
            ServerTimestampUtc: DateTime.UtcNow);
    }

    // ===== IWritable (PR 4.3) =====

    /// <inheritdoc />
    public Task<IReadOnlyList<WriteResult>> WriteAsync(
        IReadOnlyList<WriteRequest> writes, CancellationToken cancellationToken)
    {
        ObjectDisposedException.ThrowIf(_disposed, this);
        ArgumentNullException.ThrowIfNull(writes);
        if (writes.Count == 0) return Task.FromResult<IReadOnlyList<WriteResult>>([]);

        if (_dataWriter is null)
        {
            // Mirror the IReadable fallback: production write path runs on top of
            // GalaxyMxSession (PR 4.2 skeleton; PR 4.4 wires the live session). Until
            // that lands, deployments selecting Galaxy:Backend=mxgateway can't write.
            throw new NotSupportedException(
                "GalaxyDriver.WriteAsync requires GatewayGalaxyDataWriter wired against a connected " +
                "GalaxyMxSession (PR 4.4). Until that lands, route writes through the legacy-host " +
                "backend (Galaxy:Backend=legacy-host).");
        }

        // ResolveSecurity is handed to the writer as the per-tag classification lookup.
        return _dataWriter.WriteAsync(writes, ResolveSecurity, cancellationToken);
    }

    // ===== ISubscribable (PR 4.4) =====

    /// <inheritdoc />
    public async Task<ISubscriptionHandle> SubscribeAsync(
        IReadOnlyList<string> fullReferences, TimeSpan publishingInterval, CancellationToken cancellationToken)
    {
        ObjectDisposedException.ThrowIf(_disposed, this);
        ArgumentNullException.ThrowIfNull(fullReferences);

        if (_subscriber is null)
        {
            throw new NotSupportedException(
                "GalaxyDriver.SubscribeAsync requires a connected GalaxyMxSession + GatewayGalaxySubscriber. " +
                "PR 4.W wires the production session; until then route subscriptions through the legacy-host backend.");
        }

        // Make sure the shared pump is running. The driver's _eventPump field holds it,
        // so no local reference is needed here.
        EnsureEventPumpStarted();
        var subscriptionId = _subscriptions.NextSubscriptionId();

        if (fullReferences.Count == 0)
        {
            // Empty subscriptions register but never bind anything — keeps Unsubscribe
            // symmetric for callers that conditionally add tags later.
            _subscriptions.Register(subscriptionId, []);
            return new GalaxySubscriptionHandle(subscriptionId);
        }

        // PR 6.3 — when the caller doesn't set a publishing interval (TimeSpan.Zero or
        // negative), fall back to the configured MxAccess.PublishingIntervalMs. The
        // server's UA subscription publishingInterval drives this in production; tests
        // and infrastructure callers (probe watcher, deploy watcher) hit the fallback.
        // Clamp before the cast: an interval larger than int.MaxValue ms would otherwise
        // overflow the double -> int conversion and silently select the fallback.
        var requested = (int)Math.Clamp(publishingInterval.TotalMilliseconds, 0d, int.MaxValue);
        var bufferedIntervalMs = requested > 0
            ? requested
            : _options.MxAccess.PublishingIntervalMs;

        var results = await _subscriber
            .SubscribeBulkAsync(fullReferences, bufferedIntervalMs, cancellationToken)
            .ConfigureAwait(false);

        // Build the binding list in input order. Failed entries (gw rejected the tag) are
        // recorded with a non-positive ItemHandle so the caller can detect partial failure
        // by inspecting the returned handle's diagnostic context — full per-tag error
        // surface lands in PR 5.3's parity tests.
        // Driver.Galaxy-012: index results once, correlate in O(1) per reference rather
        // than FirstOrDefault inside the loop (O(n²) on the 50k-tag path).
        var resultIndex = BuildResultIndex(results);
        var bindings = new List<TagBinding>(fullReferences.Count);
        for (var i = 0; i < fullReferences.Count; i++)
        {
            var fullRef = fullReferences[i];
            var hasMatch = resultIndex.TryGetValue(fullRef, out var match);
            var itemHandle = hasMatch && match is { WasSuccessful: true } ? match.ItemHandle : 0;
            bindings.Add(new TagBinding(fullRef, itemHandle));
            if (match is null || !match.WasSuccessful)
            {
                _logger.LogWarning(
                    "Galaxy subscribe for {FullRef} failed: {Error}",
                    fullRef, match?.ErrorMessage ?? "");
            }
        }

        _subscriptions.Register(subscriptionId, bindings);
        return new GalaxySubscriptionHandle(subscriptionId);
    }

    /// <inheritdoc />
    public async Task UnsubscribeAsync(ISubscriptionHandle handle, CancellationToken cancellationToken)
    {
        ObjectDisposedException.ThrowIf(_disposed, this);
        ArgumentNullException.ThrowIfNull(handle);
        if (handle is not GalaxySubscriptionHandle gsh)
        {
            throw new ArgumentException(
                $"Subscription handle was not issued by this driver (expected GalaxySubscriptionHandle, got {handle.GetType().Name}).",
                nameof(handle));
        }

        var bindings = _subscriptions.Remove(gsh.SubscriptionId);
        if (bindings is null) return; // already removed or never registered

        // Only positive item handles are sent to the gateway; rejected tags were
        // recorded with a non-positive handle.
        var liveItemHandles = bindings.Where(b => b.ItemHandle > 0).Select(b => b.ItemHandle).ToArray();
        if (liveItemHandles.Length == 0 || _subscriber is null) return;

        try
        {
            await _subscriber.UnsubscribeBulkAsync(liveItemHandles, cancellationToken).ConfigureAwait(false);
        }
        catch (Exception ex)
        {
            // Best-effort: the driver-side registry entry is already gone, so the
            // failure is logged rather than surfaced to the caller.
            _logger.LogWarning(ex,
                "Galaxy UnsubscribeBulk failed for subscription {SubscriptionId} — registry already cleared on driver side.",
                gsh.SubscriptionId);
        }
    }

    ///
    /// Lazily start the event pump on the first subscribe. The pump is
    /// shared across every subscription on this driver — fan-out happens through the
    /// subscription registry's reverse map, not by spinning a pump per
    /// subscription.
///
    /// The pump is cached in <c>_eventPump</c> only after <c>Start()</c> succeeds, so a
    /// failed start is retried by the next caller instead of handing out a dead pump.
    private EventPump EnsureEventPumpStarted()
    {
        lock (_pumpLock)
        {
            // DisposeAsync sets _disposed before it takes this lock and clears the field.
            // Re-checking here stops a subscribe that raced with disposal from creating
            // a pump nobody will ever dispose.
            ObjectDisposedException.ThrowIf(_disposed, this);

            if (_eventPump is not null) return _eventPump;

            var pump = new EventPump(
                _subscriber!,
                _subscriptions,
                _logger,
                channelCapacity: _options.MxAccess.EventPumpChannelCapacity,
                clientName: _options.MxAccess.ClientName,
                onStreamFault: OnEventPumpStreamFault);
            pump.OnDataChange += OnPumpDataChange;
            try
            {
                pump.Start();
            }
            catch
            {
                // Do not cache a pump that never started; drop our handler and let the
                // caller see the original failure.
                pump.OnDataChange -= OnPumpDataChange;
                throw;
            }

            _eventPump = pump;
            return pump;
        }
    }

    /// <summary>
    /// Stream-fault callback for the <see cref="EventPump"/>. The gw StreamEvents
    /// stream faulted (transient gateway drop, network blip, gw restart). Forward
    /// the cause to the reconnect supervisor (<c>_supervisor</c>) so it drives reopen →
    /// replay; without this hand-off a transient transport drop permanently kills
    /// the event stream and GetHealth() keeps reporting Healthy.
    /// </summary>
    /// <param name="cause">The exception that faulted the event stream.</param>
    private void OnEventPumpStreamFault(Exception cause)
    {
        // Snapshot the field once so the null check and the call see the same instance.
        var supervisor = _supervisor;
        if (supervisor is null)
        {
            // No production runtime (skeleton / injected-seam path) — nothing to drive.
            _logger.LogWarning(cause,
                "GalaxyDriver {InstanceId} event stream faulted but no reconnect supervisor is wired.",
                _driverInstanceId);
            return;
        }

        try
        {
            supervisor.ReportTransportFailure(cause);
        }
        catch (ObjectDisposedException)
        {
            // Driver is being disposed — the stream fault is just shutdown noise.
        }
    }

    // ===== IAlarmSource =====

    /// <summary>
    /// Start the gateway alarm feed (idempotent) and wire its transitions onto this
    /// driver's <c>OnAlarmEvent</c> bridge. The feed is session-less — it does
    /// not depend on a data subscription or the <see cref="EventPump"/>.
    /// </summary>
    /// <exception cref="InvalidOperationException">No alarm feed has been wired.</exception>
    private void EnsureAlarmFeedStarted()
    {
        lock (_alarmFeedLock)
        {
            if (_alarmFeed is null)
            {
                throw new InvalidOperationException(
                    "GalaxyDriver alarm feed is not wired. InitializeAsync must run (or a feed " +
                    "seam must be injected via the internal ctor) before subscribing to alarms.");
            }
            if (_alarmFeedWired) return;

            _alarmFeed.OnAlarmTransition += OnAlarmFeedTransition;
            try
            {
                _alarmFeed.Start();
            }
            catch
            {
                // _alarmFeedWired stays false on failure, so a retry would attach the
                // handler a second time and duplicate every alarm event. Undo the
                // attach before propagating.
                _alarmFeed.OnAlarmTransition -= OnAlarmFeedTransition;
                throw;
            }
            _alarmFeedWired = true;
        }
    }

    /// <inheritdoc />
    public Task<IAlarmSubscriptionHandle> SubscribeAlarmsAsync(
        IReadOnlyList<string> sourceNodeIds, CancellationToken cancellationToken)
    {
        ObjectDisposedException.ThrowIf(_disposed, this);
        ArgumentNullException.ThrowIfNull(sourceNodeIds);

        // The driver doesn't multiplex alarm subscriptions per source-node-id today —
        // every active handle receives every transition off the gateway's session-less
        // StreamAlarms feed, and the server filters by source node before raising Part 9
        // conditions. The subscription handle is a sentinel the server uses for
        // symmetric Unsubscribe. Same shape AbCip uses.
        EnsureAlarmFeedStarted();
        var handle = new GalaxyAlarmSubscriptionHandle(Guid.NewGuid().ToString("N"));
        lock (_alarmHandlersLock)
        {
            _alarmSubscriptions.Add(handle);
        }
        return Task.FromResult<IAlarmSubscriptionHandle>(handle);
    }

    /// <inheritdoc />
    public Task UnsubscribeAlarmsAsync(IAlarmSubscriptionHandle handle, CancellationToken cancellationToken)
    {
        ObjectDisposedException.ThrowIf(_disposed, this);
        ArgumentNullException.ThrowIfNull(handle);
        if (handle is not GalaxyAlarmSubscriptionHandle gash)
        {
            throw new ArgumentException(
                $"Subscription handle was not issued by this driver (expected GalaxyAlarmSubscriptionHandle, got {handle.GetType().Name}).",
                nameof(handle));
        }
        lock (_alarmHandlersLock)
        {
            _alarmSubscriptions.Remove(gash);
        }
        return Task.CompletedTask;
    }

    /// <inheritdoc />
    public async Task AcknowledgeAsync(
        IReadOnlyList<AlarmAcknowledgeRequest> acknowledgements, CancellationToken cancellationToken)
    {
        ObjectDisposedException.ThrowIf(_disposed, this);
        ArgumentNullException.ThrowIfNull(acknowledgements);
        if (acknowledgements.Count == 0) return;

        if (_alarmAcknowledger is null)
        {
            throw new NotSupportedException(
                "GalaxyDriver.AcknowledgeAsync requires GatewayGalaxyAlarmAcknowledger wired against a connected " +
                "GalaxyMxSession (PR B.2). InitializeAsync must run before alarm acknowledgements can flow.");
        }

        // Acks are issued one-by-one — the gateway RPC accepts a single alarm
        // reference per call. AlarmConditionState's per-condition Acknowledge in the
        // server-side ACL layer is the natural rate-limit, so issuing in series here
        // keeps the operator-comment ordering deterministic without bursting the
        // worker's STA queue.
        foreach (var ack in acknowledgements)
        {
            // ConditionId carries the alarm full reference for the Galaxy driver —
            // SourceNodeId is the OPC UA browse path, which the gateway can't address.
            // The server-side condition state pairs them through AlarmConditionService.
            var alarmFullReference = !string.IsNullOrEmpty(ack.ConditionId)
                ? ack.ConditionId
                : ack.SourceNodeId;
            await _alarmAcknowledger.AcknowledgeAsync(
                alarmFullReference,
                ack.Comment ?? string.Empty,
                operatorUser: string.Empty, // server-side ACL fills this from the OPC UA session
                cancellationToken).ConfigureAwait(false);
        }
    }

    /// <summary>
    /// Receives <see cref="GalaxyAlarmTransition"/> events from the gateway alarm
    /// feed and reshapes them into <see cref="AlarmEventArgs"/> for OPC UA-side
    /// consumers. Fires <c>OnAlarmEvent</c> only when at least one alarm
    /// subscription is active so a server that hasn't called
    /// <c>SubscribeAlarmsAsync</c> yet doesn't surface untracked transitions.
    /// </summary>
    /// <param name="sender">Event source; not used.</param>
    /// <param name="transition">The alarm transition reported by the feed.</param>
    private void OnAlarmFeedTransition(object? sender, GalaxyAlarmTransition transition)
    {
        GalaxyAlarmSubscriptionHandle? handle;
        lock (_alarmHandlersLock)
        {
            // Pick the earliest-registered handle as the event owner. The server routes
            // by SourceNodeId (not by handle), so every active subscriber sees the same
            // transition regardless of which handle is attached here. Using the first
            // insertion-order entry is deterministic and stable as long as at least one
            // subscription remains — HashSet.First() was unstable across mutations
            // (Driver.Galaxy-006 fix). _alarmSubscriptions is a List, so [0] is always
            // the earliest-registered handle.
            handle = _alarmSubscriptions.Count > 0 ? _alarmSubscriptions[0] : null;
        }
        if (handle is null) return;

        // Empty operator comment / category strings are surfaced as null.
        var args = new AlarmEventArgs(
            SubscriptionHandle: handle,
            SourceNodeId: transition.SourceObjectReference,
            ConditionId: transition.AlarmFullReference,
            AlarmType: transition.AlarmTypeName,
            Message: transition.Description,
            Severity: transition.SeverityBucket,
            SourceTimestampUtc: transition.TransitionTimestampUtc,
            OperatorComment: string.IsNullOrEmpty(transition.OperatorComment) ? null : transition.OperatorComment,
            OriginalRaiseTimestampUtc: transition.OriginalRaiseTimestampUtc,
            AlarmCategory: string.IsNullOrEmpty(transition.Category) ? null : transition.Category);

        try
        {
            OnAlarmEvent?.Invoke(this, args);
        }
        catch (Exception ex)
        {
            // A throwing consumer must not propagate back into the feed's event raise.
            _logger.LogWarning(ex,
                "GalaxyDriver OnAlarmEvent handler threw for {AlarmRef} — continuing.",
                transition.AlarmFullReference);
        }
    }

    /// <summary>
    /// Forwards every fan-out event to the public <c>OnDataChange</c> for
    /// ISubscribable consumers, AND routes ScanState changes to the per-platform
    /// probe watcher (PR 4.7) so platform health entries update without the watcher
    /// consuming the event stream itself.
    /// </summary>
    /// <param name="sender">Event source; not used.</param>
    /// <param name="args">The data-change event raised by the pump.</param>
    private void OnPumpDataChange(object? sender, DataChangeEventArgs args)
    {
        OnDataChange?.Invoke(this, args);

        if (_probeWatcher is not null
            && args.FullReference.EndsWith(PerPlatformProbeWatcher.ProbeSuffix, StringComparison.OrdinalIgnoreCase))
        {
            // The probe decoder takes a raw quality byte. Recover it via the canonical
            // StatusCodeMap.ToQualityCategoryByte helper so the mapping lives in one
            // place next to its inverse (FromQualityByte) and cannot desync silently.
            var qualityByte = StatusCodeMap.ToQualityCategoryByte(args.Snapshot.StatusCode);
            _probeWatcher.OnProbeValueChanged(args.FullReference, args.Snapshot.Value, qualityByte);
        }
    }

    ///
    /// Lazily builds the default hierarchy source from
    /// _options.Gateway. The owned repository client is disposed in
    /// DisposeAsync. Tests bypass this by injecting their own source via the
    /// internal ctor.
///
    private IGalaxyHierarchySource BuildDefaultHierarchySource()
    {
        // Driver.Galaxy-009 fix: reuse a client that StartDeployWatcher may have already
        // created (??=) rather than always overwriting the field and leaking the first
        // instance. Both paths produce equivalent clients from the same options.
        _ownedRepositoryClient ??= GalaxyRepositoryClient.Create(BuildClientOptions(_options.Gateway));
        return new TracedGalaxyHierarchySource(
            new GatewayGalaxyHierarchySource(_ownedRepositoryClient),
            _options.MxAccess.ClientName);
    }

    /// <summary>
    /// Asynchronous disposal. Prefer <c>await using</c> over <c>using</c> — the
    /// async path does not block the caller while awaiting EventPump / session /
    /// client shutdown (Driver.Galaxy-007: the sync path blocked on
    /// <c>GetAwaiter().GetResult()</c> for every async sub-component, risking a
    /// deadlock under thread-pool starvation).
    /// </summary>
    /// <remarks>
    /// Every sub-component is disposed inside its own try/catch, so one failing
    /// shutdown is logged and does not prevent the others from being released.
    /// Calls after the first return immediately.
    /// </remarks>
    public async ValueTask DisposeAsync()
    {
        if (_disposed) return;
        _disposed = true;

        // Synchronous sub-components first — none of these block.
        try { _deployWatcher?.Dispose(); }
        catch (Exception ex) { _logger.LogWarning(ex, "DeployWatcher dispose failed"); }

        try { _supervisor?.Dispose(); }
        catch (Exception ex) { _logger.LogWarning(ex, "ReconnectSupervisor dispose failed"); }

        try { _probeWatcher?.Dispose(); }
        catch (Exception ex) { _logger.LogWarning(ex, "ProbeWatcher dispose failed"); }

        try { _transportForwarder?.Dispose(); }
        catch (Exception ex) { _logger.LogWarning(ex, "Transport forwarder dispose failed"); }

        // Async sub-components: await each so we don't block a thread-pool thread
        // on a slow shutdown (e.g. EventPump draining its channel, gRPC stream closing).
        EventPump? pump;
        lock (_pumpLock)
        {
            pump = _eventPump;
            _eventPump = null;
        }
        if (pump is not null)
        {
            try
            {
                // Detach before disposing so events raised while the pump shuts down
                // are not forwarded by a driver that is already marked disposed.
                pump.OnDataChange -= OnPumpDataChange;
                await pump.DisposeAsync().ConfigureAwait(false);
            }
            catch (Exception ex) { _logger.LogWarning(ex, "EventPump dispose failed"); }
        }

        IGalaxyAlarmFeed? alarmFeed;
        lock (_alarmFeedLock)
        {
            alarmFeed = _alarmFeed;
            _alarmFeed = null;
            // The feed is being dropped, so the "handler attached" flag no longer holds.
            _alarmFeedWired = false;
        }
        if (alarmFeed is not null)
        {
            try
            {
                // Removing a handler that was never attached is a no-op, so this is
                // safe whether or not EnsureAlarmFeedStarted ever ran.
                alarmFeed.OnAlarmTransition -= OnAlarmFeedTransition;
                await alarmFeed.DisposeAsync().ConfigureAwait(false);
            }
            catch (Exception ex) { _logger.LogWarning(ex, "Alarm feed dispose failed"); }
        }

        if (_ownedMxSession is not null)
        {
            try { await _ownedMxSession.DisposeAsync().ConfigureAwait(false); }
            catch (Exception ex) { _logger.LogWarning(ex, "MxSession dispose failed"); }
            _ownedMxSession = null;
        }

        if (_ownedMxClient is not null)
        {
            try { await _ownedMxClient.DisposeAsync().ConfigureAwait(false); }
            catch (Exception ex) { _logger.LogWarning(ex, "MxClient dispose failed"); }
            _ownedMxClient = null;
        }

        if (_ownedRepositoryClient is not null)
        {
            try { await _ownedRepositoryClient.DisposeAsync().ConfigureAwait(false); }
            catch (Exception ex) { _logger.LogWarning(ex, "RepositoryClient dispose failed"); }
            _ownedRepositoryClient = null;
        }

        // The hierarchy source may wrap the repository client released above, so drop it too.
        _hierarchySource = null;
    }

    /// <summary>
    /// Synchronous disposal. Prefer <see cref="DisposeAsync"/> in async contexts —
    /// this path must block on every async sub-component shutdown. Provided for
    /// compatibility with <c>using</c> statements that cannot await.
    /// </summary>
    public void Dispose() => DisposeAsync().AsTask().GetAwaiter().GetResult();

    ///
    /// Address-space builder wrapper that records each variable's
    /// security classification into the supplied dictionary
    /// before delegating to the inner builder. Used by the driver's discovery path
    /// to capture per-tag classifications for the IWritable routing decision —
    /// PR 4.3 needs the data, but the discoverer itself doesn't (and shouldn't)
    /// know about the driver's internal state.
///
    private sealed class SecurityCapturingBuilder(
        IAddressSpaceBuilder inner,
        System.Collections.Concurrent.ConcurrentDictionary<string, SecurityClassification> map)
        : IAddressSpaceBuilder
    {
        /// <summary>
        /// Creates the folder on the inner builder and wraps the result, so variables
        /// added beneath it are recorded into the same map.
        /// </summary>
        public IAddressSpaceBuilder Folder(string browseName, string displayName)
        {
            var childFolder = inner.Folder(browseName, displayName);
            return new SecurityCapturingBuilder(childFolder, map);
        }

        /// <summary>
        /// Stores the attribute's <c>SecurityClass</c> under its <c>FullName</c>
        /// (overwriting any earlier entry for that key), then delegates to the inner builder.
        /// </summary>
        public IVariableHandle Variable(string browseName, string displayName, DriverAttributeInfo attributeInfo)
        {
            map[attributeInfo.FullName] = attributeInfo.SecurityClass;

            var variableHandle = inner.Variable(browseName, displayName, attributeInfo);
            return variableHandle;
        }

        /// <summary>Pass-through to the inner builder; nothing is recorded for properties.</summary>
        public void AddProperty(string browseName, DriverDataType dataType, object? value)
        {
            inner.AddProperty(browseName, dataType, value);
        }
    }
}