Files
lmxopcua/src/Drivers/ZB.MOM.WW.OtOpcUa.Driver.Galaxy/GalaxyDriver.cs
Joseph Doherty 8fe7c8bea6 refactor(driver-galaxy): switch to sibling-repo MxGateway client + drop vendored libs
The sibling mxaccessgw repo (clients/dotnet/) restored a proper client
library + contracts under the new ZB.MOM.WW.MxGateway namespace, so the
binary-vendoring stopgap from PR Driver.Galaxy-016 can unwind via plan #1
of libs/README.md.

- csproj: replace <Reference HintPath="libs\MxGateway.*.dll"> with a
  ProjectReference into ..\..\..\..\mxaccessgw\clients\dotnet\ZB.MOM.WW.MxGateway.Client\. The five backfill PackageReference shims
  (Google.Protobuf, Grpc.Core.Api, Grpc.Net.Client, Polly.Core,
  Microsoft.Extensions.Logging.Abstractions) are now transitive again.
- Source: 'using MxGateway.X' -> 'using ZB.MOM.WW.MxGateway.X' across
  19 driver files + 14 test files. No fully-qualified MxGateway.* usages
  in code, so no behavioural changes — purely a using-prefix flip.
- libs/: deleted MxGateway.Client.dll, MxGateway.Contracts.dll, README.md
  (orphan after the unwind).

Verified: dotnet build clean (Release), all 245 Driver.Galaxy unit tests
pass, OtOpcUa service running with the new client DLL loaded
(opc.tcp://localhost:4840/OtOpcUa, no FileNotFound/TypeLoad/
MissingMethod in startup logs).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-24 14:55:15 -04:00

1277 lines
61 KiB
C#
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Logging.Abstractions;
using ZB.MOM.WW.MxGateway.Client;
using ZB.MOM.WW.MxGateway.Contracts.Proto;
using ZB.MOM.WW.OtOpcUa.Core.Abstractions;
using ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Browse;
using ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Config;
using ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Health;
using ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Runtime;
namespace ZB.MOM.WW.OtOpcUa.Driver.Galaxy;
/// <summary>
/// In-process .NET 10 Galaxy driver — the only Galaxy backend since PR 7.2 retired
/// the legacy <c>Galaxy.Host</c> / <c>Galaxy.Proxy</c> / <c>Galaxy.Shared</c>
/// projects and the <c>OtOpcUaGalaxyHost</c> Windows service. Implements the full
/// capability surface: <see cref="ITagDiscovery"/>, <see cref="IReadable"/>,
/// <see cref="IWritable"/>, <see cref="ISubscribable"/>, <see cref="IRediscoverable"/>,
/// <see cref="IHostConnectivityProbe"/>, and <see cref="IAlarmSource"/>. Galaxy
/// access flows through the in-process driver over gRPC to the separately
/// installed <c>mxaccessgw</c> gateway (sibling repo), which owns the MXAccess
/// COM apartment server-side.
/// </summary>
/// <remarks>
/// <para>
/// Registered as a Tier A in-process driver alongside Modbus / S7 / etc. via
/// <see cref="GalaxyDriverFactoryExtensions"/> under driver-type name
/// "GalaxyMxGateway".
/// </para>
/// <para>
/// Tests inject capability seams (<see cref="IGalaxyHierarchySource"/>,
/// <see cref="IGalaxyDataReader"/>, <see cref="IGalaxyDataWriter"/>,
/// <see cref="IGalaxySubscriber"/>, <see cref="IGalaxyAlarmAcknowledger"/>,
/// <see cref="IGalaxyAlarmFeed"/>) via the internal ctor so capability flow
/// can be exercised without a real gw round-trip.
/// </para>
/// </remarks>
public sealed class GalaxyDriver
: IDriver, ITagDiscovery, IReadable, IWritable, ISubscribable, IRediscoverable, IHostConnectivityProbe, IAlarmSource, IDisposable, IAsyncDisposable
{
private readonly string _driverInstanceId;
private readonly GalaxyDriverOptions _options;
private readonly ILogger<GalaxyDriver> _logger;
// PR 4.1 — IGalaxyHierarchySource is the test seam for browse. When null, the driver
// lazily builds a GatewayGalaxyHierarchySource around a GalaxyRepositoryClient on
// first DiscoverAsync. Tests inject a fake source via the internal ctor to exercise
// GalaxyDiscoverer's translation logic without a real gRPC channel.
private IGalaxyHierarchySource? _hierarchySource;
private GalaxyRepositoryClient? _ownedRepositoryClient;
// PR 4.2 — IGalaxyDataReader is the test seam for IReadable. PR 4.4 supplies the
// production implementation that wraps GalaxyMxSession's SubscribeBulk + StreamEvents
// pump; until then ReadAsync throws NotSupportedException when the reader is null
// (legacy-host backend handles reads in production via DriverNodeManager's
// capability-routing).
private IGalaxyDataReader? _dataReader;
// PR 4.3 — IGalaxyDataWriter is the test seam for IWritable. Production wraps
// GalaxyMxSession via GatewayGalaxyDataWriter (Write / WriteSecured routing). The
// per-tag SecurityClassification map is populated during ITagDiscovery and consumed
// here at write time.
private IGalaxyDataWriter? _dataWriter;
private readonly System.Collections.Concurrent.ConcurrentDictionary<string, SecurityClassification>
_securityByFullRef = new(StringComparer.OrdinalIgnoreCase);
// PR 4.4 — subscription lifecycle. The pump consumes the gw event stream and fans
// out OnDataChange events to every registered driver subscription via the registry's
// reverse map. The subscriber is the test seam — production uses
// GatewayGalaxySubscriber over a connected GalaxyMxSession.
private IGalaxySubscriber? _subscriber;
private readonly SubscriptionRegistry _subscriptions = new();
private EventPump? _eventPump;
private readonly Lock _pumpLock = new();
// IAlarmSource implementation. Production-side acks route through
// GatewayGalaxyAlarmAcknowledger which calls the session-less
// MxGatewayClient.AcknowledgeAlarmAsync RPC; alarm transitions arrive on the
// gateway's session-less StreamAlarms feed via GatewayGalaxyAlarmFeed. Tests inject
// IGalaxyAlarmAcknowledger + IGalaxyAlarmFeed via the internal ctor to exercise the
// wiring without a running gateway. This driver bridges the feed's OnAlarmTransition
// onto IAlarmSource.OnAlarmEvent.
private IGalaxyAlarmAcknowledger? _alarmAcknowledger;
private IGalaxyAlarmFeed? _alarmFeed;
private readonly Lock _alarmHandlersLock = new();
private readonly Lock _alarmFeedLock = new();
private bool _alarmFeedWired;
// List preserves insertion order so OnAlarmFeedTransition always picks the
// earliest-registered handle — a deterministic choice that doesn't vary as
// handles are added/removed (Driver.Galaxy-006 fix: HashSet.First() is unstable).
private readonly List<GalaxyAlarmSubscriptionHandle> _alarmSubscriptions = new();
// PR 4.W — production runtime owned by InitializeAsync. The driver builds these
// when it opens a real gw session; tests bypass them by injecting seams via the
// internal ctor.
private GalaxyMxSession? _ownedMxSession;
private MxGatewayClient? _ownedMxClient;
// PR 4.5 — reconnect supervisor. Reflects in DriverState.Degraded while not Healthy.
private ReconnectSupervisor? _supervisor;
// PR 4.6 — IRediscoverable plumbing.
private DeployWatcher? _deployWatcher;
// PR 4.7 — IHostConnectivityProbe plumbing. The aggregator owns the merged
// transport+per-platform view; the forwarder is fed from the supervisor on
// transport state transitions; the probe watcher subscribes ScanState attributes
// for every discovered platform and pushes value changes to the aggregator.
private readonly HostStatusAggregator _hostStatuses = new();
private HostConnectivityForwarder? _transportForwarder;
private PerPlatformProbeWatcher? _probeWatcher;
// Most recent health snapshot. Replaced wholesale by InitializeAsync, ReinitializeAsync,
// ShutdownAsync and OnSupervisorStateChanged; GetHealth returns it unless the reconnect
// supervisor currently reports a degraded state.
private DriverHealth _health = new(DriverState.Unknown, null, null);
// Guard flag read by ObjectDisposedException.ThrowIf in the public entry points, by
// ShutdownAsync and by RestartEventPumpForReplay. Presumably set by Dispose / DisposeAsync,
// which are outside this view — confirm there.
private bool _disposed;
/// <summary>
/// Server-pushed data-change notification. Fires from the
/// <see cref="EventPump"/>'s background loop; handlers should be cheap (or queue
/// onto another thread) to avoid blocking the gw event stream.
/// </summary>
public event EventHandler<DataChangeEventArgs>? OnDataChange;
/// <summary>Fires when the gateway signals a deploy-time change (PR 4.6 DeployWatcher).</summary>
public event EventHandler<RediscoveryEventArgs>? OnRediscoveryNeeded;
/// <summary>Fires when a host transitions Running ↔ Stopped (PR 4.7 HostStatusAggregator).</summary>
public event EventHandler<HostStatusChangedEventArgs>? OnHostStatusChanged;
/// <inheritdoc />
public event EventHandler<AlarmEventArgs>? OnAlarmEvent;
/// <summary>
/// Production ctor. Delegates to the seam-injecting ctor with every capability seam left
/// <c>null</c>, so <see cref="InitializeAsync"/> takes the production-runtime branch.
/// </summary>
public GalaxyDriver(
    string driverInstanceId,
    GalaxyDriverOptions options,
    ILogger<GalaxyDriver>? logger = null)
    : this(
        driverInstanceId: driverInstanceId,
        options: options,
        hierarchySource: null,
        dataReader: null,
        dataWriter: null,
        subscriber: null,
        alarmAcknowledger: null,
        alarmFeed: null,
        logger: logger)
{
}
/// <summary>
/// Test-visible ctor — inject custom seams so <see cref="DiscoverAsync"/>,
/// <see cref="ReadAsync"/>, <see cref="WriteAsync"/>, and
/// <see cref="SubscribeAsync"/> can be exercised against canned data without
/// building real gRPC channels.
/// </summary>
internal GalaxyDriver(
    string driverInstanceId,
    GalaxyDriverOptions options,
    IGalaxyHierarchySource? hierarchySource,
    IGalaxyDataReader? dataReader = null,
    IGalaxyDataWriter? dataWriter = null,
    IGalaxySubscriber? subscriber = null,
    IGalaxyAlarmAcknowledger? alarmAcknowledger = null,
    IGalaxyAlarmFeed? alarmFeed = null,
    ILogger<GalaxyDriver>? logger = null)
{
    // Validate in parameter order: the instance id is checked before the options.
    if (string.IsNullOrWhiteSpace(driverInstanceId))
    {
        throw new ArgumentException("Driver instance id required.", nameof(driverInstanceId));
    }

    if (options is null)
    {
        throw new ArgumentNullException(nameof(options));
    }

    _driverInstanceId = driverInstanceId;
    _options = options;
    _logger = logger ?? NullLogger<GalaxyDriver>.Instance;

    // Capability seams. Any of these may be null; the null ones are filled in later by
    // the production runtime build or lazily on first use.
    _hierarchySource = hierarchySource;
    _dataReader = dataReader;
    _dataWriter = dataWriter;
    _subscriber = subscriber;
    _alarmAcknowledger = alarmAcknowledger;
    _alarmFeed = alarmFeed;

    // Re-raise the aggregator's transitions as this driver's IHostConnectivityProbe event,
    // with the driver (not the aggregator) as the sender.
    _hostStatuses.OnHostStatusChanged += (_, change) => OnHostStatusChanged?.Invoke(this, change);
}
/// <inheritdoc />
public string DriverInstanceId => _driverInstanceId;
/// <inheritdoc />
public string DriverType => GalaxyDriverFactoryExtensions.DriverTypeName;
/// <summary>Test-visible options snapshot.</summary>
internal GalaxyDriverOptions Options => _options;
/// <summary>
/// Test-visible entry into <see cref="ReplayAsync"/>. The supervisor's
/// <see cref="ReconnectSupervisor.ReportTransportFailure"/> drives this on a
/// background task in production; tests prefer to invoke it directly so the
/// <see cref="GalaxyReconnectOptions.ReplayOnSessionLost"/> branch can be
/// asserted deterministically (Driver.Galaxy-013).
/// </summary>
internal Task InvokeReplayForTestAsync(CancellationToken cancellationToken)
{
    // Straight pass-through: hands back the replay task itself so the caller observes
    // its completion or fault directly.
    return ReplayAsync(cancellationToken);
}
/// <inheritdoc />
public async Task InitializeAsync(string driverConfigJson, CancellationToken cancellationToken)
{
    ObjectDisposedException.ThrowIf(_disposed, this);

    // driverConfigJson is not read here — the options were supplied through the ctor.
    // A pre-injected subscriber, writer or hierarchy source means a test wired the driver
    // through the internal ctor; in that case the seams are left untouched and no gateway
    // client / session is built.
    var seamsInjected = _subscriber is not null
        || _dataWriter is not null
        || _hierarchySource is not null;

    if (seamsInjected)
    {
        _logger.LogDebug(
            "GalaxyDriver {InstanceId} initializing with pre-injected seams — production runtime build skipped",
            _driverInstanceId);
    }
    else
    {
        await BuildProductionRuntimeAsync(cancellationToken).ConfigureAwait(false);
    }

    StartDeployWatcher();

    var gatewayEndpoint = _options.Gateway.Endpoint;
    var clientName = _options.MxAccess.ClientName;
    _logger.LogInformation(
        "GalaxyDriver {InstanceId} initialized — endpoint={Endpoint} clientName={ClientName}",
        _driverInstanceId, gatewayEndpoint, clientName);

    _health = new DriverHealth(DriverState.Healthy, DateTime.UtcNow, null);
}
/// <summary>
/// Build the production gw client + session + per-capability runtime components
/// from <c>_options</c>. Sets up the reconnect supervisor's reopen / replay
/// callbacks so a transport drop replays every active subscription on the
/// restored session.
/// </summary>
private async Task BuildProductionRuntimeAsync(CancellationToken cancellationToken)
{
// Resolve endpoint / API key / timeouts once; the same options object feeds both the
// session-less client and the worker session's ConnectAsync.
var clientOptions = BuildClientOptions(_options.Gateway);
_ownedMxClient = MxGatewayClient.Create(clientOptions);
_ownedMxSession = new GalaxyMxSession(_options.MxAccess, _logger);
// NOTE(review): if ConnectAsync throws, _ownedMxClient / _ownedMxSession stay assigned while
// _subscriber is still null, so a retried InitializeAsync would build a second client +
// session over them — confirm the caller or the dispose path (outside this view) covers that.
await _ownedMxSession.ConnectAsync(clientOptions, cancellationToken).ConfigureAwait(false);
// PR 6.1 — wrap the gw-facing seams in tracing decorators so every Subscribe /
// Unsubscribe / Write / StreamEvents call emits a span on the
// "ZB.MOM.WW.OtOpcUa.Driver.Galaxy" ActivitySource. The host process's tracing
// listener (OTLP exporter, dotnet-trace, etc.) consumes these without the driver
// taking a dependency on the OpenTelemetry packages.
_subscriber = new TracedGalaxySubscriber(
new GatewayGalaxySubscriber(_ownedMxSession), _options.MxAccess.ClientName);
_dataWriter = new TracedGalaxyDataWriter(
new GatewayGalaxyDataWriter(_ownedMxSession, _options.MxAccess.WriteUserId, _logger),
_options.MxAccess.ClientName);
// Reconnect supervisor: ReopenAsync reconnects the session and ReplayAsync restores the
// subscriptions; both backoff bounds come from the Reconnect options, in milliseconds.
_supervisor = new ReconnectSupervisor(
reopen: ReopenAsync,
replay: ReplayAsync,
options: new ReconnectOptions(
InitialBackoffOverride: TimeSpan.FromMilliseconds(_options.Reconnect.InitialBackoffMs),
MaxBackoffOverride: TimeSpan.FromMilliseconds(_options.Reconnect.MaxBackoffMs)),
logger: _logger);
// Transport-level host status, keyed by the MxAccess client name. OnSupervisorStateChanged
// pushes later transitions into this forwarder.
_transportForwarder = new HostConnectivityForwarder(_options.MxAccess.ClientName, _hostStatuses, _logger);
_transportForwarder.SetTransport(HostState.Running); // initial state — we just connected
_supervisor.StateChanged += OnSupervisorStateChanged;
// The probe watcher shares the traced subscriber and the configured publishing interval;
// DiscoverAsync feeds it the platform list after each discovery.
_probeWatcher = new PerPlatformProbeWatcher(
_subscriber, _hostStatuses, _logger,
bufferedUpdateIntervalMs: _options.MxAccess.PublishingIntervalMs);
// Wire the alarm acknowledger + feed to the live gateway client. Both are
// session-less — the gateway serves alarms from an always-on central monitor —
// so they hang off the owned MxGatewayClient, not the worker session.
// The ??= keeps an acknowledger / feed that was already injected through the internal ctor.
_alarmAcknowledger ??= new GatewayGalaxyAlarmAcknowledger(_ownedMxClient, _logger);
_alarmFeed ??= new GatewayGalaxyAlarmFeed(
_ownedMxClient.StreamAlarmsAsync, _logger, _options.MxAccess.ClientName);
}
/// <summary>
/// Reopen callback for <see cref="ReconnectSupervisor"/>: re-Register the gw session.
/// If the session never connected, this is a fresh ConnectAsync; otherwise it's a
/// reconnect against the existing client.
/// </summary>
private async Task ReopenAsync(CancellationToken cancellationToken)
{
    // Nothing to reopen when the production runtime was never built (test seams only).
    if (_ownedMxSession is not null)
    {
        // Options are rebuilt on every reopen, so the API-key secret ref is re-resolved
        // each time rather than cached from the first connect.
        var freshOptions = BuildClientOptions(_options.Gateway);
        await _ownedMxSession.ConnectAsync(freshOptions, cancellationToken).ConfigureAwait(false);
    }
}
/// <summary>
/// Replay callback. Walks every active subscription, re-issues SubscribeBulk for
/// its tag list, and <see cref="SubscriptionRegistry.Rebind">rebinds</see> the
/// registry with the fresh item handles the gateway returned — the pre-reconnect
/// handles are dead once the session reopened. The faulted <see cref="EventPump"/>
/// is recreated first so the replayed subscriptions have a live StreamEvents
/// consumer; without that restart the replayed tags are subscribed on the gw but
/// never reach <c>OnDataChange</c> (Driver.Galaxy-008). PR 6.x can swap this for
/// the gw's batched <c>ReplaySubscriptionsCommand</c> once it ships.
/// </summary>
private async Task ReplayAsync(CancellationToken cancellationToken)
{
    if (_subscriber is null) return;

    // The stream fault that triggered this recovery left the old pump's RunAsync loop
    // exited and its channel completed; EventPump.Start() is a no-op on a non-null but
    // completed loop. Recreate the pump BEFORE looking at the registry: a pump can exist
    // with zero tracked subscriptions (e.g. after a one-shot read, or once every
    // subscription has been removed), and returning early in that state would leave the
    // dead pump in place for the next Subscribe / Read to reuse — those calls would then
    // never see an event. RestartEventPumpForReplay is a no-op when no pump was ever
    // started, so running it unconditionally is safe.
    RestartEventPumpForReplay();

    var entries = _subscriptions.SnapshotEntries();
    if (entries.Count == 0) return;

    // Driver.Galaxy-013: honor ReplayOnSessionLost. When operators opt out (false)
    // we skip the per-tag SubscribeBulk fan-out — they're delegating to the
    // gateway's session-level ReplaySubscriptions or accept post-reconnect tag
    // loss. The pump restart above still ran, so a future Subscribe call lands on
    // a live consumer.
    if (!_options.Reconnect.ReplayOnSessionLost)
    {
        _logger.LogInformation(
            "GalaxyDriver {InstanceId} reconnect replay skipped — ReplayOnSessionLost=false ({SubCount} subscriptions tracked)",
            _driverInstanceId, entries.Count);
        return;
    }

    var tagCount = 0;
    foreach (var (subscriptionId, oldBindings) in entries)
    {
        // One SubscribeBulk per driver subscription, over its distinct references.
        var refs = oldBindings
            .Select(b => b.FullReference)
            .Distinct(StringComparer.OrdinalIgnoreCase)
            .ToArray();
        if (refs.Length == 0) continue;

        var results = await _subscriber
            .SubscribeBulkAsync(refs, _options.MxAccess.PublishingIntervalMs, cancellationToken)
            .ConfigureAwait(false);
        var byAddress = BuildResultIndex(results);

        var newBindings = new List<TagBinding>(refs.Length);
        foreach (var fullRef in refs)
        {
            // A reference the gateway rejected or omitted keeps its binding with item
            // handle 0, so the subscription still lists the tag.
            var itemHandle = byAddress.TryGetValue(fullRef, out var match) && match.WasSuccessful
                ? match.ItemHandle
                : 0;
            newBindings.Add(new TagBinding(fullRef, itemHandle));
        }

        // Rebind so the EventPump fan-out reverse map points at the post-reconnect
        // handles; otherwise events on the new handles miss every subscription.
        _subscriptions.Rebind(subscriptionId, newBindings);
        tagCount += refs.Length;
    }

    _logger.LogInformation(
        "GalaxyDriver {InstanceId} replay completed — {SubCount} subscriptions, {TagCount} tags re-subscribed",
        _driverInstanceId, entries.Count, tagCount);
}
/// <summary>
/// Index a SubscribeBulk result list by tag address (OrdinalIgnoreCase) so the
/// subscribe / replay correlation loops are O(1) per reference rather than a
/// linear scan. Last-write-wins on a duplicate address (the gw shouldn't emit one).
/// </summary>
private static Dictionary<string, SubscribeResult> BuildResultIndex(IReadOnlyList<SubscribeResult> results)
{
    // Pre-sized to the result count; keys compare case-insensitively.
    var byAddress = new Dictionary<string, SubscribeResult>(results.Count, StringComparer.OrdinalIgnoreCase);

    for (var i = 0; i < results.Count; i++)
    {
        var entry = results[i];

        // Results without an address cannot be correlated back to a reference — skip them.
        if (string.IsNullOrEmpty(entry.TagAddress)) continue;

        // Indexer assignment (not Add): a repeated address overwrites the earlier entry.
        byAddress[entry.TagAddress] = entry;
    }

    return byAddress;
}
/// <summary>
/// Dispose the faulted <see cref="EventPump"/> and create a fresh one bound to the
/// same subscriber / registry, started immediately. Invoked from the reconnect
/// replay path. No-op when no pump was ever started (no active subscriptions).
/// </summary>
private void RestartEventPumpForReplay()
{
EventPump? old;
// Take ownership of the current pump under the lock and clear the field, so the later
// EnsureEventPumpStarted call sees no pump and builds a new one.
lock (_pumpLock)
{
old = _eventPump;
if (old is null) return; // pump never started — nothing to restart
_eventPump = null;
}
// Detach + dispose the faulted pump outside the lock so a slow shutdown doesn't
// block a concurrent EnsureEventPumpStarted; the old loop already exited on fault.
old.OnDataChange -= OnPumpDataChange;
// Blocks the calling thread until the async dispose finishes.
// NOTE(review): this is reached from ReplayAsync's async path — consider an awaitable
// variant if the pump's dispose can take noticeable time.
try { old.DisposeAsync().AsTask().GetAwaiter().GetResult(); }
catch (Exception ex)
{
// Best-effort: a failed dispose of the dead pump must not abort the replay.
_logger.LogWarning(ex,
"GalaxyDriver {InstanceId} faulted EventPump dispose failed during replay — continuing.",
_driverInstanceId);
}
// EnsureEventPumpStarted creates + starts a fresh pump under the lock. Skip the
// recreate if the driver is being disposed — Dispose already tore the pump down
// and a fresh one here would leak past the driver's lifetime.
// (_disposed is read here without taking a lock.)
if (_disposed) return;
EnsureEventPumpStarted();
}
/// <summary>
/// Mirrors a reconnect-supervisor transition into the cached driver health and, when the
/// transport forwarder exists, into the transport host status.
/// </summary>
private void OnSupervisorStateChanged(object? sender, StateTransition transition)
{
    // Healthy stamps a fresh timestamp; every other state is reported as Degraded, carrying
    // over the last successful read and recording the transition's cause.
    if (transition.Next == ReconnectSupervisor.State.Healthy)
    {
        _health = new DriverHealth(DriverState.Healthy, DateTime.UtcNow, null);
    }
    else
    {
        _health = new DriverHealth(DriverState.Degraded, _health.LastSuccessfulRead, transition.Cause);
    }

    // Transport view: Running only while the supervisor is Healthy, Stopped otherwise.
    _transportForwarder?.SetTransport(
        transition.Next == ReconnectSupervisor.State.Healthy ? HostState.Running : HostState.Stopped);
}
/// <summary>
/// Resolves <c>Gateway.ApiKeySecretRef</c> to the actual API-key bytes. Four
/// forms supported, evaluated in order:
/// <list type="number">
/// <item><c>env:NAME</c> — reads <c>Environment.GetEnvironmentVariable(NAME)</c>.
/// Throws when the variable is unset, so a misconfigured deployment fails
/// fast at InitializeAsync rather than silently sending an empty key.</item>
/// <item><c>file:PATH</c> — reads UTF-8 text from <c>PATH</c>, trimming
/// whitespace. Lets operators stash the key in an ACL'd file outside the
/// repo (the same pattern as the legacy <c>.local/galaxy-host-secret.txt</c>).</item>
/// <item><c>dev:KEY</c> — explicit cleartext literal. The <c>dev:</c> prefix
/// is a deliberate opt-in signal (dev box, parity rig) so the resolver
/// doesn't emit a warning; production should never use this arm.</item>
/// <item>Anything else — used as the literal API key for back-compat with
/// configs that pre-date this resolver. When a logger is supplied the
/// resolver emits a startup warning so an operator who accidentally
/// committed a cleartext key sees it (Driver.Galaxy-010).</item>
/// </list>
/// A future PR can swap any of these arms for a DPAPI-backed lookup without
/// changing the call site.
/// </summary>
internal static string ResolveApiKey(string secretRef)
{
    // Logger-less convenience form: same resolution rules, but the literal-key fallback
    // has no logger to warn through.
    return ResolveApiKey(secretRef, logger: null);
}
/// <summary>
/// Logger-aware overload. Emits a <see cref="LogLevel.Warning"/> if the secret
/// ref falls through to the back-compat literal arm (an unprefixed cleartext
/// API key in <c>DriverConfig</c> JSON). The <c>dev:</c> prefix is the explicit
/// opt-in path that doesn't warn.
/// </summary>
internal static string ResolveApiKey(string secretRef, ILogger? logger)
{
    ArgumentException.ThrowIfNullOrEmpty(secretRef);

    const string EnvPrefix = "env:";
    const string FilePrefix = "file:";
    const string DevPrefix = "dev:";

    // env:NAME — the variable must be set and non-empty.
    if (HasPrefix(EnvPrefix))
    {
        var variableName = secretRef[EnvPrefix.Length..];
        var variableValue = Environment.GetEnvironmentVariable(variableName);
        if (string.IsNullOrEmpty(variableValue))
        {
            throw new InvalidOperationException(
                $"Galaxy.Gateway.ApiKeySecretRef='{secretRef}' resolves to env var '{variableName}', but it is unset.");
        }

        return variableValue;
    }

    // file:PATH — the file must exist and hold something other than whitespace.
    if (HasPrefix(FilePrefix))
    {
        var keyPath = secretRef[FilePrefix.Length..];
        if (!File.Exists(keyPath))
        {
            throw new InvalidOperationException(
                $"Galaxy.Gateway.ApiKeySecretRef='{secretRef}' points at '{keyPath}', which doesn't exist.");
        }

        var trimmedKey = File.ReadAllText(keyPath).Trim();
        if (string.IsNullOrEmpty(trimmedKey))
        {
            throw new InvalidOperationException(
                $"Galaxy.Gateway.ApiKeySecretRef='{secretRef}' file '{keyPath}' is empty.");
        }

        return trimmedKey;
    }

    // dev:KEY — deliberate cleartext opt-in; returned as-is after the prefix, no warning.
    if (HasPrefix(DevPrefix))
    {
        return secretRef[DevPrefix.Length..];
    }

    // No recognised prefix: the whole string is the key (back-compat). Warn, when a logger
    // was supplied, so an accidentally committed cleartext key is visible at startup.
    logger?.LogWarning(
        "Galaxy.Gateway.ApiKeySecretRef is being treated as a literal cleartext API key. " +
        "Prefer env:NAME, file:PATH, or the explicit dev:KEY prefix for dev rigs — " +
        "a literal key in DriverConfig JSON is stored in cleartext in the central config DB.");
    return secretRef;

    // Prefix matching is case-insensitive (ENV:, File:, … are accepted).
    bool HasPrefix(string prefix) => secretRef.StartsWith(prefix, StringComparison.OrdinalIgnoreCase);
}
/// <summary>
/// Translates the driver's gateway options into the gateway client's option type,
/// resolving the API-key secret reference along the way.
/// </summary>
private MxGatewayClientOptions BuildClientOptions(GalaxyGatewayOptions gw)
{
    // The endpoint must be an absolute URI; it is parsed before the key is resolved.
    var endpoint = new Uri(gw.Endpoint, UriKind.Absolute);

    // Driver.Galaxy-010: pass the logger so the literal-arm cleartext fallback
    // surfaces a startup warning rather than silently shipping the key.
    var apiKey = ResolveApiKey(gw.ApiKeySecretRef, _logger);

    return new MxGatewayClientOptions
    {
        Endpoint = endpoint,
        ApiKey = apiKey,
        UseTls = gw.UseTls,
        CaCertificatePath = gw.CaCertificatePath,
        ConnectTimeout = TimeSpan.FromSeconds(gw.ConnectTimeoutSeconds),
        DefaultCallTimeout = TimeSpan.FromSeconds(gw.DefaultCallTimeoutSeconds),
        // A non-positive stream timeout is passed on as null rather than as a zero or
        // negative TimeSpan.
        StreamTimeout = gw.StreamTimeoutSeconds > 0 ? TimeSpan.FromSeconds(gw.StreamTimeoutSeconds) : null,
    };
}
/// <summary>
/// Starts the deploy watcher when <c>Repository.WatchDeployEvents</c> is enabled, and
/// relays its rediscovery signal through <see cref="OnRediscoveryNeeded"/>. Called from
/// <see cref="InitializeAsync"/>.
/// </summary>
private void StartDeployWatcher()
{
if (!_options.Repository.WatchDeployEvents) return;
// NOTE(review): on the production path neither field appears to be assigned before
// InitializeAsync calls this (BuildProductionRuntimeAsync sets neither; the hierarchy source
// is built lazily in DiscoverAsync), so this guard would return and the watcher would not
// start. That also sits oddly with the "build the client here" comment below — confirm the
// intended condition.
if (_ownedRepositoryClient is null && _hierarchySource is null) return;
// Reuse the lazily-built repository client (DiscoverAsync constructs it on demand).
// If discovery hasn't run yet, build the client here so the watcher has a target.
// Driver.Galaxy-009 fix: guard with ??= so if BuildDefaultHierarchySource later runs
// it reuses this client rather than overwriting the field and leaking the first instance.
_ownedRepositoryClient ??= MxGateway.Client.GalaxyRepositoryClient.Create(
BuildClientOptions(_options.Gateway));
var source = new GatewayGalaxyDeployWatchSource(_ownedRepositoryClient);
_deployWatcher = new DeployWatcher(source, _logger);
// Re-raise with the driver as sender so subscribers see the IRediscoverable source.
_deployWatcher.OnRediscoveryNeeded += (_, args) => OnRediscoveryNeeded?.Invoke(this, args);
// StartAsync schedules the background loop and returns Task.CompletedTask immediately.
// It throws InvalidOperationException synchronously if called twice (programming error).
// Driver.Galaxy-009 fix: don't discard the return value — observe any synchronous throw.
// CancellationToken.None: the start call is not tied to InitializeAsync's token.
var startTask = _deployWatcher.StartAsync(CancellationToken.None);
// The task is already completed (StartAsync is synchronous); surface any synchronous fault.
if (startTask.IsFaulted) startTask.GetAwaiter().GetResult();
}
/// <inheritdoc />
/// <remarks>
/// <para>
/// In-place config reapply. The driver does not currently support
/// hot-swapping <see cref="GalaxyDriverOptions"/> at runtime — changing the
/// gateway endpoint, MxAccess client name, or reconnect policy requires
/// tearing down the gw session, supervisor, event pump, and address space.
/// The host stack handles that via DriverInstance restart, so this method
/// only accepts an equivalent config (no meaningful change) and refreshes
/// health; a non-equivalent reapply throws <see cref="NotSupportedException"/>
/// so the caller knows the change wasn't applied (Driver.Galaxy-013:
/// previously the method silently ignored <c>driverConfigJson</c>).
/// </para>
/// </remarks>
public Task ReinitializeAsync(string driverConfigJson, CancellationToken cancellationToken)
{
    ObjectDisposedException.ThrowIf(_disposed, this);

    // A blank payload carries nothing to compare. Anything else must parse to options
    // equal to the live ones — a real change is refused rather than half-applied.
    var hasPayload = !string.IsNullOrWhiteSpace(driverConfigJson);
    if (hasPayload && !OptionsAreEquivalent(_options, ParseIncomingOptions()))
    {
        throw new NotSupportedException(
            "GalaxyDriver.ReinitializeAsync does not support hot-swapping driver options at runtime " +
            "(gateway endpoint, MxAccess client name, reconnect policy, etc.). Restart the DriverInstance " +
            "through the host stack to apply a config change.");
    }

    _health = new DriverHealth(DriverState.Healthy, DateTime.UtcNow, null);
    return Task.CompletedTask;

    // Runs the payload through the factory's parse pipeline via a throwaway driver
    // instance, so parse errors carry the factory's own diagnostic text.
    GalaxyDriverOptions ParseIncomingOptions()
    {
        try
        {
            var throwaway = GalaxyDriverFactoryExtensions.CreateInstance(_driverInstanceId, driverConfigJson);
            var parsed = throwaway.Options;
            // The throwaway instance was never initialized; dispose it right away.
            throwaway.Dispose();
            return parsed;
        }
        catch (Exception ex) when (ex is not NotSupportedException and not ObjectDisposedException)
        {
            // Normalise parse failures to NotSupportedException, keeping the cause attached.
            throw new NotSupportedException(
                $"GalaxyDriver.ReinitializeAsync could not parse the incoming DriverConfig JSON for '{_driverInstanceId}': {ex.Message}",
                ex);
        }
    }
}
/// <summary>
/// Compare two <see cref="GalaxyDriverOptions"/> for runtime equivalence — every
/// field that drives gw session shape, address space, or reconnect behaviour
/// must match. Records get value-equality from the language, so a direct
/// equality check is enough.
/// </summary>
private static bool OptionsAreEquivalent(GalaxyDriverOptions a, GalaxyDriverOptions b)
{
    // Delegates entirely to the options type's == operator.
    return a == b;
}
/// <inheritdoc />
public Task ShutdownAsync(CancellationToken cancellationToken)
{
    // Already disposed: nothing to log or update. Otherwise only the health snapshot is
    // touched here — state drops to Unknown while the last successful read is kept.
    if (!_disposed)
    {
        _logger.LogInformation("GalaxyDriver {InstanceId} shutting down", _driverInstanceId);
        _health = new DriverHealth(DriverState.Unknown, _health.LastSuccessfulRead, null);
    }

    return Task.CompletedTask;
}
/// <inheritdoc />
public DriverHealth GetHealth()
{
    // While the reconnect supervisor is degraded, its view overrides the cached snapshot:
    // report Degraded with the supervisor's last error, keeping the cached last-read time.
    return _supervisor?.IsDegraded == true
        ? new DriverHealth(DriverState.Degraded, _health.LastSuccessfulRead, _supervisor.LastError)
        : _health;
}
// ===== IHostConnectivityProbe (PR 4.7 wire-up) =====
/// <inheritdoc />
public IReadOnlyList<HostConnectivityStatus> GetHostStatuses()
{
    // The aggregator owns the host-status view; hand back its current snapshot.
    return _hostStatuses.Snapshot();
}
/// <inheritdoc />
/// <remarks>
/// Estimated footprint: 64 bytes × tracked item handles (one gw subscription entry
/// per bound tag) + 256 bytes × tracked driver subscriptions (registry overhead per
/// OPC UA monitored item). Returns 0 when no subscriptions are active. These
/// constants are conservative — a 50k-tag set occupies ~3 MB and registers clearly
/// with the server's cache-flush heuristic. Driver.Galaxy-011: the stale
/// "PR 4.4 sets this" comment is removed; PR 4.4 shipped the SubscriptionRegistry
/// but never wired it here.
/// </remarks>
public long GetMemoryFootprint()
{
    const long BytesPerItemHandle = 64L;    // TagBinding + reverse-map entry
    const long BytesPerSubscription = 256L; // SubscriptionEntry overhead

    // Both terms are estimates derived from the registry's live counts.
    long itemHandleBytes = _subscriptions.TrackedItemHandleCount * BytesPerItemHandle;
    long subscriptionBytes = _subscriptions.TrackedSubscriptionCount * BytesPerSubscription;
    return itemHandleBytes + subscriptionBytes;
}
/// <inheritdoc />
public Task FlushOptionalCachesAsync(CancellationToken cancellationToken)
{
    // No work is done here; the call completes immediately.
    return Task.CompletedTask;
}
// ===== ITagDiscovery (PR 4.1) =====
/// <inheritdoc />
public async Task DiscoverAsync(IAddressSpaceBuilder builder, CancellationToken cancellationToken)
{
    ObjectDisposedException.ThrowIf(_disposed, this);
    ArgumentNullException.ThrowIfNull(builder);

    // PR 4.3 — wrap the caller's builder so each attribute's SecurityClassification is
    // recorded into _securityByFullRef as discovery streams through.
    var securityCapture = new SecurityCapturingBuilder(builder, _securityByFullRef);

    // Build the default hierarchy source on first use; later calls reuse it.
    _hierarchySource ??= BuildDefaultHierarchySource();
    var hierarchySource = _hierarchySource;

    await new GalaxyDiscoverer(hierarchySource)
        .DiscoverAsync(securityCapture, cancellationToken)
        .ConfigureAwait(false);

    // PR 4.W — without a probe watcher (no production runtime) there is nothing to sync.
    if (_probeWatcher is null) return;

    // Refresh the watcher's membership so newly-added $WinPlatform / $AppEngine objects
    // start advising their ScanState attribute.
    var objects = await hierarchySource.GetHierarchyAsync(cancellationToken).ConfigureAwait(false);
    var platformTagNames = objects
        .Where(obj => obj.TemplateChain.Any(template => IsPlatformTemplate(template)))
        .Select(obj => obj.TagName)
        .Where(tagName => !string.IsNullOrEmpty(tagName));

    await _probeWatcher.SyncPlatformsAsync(platformTagNames, cancellationToken).ConfigureAwait(false);

    // Case-insensitive match against the two platform-level template names.
    static bool IsPlatformTemplate(string? template) =>
        string.Equals(template, "$WinPlatform", StringComparison.OrdinalIgnoreCase)
        || string.Equals(template, "$AppEngine", StringComparison.OrdinalIgnoreCase);
}
/// <summary>
/// Looks up the security classification recorded for <paramref name="fullReference"/>
/// during discovery, falling back to <see cref="SecurityClassification.FreeAccess"/> when
/// the reference is not in the map.
/// </summary>
private SecurityClassification ResolveSecurity(string fullReference)
{
    if (_securityByFullRef.TryGetValue(fullReference, out var classification))
    {
        return classification;
    }

    return SecurityClassification.FreeAccess;
}
// ===== IReadable =====
/// <inheritdoc />
public Task<IReadOnlyList<DataValueSnapshot>> ReadAsync(
    IReadOnlyList<string> fullReferences, CancellationToken cancellationToken)
{
    // Not an async method: the guard failures below are thrown synchronously to the caller
    // rather than surfacing through the returned task.
    ObjectDisposedException.ThrowIf(_disposed, this);
    ArgumentNullException.ThrowIfNull(fullReferences);

    // Empty request → empty, already-completed result.
    if (fullReferences.Count == 0)
    {
        IReadOnlyList<DataValueSnapshot> nothing = [];
        return Task.FromResult(nothing);
    }

    // Test-only path — tests inject a canned reader via the internal ctor, and it takes
    // precedence over the subscribe-once path.
    if (_dataReader is not null)
    {
        return _dataReader.ReadAsync(fullReferences, cancellationToken);
    }

    // The production path synthesises the read from a subscription, so it needs a subscriber.
    return _subscriber is not null
        ? ReadViaSubscribeOnceAsync(fullReferences, cancellationToken)
        : throw new NotSupportedException(
            "GalaxyDriver.ReadAsync requires a connected GalaxyMxSession (production runtime not built). " +
            "Either inject a test seam via the internal ctor or call InitializeAsync against a real gateway.");
}
/// <summary>
/// Production read path. MxAccess has no one-shot Read RPC — every value comes
/// through the event stream. We synthesise a Read by:
/// <list type="number">
/// <item>Subscribing the requested tags through the existing
/// <see cref="SubscriptionRegistry"/> + <see cref="EventPump"/>.</item>
/// <item>Waiting for the first <c>OnDataChange</c> per item handle (the gateway
/// pushes the current value as the initial event after a SubscribeBulk).</item>
/// <item>Unsubscribing.</item>
/// </list>
/// Tags the gw rejects at SubscribeBulk time, or that never publish before the
/// caller's cancellation token fires, return a Bad-status snapshot in input order
/// so the caller still sees one snapshot per requested reference.
/// <para>
/// Cleanup (handler detach, registry removal, gateway unsubscribe of every binding
/// built so far) runs in a single <c>finally</c> that covers everything after the
/// handler is attached, so a failure at any step cannot leave the per-read handler
/// attached to the shared pump.
/// </para>
/// </summary>
/// <param name="fullReferences">Non-empty list of tag references to read; duplicates are allowed.</param>
/// <param name="cancellationToken">
/// Cancels the SubscribeBulk call (which then throws) and, once subscribed, turns every
/// still-pending value into a BadTimeout snapshot instead of throwing.
/// </param>
/// <returns>One snapshot per entry of <paramref name="fullReferences"/>, in input order.</returns>
private async Task<IReadOnlyList<DataValueSnapshot>> ReadViaSubscribeOnceAsync(
    IReadOnlyList<string> fullReferences, CancellationToken cancellationToken)
{
    var pump = EnsureEventPumpStarted();
    var subscriptionId = _subscriptions.NextSubscriptionId();

    // Pre-allocate one TaskCompletionSource per distinct full-reference so the
    // OnDataChange handler can complete them out-of-order as events arrive.
    var pendingByRef = new Dictionary<string, TaskCompletionSource<DataValueSnapshot>>(
        StringComparer.OrdinalIgnoreCase);
    foreach (var fullRef in fullReferences.Distinct(StringComparer.OrdinalIgnoreCase))
    {
        pendingByRef[fullRef] = new TaskCompletionSource<DataValueSnapshot>(
            TaskCreationOptions.RunContinuationsAsynchronously);
    }

    EventHandler<DataChangeEventArgs> handler = (_, args) =>
    {
        // Filter to OUR subscription — the pump's OnDataChange fans out across all
        // subscriptions on the driver, and we don't want a parallel ISubscribable
        // caller's events to leak into our read.
        if (args.SubscriptionHandle is GalaxySubscriptionHandle gsh
            && gsh.SubscriptionId == subscriptionId
            && pendingByRef.TryGetValue(args.FullReference, out var tcs))
        {
            tcs.TrySetResult(args.Snapshot);
        }
    };

    // Declared outside the try so the finally can unsubscribe whatever was bound
    // before a failure. It stays empty when SubscribeBulk itself throws.
    var bindings = new List<TagBinding>(fullReferences.Count);

    // Attached BEFORE the SubscribeBulk call so the handler is in place for the
    // first event the gw pushes.
    // NOTE(review): the registry bindings are only registered after SubscribeBulk
    // returns; whether the pump can dispatch an event that arrives in between is
    // not visible from here — confirm against EventPump/SubscriptionRegistry.
    pump.OnDataChange += handler;
    try
    {
        var results = await _subscriber!
            .SubscribeBulkAsync(fullReferences, _options.MxAccess.PublishingIntervalMs, cancellationToken)
            .ConfigureAwait(false);

        // Driver.Galaxy-012: index the SubscribeBulk results once and correlate to
        // references in O(1) instead of FirstOrDefault per element (O(n²) over the
        // batch).
        var resultIndex = BuildResultIndex(results);
        for (var i = 0; i < fullReferences.Count; i++)
        {
            var fullRef = fullReferences[i];
            var itemHandle = resultIndex.TryGetValue(fullRef, out var match) && match is { WasSuccessful: true }
                ? match.ItemHandle
                : 0;
            bindings.Add(new TagBinding(fullRef, itemHandle));

            // Tags the gw rejected up front — complete with Bad status now so the
            // wait below doesn't stall on them.
            if (itemHandle <= 0
                && pendingByRef.TryGetValue(fullRef, out var rejectedTcs))
            {
                rejectedTcs.TrySetResult(new DataValueSnapshot(
                    Value: null,
                    StatusCode: 0x80000000u, // Bad
                    SourceTimestampUtc: null,
                    ServerTimestampUtc: DateTime.UtcNow));
            }
        }

        // Register bindings so the pump knows to dispatch events for these handles.
        _subscriptions.Register(subscriptionId, bindings);

        // Wait for every pending TCS to complete or the caller's CT to fire. When the
        // CT fires before all values arrive, fill the still-pending entries with a
        // BadTimeout snapshot rather than throwing — Read semantics let callers see
        // partial results. TrySetResult leaves already-completed entries untouched.
        // NOTE(review): nothing else bounds this wait, so a token that can never be
        // cancelled waits indefinitely for a tag that never publishes — confirm
        // callers always pass a bounded token.
        using var registration = cancellationToken.Register(() =>
        {
            foreach (var tcs in pendingByRef.Values)
            {
                tcs.TrySetResult(new DataValueSnapshot(
                    Value: null,
                    StatusCode: 0x800B0000u, // BadTimeout
                    SourceTimestampUtc: null,
                    ServerTimestampUtc: DateTime.UtcNow));
            }
        });

        var snapshots = new DataValueSnapshot[fullReferences.Count];
        for (var i = 0; i < fullReferences.Count; i++)
        {
            snapshots[i] = await pendingByRef[fullReferences[i]].Task.ConfigureAwait(false);
        }

        return snapshots;
    }
    finally
    {
        pump.OnDataChange -= handler;

        // Drop the bindings + unsubscribe the live handles. Remove is also reached
        // when Register never ran; UnsubscribeAsync relies on the same call returning
        // null for ids that were never registered. UnsubscribeBulkAsync's failure
        // isn't fatal — the registry is already cleared, so any straggling event
        // from the gw would be a no-op fan-out.
        _subscriptions.Remove(subscriptionId);
        var liveHandles = bindings.Where(b => b.ItemHandle > 0).Select(b => b.ItemHandle).ToArray();
        if (liveHandles.Length > 0)
        {
            try
            {
                // CancellationToken.None: the cleanup must still run when the caller's
                // token has already fired.
                await _subscriber!.UnsubscribeBulkAsync(liveHandles, CancellationToken.None)
                    .ConfigureAwait(false);
            }
            catch (Exception ex)
            {
                _logger.LogWarning(ex,
                    "GalaxyDriver.ReadViaSubscribeOnceAsync UnsubscribeBulk failed for {Count} handle(s) — registry already cleared.",
                    liveHandles.Length);
            }
        }
    }
}
// ===== IWritable (PR 4.3) =====
/// <inheritdoc />
public Task<IReadOnlyList<WriteResult>> WriteAsync(
    IReadOnlyList<WriteRequest> writes, CancellationToken cancellationToken)
{
    ObjectDisposedException.ThrowIf(_disposed, this);
    ArgumentNullException.ThrowIfNull(writes);

    // Nothing to write — answer with an empty result list.
    if (writes.Count == 0)
    {
        return Task.FromResult<IReadOnlyList<WriteResult>>([]);
    }

    if (_dataWriter is not null)
    {
        // ResolveSecurity is handed to the writer as the per-reference
        // security-classification lookup.
        return _dataWriter.WriteAsync(writes, ResolveSecurity, cancellationToken);
    }

    // No writer is wired: fail loudly, the same way the read path does when it has
    // no backend, and point the operator at the legacy-host backend.
    throw new NotSupportedException(
        "GalaxyDriver.WriteAsync requires GatewayGalaxyDataWriter wired against a connected " +
        "GalaxyMxSession (PR 4.4). Until that lands, route writes through the legacy-host " +
        "backend (Galaxy:Backend=legacy-host).");
}
// ===== ISubscribable (PR 4.4) =====
/// <inheritdoc />
public async Task<ISubscriptionHandle> SubscribeAsync(
    IReadOnlyList<string> fullReferences, TimeSpan publishingInterval, CancellationToken cancellationToken)
{
    ObjectDisposedException.ThrowIf(_disposed, this);
    ArgumentNullException.ThrowIfNull(fullReferences);
    if (_subscriber is null)
    {
        throw new NotSupportedException(
            "GalaxyDriver.SubscribeAsync requires a connected GalaxyMxSession + GatewayGalaxySubscriber. " +
            "PR 4.W wires the production session; until then route subscriptions through the legacy-host backend.");
    }

    // Make sure the shared pump is running before anything is subscribed. The pump
    // is held by the driver's field, so the returned reference is not needed here.
    EnsureEventPumpStarted();
    var subscriptionId = _subscriptions.NextSubscriptionId();
    if (fullReferences.Count == 0)
    {
        // Empty subscriptions register but never bind anything — keeps Unsubscribe
        // symmetric for callers that conditionally add tags later.
        _subscriptions.Register(subscriptionId, []);
        return new GalaxySubscriptionHandle(subscriptionId);
    }

    // PR 6.3 — when the caller doesn't set a publishing interval (TimeSpan.Zero or
    // negative), fall back to the configured MxAccess.PublishingIntervalMs.
    // The value is clamped into [0, int.MaxValue] BEFORE the cast: converting an
    // out-of-range double to int yields an unspecified, runtime-dependent result,
    // so a very large interval (e.g. TimeSpan.MaxValue) could otherwise turn into
    // an arbitrary number. Sub-millisecond positive intervals still truncate to 0
    // and take the fallback.
    var requested = (int)Math.Clamp(publishingInterval.TotalMilliseconds, 0d, int.MaxValue);
    var bufferedIntervalMs = requested > 0 ? requested : _options.MxAccess.PublishingIntervalMs;

    var results = await _subscriber
        .SubscribeBulkAsync(fullReferences, bufferedIntervalMs, cancellationToken)
        .ConfigureAwait(false);

    // Build the binding list in input order. Failed entries (gw rejected the tag, or
    // no result came back for it) are recorded with ItemHandle 0 and logged.
    // Driver.Galaxy-012: index results once, correlate in O(1) per reference rather
    // than FirstOrDefault inside the loop (O(n²) on the 50k-tag path).
    var resultIndex = BuildResultIndex(results);
    var bindings = new List<TagBinding>(fullReferences.Count);
    for (var i = 0; i < fullReferences.Count; i++)
    {
        var fullRef = fullReferences[i];
        var hasMatch = resultIndex.TryGetValue(fullRef, out var match);
        var itemHandle = hasMatch && match is { WasSuccessful: true } ? match.ItemHandle : 0;
        bindings.Add(new TagBinding(fullRef, itemHandle));
        if (match is null || !match.WasSuccessful)
        {
            _logger.LogWarning(
                "Galaxy subscribe for {FullRef} failed: {Error}",
                fullRef, match?.ErrorMessage ?? "<no result returned>");
        }
    }

    _subscriptions.Register(subscriptionId, bindings);
    return new GalaxySubscriptionHandle(subscriptionId);
}
/// <inheritdoc />
public async Task UnsubscribeAsync(ISubscriptionHandle handle, CancellationToken cancellationToken)
{
    ObjectDisposedException.ThrowIf(_disposed, this);
    ArgumentNullException.ThrowIfNull(handle);

    // Only handles minted by this driver carry a subscription id we can resolve.
    if (handle is not GalaxySubscriptionHandle ownHandle)
    {
        throw new ArgumentException(
            $"Subscription handle was not issued by this driver (expected GalaxySubscriptionHandle, got {handle.GetType().Name}).",
            nameof(handle));
    }

    // Clear the driver-side registry first; a null result means the id was already
    // removed or never registered, so there is nothing left to release.
    var removed = _subscriptions.Remove(ownHandle.SubscriptionId);
    if (removed is null)
    {
        return;
    }

    // Only positive item handles are live on the gateway; rejected tags carry 0.
    var gatewayHandles = removed
        .Select(binding => binding.ItemHandle)
        .Where(itemHandle => itemHandle > 0)
        .ToArray();
    if (_subscriber is null || gatewayHandles.Length == 0)
    {
        return;
    }

    try
    {
        await _subscriber.UnsubscribeBulkAsync(gatewayHandles, cancellationToken).ConfigureAwait(false);
    }
    catch (Exception ex)
    {
        // Best effort: the registry entry is already gone, so the failure is logged
        // and not surfaced to the caller.
        _logger.LogWarning(ex,
            "Galaxy UnsubscribeBulk failed for subscription {SubscriptionId} — registry already cleared on driver side.",
            ownHandle.SubscriptionId);
    }
}
/// <summary>
/// Returns the driver's single <see cref="EventPump"/>, creating and starting it on
/// first use. Creation happens under <c>_pumpLock</c>, so concurrent callers all
/// receive the same instance. Fan-out to individual subscriptions goes through the
/// <see cref="SubscriptionRegistry"/> handed to the pump rather than through one
/// pump per subscription.
/// </summary>
/// <returns>The shared pump instance.</returns>
private EventPump EnsureEventPumpStarted()
{
    lock (_pumpLock)
    {
        if (_eventPump is null)
        {
            // The field is assigned before the handler is attached and the pump is
            // started, so the order of these three statements matters.
            _eventPump = new EventPump(
                _subscriber!, _subscriptions, _logger,
                channelCapacity: _options.MxAccess.EventPumpChannelCapacity,
                clientName: _options.MxAccess.ClientName,
                onStreamFault: OnEventPumpStreamFault);
            _eventPump.OnDataChange += OnPumpDataChange;
            _eventPump.Start();
        }

        return _eventPump;
    }
}
/// <summary>
/// Callback the <see cref="EventPump"/> invokes when its event stream faults.
/// The cause is forwarded to the <see cref="ReconnectSupervisor"/> via
/// <c>ReportTransportFailure</c>; when no supervisor is wired the fault is only
/// logged as a warning.
/// </summary>
/// <param name="cause">The exception that faulted the stream.</param>
private void OnEventPumpStreamFault(Exception cause)
{
    if (_supervisor is { } supervisor)
    {
        try
        {
            supervisor.ReportTransportFailure(cause);
        }
        catch (ObjectDisposedException)
        {
            // Driver is being disposed — the stream fault is just shutdown noise.
        }

        return;
    }

    // No production runtime (skeleton / injected-seam path) — nothing to drive.
    _logger.LogWarning(cause,
        "GalaxyDriver {InstanceId} event stream faulted but no reconnect supervisor is wired.",
        _driverInstanceId);
}
// ===== IAlarmSource =====
/// <summary>
/// Start the gateway alarm feed (idempotent) and wire its transitions onto
/// <see cref="OnAlarmFeedTransition"/>. Runs under <c>_alarmFeedLock</c>; once the
/// feed has been wired successfully, later calls return without doing anything.
/// </summary>
/// <exception cref="InvalidOperationException">No alarm feed has been wired.</exception>
private void EnsureAlarmFeedStarted()
{
    lock (_alarmFeedLock)
    {
        var feed = _alarmFeed
            ?? throw new InvalidOperationException(
                "GalaxyDriver alarm feed is not wired. InitializeAsync must run (or a feed " +
                "seam must be injected via the internal ctor) before subscribing to alarms.");

        if (_alarmFeedWired)
        {
            return;
        }

        feed.OnAlarmTransition += OnAlarmFeedTransition;
        try
        {
            feed.Start();
        }
        catch
        {
            // Start failed, so _alarmFeedWired stays false and a later call will try
            // again. Detach the handler first — otherwise that retry would attach it
            // a second time and every transition would be delivered twice.
            feed.OnAlarmTransition -= OnAlarmFeedTransition;
            throw;
        }

        _alarmFeedWired = true;
    }
}
/// <inheritdoc />
public Task<IAlarmSubscriptionHandle> SubscribeAlarmsAsync(
    IReadOnlyList<string> sourceNodeIds, CancellationToken cancellationToken)
{
    ObjectDisposedException.ThrowIf(_disposed, this);
    ArgumentNullException.ThrowIfNull(sourceNodeIds);

    // sourceNodeIds is validated but not used for filtering: the driver doesn't
    // multiplex alarm subscriptions per source-node-id today. Every active handle
    // receives every transition off the gateway feed, and the server filters by
    // source node before raising Part 9 conditions. The handle is a sentinel the
    // server uses for symmetric Unsubscribe. Same shape AbCip uses.
    EnsureAlarmFeedStarted();

    var subscriptionKey = Guid.NewGuid().ToString("N");
    var subscription = new GalaxyAlarmSubscriptionHandle(subscriptionKey);
    lock (_alarmHandlersLock)
    {
        _alarmSubscriptions.Add(subscription);
    }

    return Task.FromResult<IAlarmSubscriptionHandle>(subscription);
}
/// <inheritdoc />
public Task UnsubscribeAlarmsAsync(IAlarmSubscriptionHandle handle, CancellationToken cancellationToken)
{
    ObjectDisposedException.ThrowIf(_disposed, this);
    ArgumentNullException.ThrowIfNull(handle);

    // Reject handles of any other type — only this driver's handle type is tracked.
    if (handle is not GalaxyAlarmSubscriptionHandle ownHandle)
    {
        throw new ArgumentException(
            $"Subscription handle was not issued by this driver (expected GalaxyAlarmSubscriptionHandle, got {handle.GetType().Name}).",
            nameof(handle));
    }

    // Removing a handle that is not in the list is a no-op; the alarm feed itself
    // is left running.
    lock (_alarmHandlersLock)
    {
        _alarmSubscriptions.Remove(ownHandle);
    }

    return Task.CompletedTask;
}
/// <inheritdoc />
public async Task AcknowledgeAsync(
    IReadOnlyList<AlarmAcknowledgeRequest> acknowledgements, CancellationToken cancellationToken)
{
    ObjectDisposedException.ThrowIf(_disposed, this);
    ArgumentNullException.ThrowIfNull(acknowledgements);

    if (acknowledgements.Count == 0)
    {
        return;
    }

    if (_alarmAcknowledger is null)
    {
        throw new NotSupportedException(
            "GalaxyDriver.AcknowledgeAsync requires GatewayGalaxyAlarmAcknowledger wired against a connected " +
            "GalaxyMxSession (PR B.2). InitializeAsync must run before alarm acknowledgements can flow.");
    }

    // Acks go out strictly one after another: each call is awaited before the next
    // is issued, so they reach the acknowledger in input order. An exception from
    // any call propagates and stops the remaining acknowledgements.
    foreach (var request in acknowledgements)
    {
        // ConditionId carries the alarm full reference for the Galaxy driver;
        // SourceNodeId is only the fallback when ConditionId is null or empty.
        var alarmFullReference = string.IsNullOrEmpty(request.ConditionId)
            ? request.SourceNodeId
            : request.ConditionId;
        var comment = request.Comment ?? string.Empty;

        await _alarmAcknowledger.AcknowledgeAsync(
            alarmFullReference,
            comment,
            operatorUser: string.Empty, // server-side ACL fills this from the OPC UA session
            cancellationToken).ConfigureAwait(false);
    }
}
/// <summary>
/// Handler for the alarm feed's <c>OnAlarmTransition</c> event. Reshapes a
/// <see cref="GalaxyAlarmTransition"/> into <see cref="AlarmEventArgs"/> and raises
/// <see cref="OnAlarmEvent"/>. Transitions are dropped while no alarm subscription
/// is registered, and exceptions thrown by <see cref="OnAlarmEvent"/> handlers are
/// logged rather than propagated back into the feed.
/// </summary>
/// <param name="sender">Event sender; not used.</param>
/// <param name="transition">The transition reported by the alarm feed.</param>
private void OnAlarmFeedTransition(object? sender, GalaxyAlarmTransition transition)
{
    GalaxyAlarmSubscriptionHandle owner;
    lock (_alarmHandlersLock)
    {
        // No active subscription — nobody asked for alarms yet, so drop the transition.
        if (_alarmSubscriptions.Count == 0)
        {
            return;
        }

        // The earliest-registered handle is attached as the event owner. The server
        // routes by SourceNodeId (not by handle), so which handle rides along does
        // not change who sees the transition; index 0 of the list is simply a
        // deterministic choice (Driver.Galaxy-006 fix).
        owner = _alarmSubscriptions[0];
    }

    // Empty strings from the feed are normalised to null for the optional fields.
    var operatorComment = string.IsNullOrEmpty(transition.OperatorComment)
        ? null
        : transition.OperatorComment;
    var alarmCategory = string.IsNullOrEmpty(transition.Category)
        ? null
        : transition.Category;

    var args = new AlarmEventArgs(
        SubscriptionHandle: owner,
        SourceNodeId: transition.SourceObjectReference,
        ConditionId: transition.AlarmFullReference,
        AlarmType: transition.AlarmTypeName,
        Message: transition.Description,
        Severity: transition.SeverityBucket,
        SourceTimestampUtc: transition.TransitionTimestampUtc,
        OperatorComment: operatorComment,
        OriginalRaiseTimestampUtc: transition.OriginalRaiseTimestampUtc,
        AlarmCategory: alarmCategory);

    try
    {
        OnAlarmEvent?.Invoke(this, args);
    }
    catch (Exception ex)
    {
        _logger.LogWarning(ex,
            "GalaxyDriver OnAlarmEvent handler threw for {AlarmRef} — continuing.",
            transition.AlarmFullReference);
    }
}
/// <summary>
/// Handler attached to the shared pump's <c>OnDataChange</c>. Re-raises every event
/// on the public <see cref="OnDataChange"/>, then — when a probe watcher exists and
/// the reference ends with <see cref="PerPlatformProbeWatcher.ProbeSuffix"/> — also
/// hands the value to the watcher, so the watcher does not have to consume the
/// event stream itself.
/// </summary>
/// <param name="sender">Event sender; not used.</param>
/// <param name="args">The data-change event coming from the pump.</param>
private void OnPumpDataChange(object? sender, DataChangeEventArgs args)
{
    // NOTE(review): unlike the alarm path, a throwing OnDataChange subscriber is not
    // caught here, so it would also skip the probe routing below — confirm intended.
    OnDataChange?.Invoke(this, args);

    if (_probeWatcher is null)
    {
        return;
    }

    var isProbeReference = args.FullReference.EndsWith(
        PerPlatformProbeWatcher.ProbeSuffix, StringComparison.OrdinalIgnoreCase);
    if (!isProbeReference)
    {
        return;
    }

    // The probe decoder takes a raw quality byte. Recover it via the canonical
    // StatusCodeMap.ToQualityCategoryByte helper so the mapping lives in one place.
    var qualityByte = StatusCodeMap.ToQualityCategoryByte(args.Snapshot.StatusCode);
    _probeWatcher.OnProbeValueChanged(args.FullReference, args.Snapshot.Value, qualityByte);
}
/// <summary>
/// Builds the default <see cref="IGalaxyHierarchySource"/> on top of a
/// <see cref="GalaxyRepositoryClient"/> created from <c>_options.Gateway</c>. The
/// client is stored in <c>_ownedRepositoryClient</c> and disposed by
/// <see cref="DisposeAsync"/>. Tests bypass this by injecting their own source via
/// the internal ctor.
/// </summary>
/// <returns>A traced wrapper around the gateway-backed hierarchy source.</returns>
private IGalaxyHierarchySource BuildDefaultHierarchySource()
{
    // Driver.Galaxy-009 fix: only create a client when none exists yet, so a client
    // that StartDeployWatcher may already have stored is reused instead of being
    // overwritten and leaked.
    if (_ownedRepositoryClient is null)
    {
        _ownedRepositoryClient = GalaxyRepositoryClient.Create(BuildClientOptions(_options.Gateway));
    }

    var gatewaySource = new GatewayGalaxyHierarchySource(_ownedRepositoryClient);
    return new TracedGalaxyHierarchySource(gatewaySource, _options.MxAccess.ClientName);
}
/// <summary>
/// Asynchronous disposal. Prefer <c>await using</c> over <c>using</c> — this path
/// awaits EventPump / alarm feed / session / client shutdown instead of blocking on
/// it (Driver.Galaxy-007). Every sub-component is torn down inside its own
/// try/catch, so one failing shutdown is logged and the remaining ones still run.
/// Calls after the first return immediately.
/// </summary>
/// <returns>A task-like that completes once every owned sub-component has been shut down.</returns>
public async ValueTask DisposeAsync()
{
    if (_disposed)
    {
        return;
    }

    _disposed = true;

    // Synchronously-disposable sub-components go first.
    try
    {
        _deployWatcher?.Dispose();
    }
    catch (Exception ex)
    {
        _logger.LogWarning(ex, "DeployWatcher dispose failed");
    }

    try
    {
        _supervisor?.Dispose();
    }
    catch (Exception ex)
    {
        _logger.LogWarning(ex, "ReconnectSupervisor dispose failed");
    }

    try
    {
        _probeWatcher?.Dispose();
    }
    catch (Exception ex)
    {
        _logger.LogWarning(ex, "ProbeWatcher dispose failed");
    }

    try
    {
        _transportForwarder?.Dispose();
    }
    catch (Exception ex)
    {
        _logger.LogWarning(ex, "Transport forwarder dispose failed");
    }

    // Async sub-components: each one is awaited in turn. The pump and the alarm feed
    // are detached from their fields under their own locks, then disposed outside
    // the lock.
    EventPump? detachedPump;
    lock (_pumpLock)
    {
        detachedPump = _eventPump;
        _eventPump = null;
    }

    if (detachedPump is not null)
    {
        try
        {
            await detachedPump.DisposeAsync().ConfigureAwait(false);
        }
        catch (Exception ex)
        {
            _logger.LogWarning(ex, "EventPump dispose failed");
        }
    }

    IGalaxyAlarmFeed? detachedFeed;
    lock (_alarmFeedLock)
    {
        detachedFeed = _alarmFeed;
        _alarmFeed = null;
    }

    if (detachedFeed is not null)
    {
        try
        {
            await detachedFeed.DisposeAsync().ConfigureAwait(false);
        }
        catch (Exception ex)
        {
            _logger.LogWarning(ex, "Alarm feed dispose failed");
        }
    }

    // Owned gateway objects: session, then MX client, then repository client. Each
    // field is cleared whether or not its shutdown succeeded.
    if (_ownedMxSession is { } session)
    {
        try
        {
            await session.DisposeAsync().ConfigureAwait(false);
        }
        catch (Exception ex)
        {
            _logger.LogWarning(ex, "MxSession dispose failed");
        }

        _ownedMxSession = null;
    }

    if (_ownedMxClient is { } mxClient)
    {
        try
        {
            await mxClient.DisposeAsync().ConfigureAwait(false);
        }
        catch (Exception ex)
        {
            _logger.LogWarning(ex, "MxClient dispose failed");
        }

        _ownedMxClient = null;
    }

    if (_ownedRepositoryClient is { } repositoryClient)
    {
        try
        {
            await repositoryClient.DisposeAsync().ConfigureAwait(false);
        }
        catch (Exception ex)
        {
            _logger.LogWarning(ex, "RepositoryClient dispose failed");
        }

        _ownedRepositoryClient = null;
    }

    _hierarchySource = null;
}
/// <summary>
/// Synchronous disposal. Runs <see cref="DisposeAsync"/> and blocks the calling
/// thread until it has finished, so prefer <see cref="DisposeAsync"/> wherever the
/// caller can <c>await</c>. Provided for <c>using</c> statements that cannot.
/// </summary>
public void Dispose()
{
    var shutdown = DisposeAsync().AsTask();
    shutdown.GetAwaiter().GetResult();
}
/// <summary>
/// Decorator around an <see cref="IAddressSpaceBuilder"/> that stores each
/// variable's <see cref="DriverAttributeInfo.SecurityClass"/> in a shared map, keyed
/// by <see cref="DriverAttributeInfo.FullName"/>, before forwarding the call to the
/// wrapped builder. The driver's discovery path wraps the caller's builder with it
/// so per-tag classifications are available to the write path, without the
/// discoverer having to know about the driver's internal state.
/// </summary>
private sealed class SecurityCapturingBuilder : IAddressSpaceBuilder
{
    private readonly IAddressSpaceBuilder _inner;
    private readonly System.Collections.Concurrent.ConcurrentDictionary<string, SecurityClassification> _map;

    /// <summary>Wraps <paramref name="inner"/> and records classifications into <paramref name="map"/>.</summary>
    /// <param name="inner">Builder that receives every forwarded call.</param>
    /// <param name="map">Destination for the captured classifications.</param>
    public SecurityCapturingBuilder(
        IAddressSpaceBuilder inner,
        System.Collections.Concurrent.ConcurrentDictionary<string, SecurityClassification> map)
    {
        _inner = inner;
        _map = map;
    }

    /// <summary>
    /// Creates the folder on the wrapped builder and wraps the resulting child
    /// builder as well, so variables added below the folder are captured into the
    /// same map.
    /// </summary>
    public IAddressSpaceBuilder Folder(string browseName, string displayName)
    {
        var childBuilder = _inner.Folder(browseName, displayName);
        return new SecurityCapturingBuilder(childBuilder, _map);
    }

    /// <summary>
    /// Records the variable's security class (an existing entry for the same full
    /// name is overwritten), then forwards to the wrapped builder.
    /// </summary>
    public IVariableHandle Variable(string browseName, string displayName, DriverAttributeInfo attributeInfo)
    {
        _map[attributeInfo.FullName] = attributeInfo.SecurityClass;
        return _inner.Variable(browseName, displayName, attributeInfo);
    }

    /// <summary>Forwards unchanged to the wrapped builder; nothing is captured.</summary>
    public void AddProperty(string browseName, DriverDataType dataType, object? value)
    {
        _inner.AddProperty(browseName, dataType, value);
    }
}
}