fix(driver-galaxy): resolve Low code-review findings (Driver.Galaxy-005,010,012,013)

- Driver.Galaxy-005: rewrite the EventPump BoundedChannelOptions comment
  to honestly describe the Wait+TryWrite pattern.
- Driver.Galaxy-010: ResolveApiKey now warns when a literal API key is
  used in production wiring; added an explicit dev: prefix for known
  cleartext-in-dev cases and rewrote the GalaxyGatewayOptions doc.
- Driver.Galaxy-012: O(1) reverse-lookup for SubscriptionRegistry
  dispatch via per-entry FullRefByItemHandle map; immutable hash-set for
  the cross-binding reverse map; SubscribeAsync / ReadViaSubscribeOnce
  use BuildResultIndex for per-reference correlation.
- Driver.Galaxy-013: ReinitializeAsync now validates the incoming JSON
  against the running options; ReplayOnSessionLost honoured by the
  Replay path; class summary rewritten to describe the shipped surface.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Joseph Doherty
2026-05-23 07:45:08 -04:00
parent 5c513f99fd
commit 9f7ae20995
9 changed files with 444 additions and 58 deletions

View File

@@ -11,19 +11,29 @@ using ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Runtime;
namespace ZB.MOM.WW.OtOpcUa.Driver.Galaxy;
/// <summary>
/// In-process .NET 10 Galaxy driver — the v2 replacement for the Galaxy.Host /
/// Galaxy.Proxy pair. PR 4.0 ships the project skeleton with <see cref="IDriver"/>
/// bodies that wire to a future <c>IGalaxyGatewayClient</c> abstraction. Capability
/// interfaces (browse, read, write, subscribe, history routing, host probes) land in
/// PRs 4.14.7; the wiring sequence keeps every intermediate state buildable so the
/// <c>Galaxy:Backend</c> flag (PR 4.W) can flip between legacy-host and mxgateway
/// for parity testing.
/// In-process .NET 10 Galaxy driver — the only Galaxy backend since PR 7.2 retired
/// the legacy <c>Galaxy.Host</c> / <c>Galaxy.Proxy</c> / <c>Galaxy.Shared</c>
/// projects and the <c>OtOpcUaGalaxyHost</c> Windows service. Implements the full
/// capability surface: <see cref="ITagDiscovery"/>, <see cref="IReadable"/>,
/// <see cref="IWritable"/>, <see cref="ISubscribable"/>, <see cref="IRediscoverable"/>,
/// <see cref="IHostConnectivityProbe"/>, and <see cref="IAlarmSource"/>. Galaxy
/// access flows through the in-process driver over gRPC to the separately
/// installed <c>mxaccessgw</c> gateway (sibling repo), which owns the MXAccess
/// COM apartment server-side.
/// </summary>
/// <remarks>
/// This driver is registered as a Tier A in-process driver alongside Modbus / S7 / etc.
/// The legacy <c>GalaxyProxyDriver</c> (Driver.Galaxy.Proxy) coexists until PR 7.2;
/// <see cref="GalaxyDriverFactoryExtensions"/> registers under driver-type name
/// "GalaxyMxGateway" so both paths can be live simultaneously during parity testing.
/// <para>
/// Registered as a Tier A in-process driver alongside Modbus / S7 / etc. via
/// <see cref="GalaxyDriverFactoryExtensions"/> under driver-type name
/// "GalaxyMxGateway".
/// </para>
/// <para>
/// Tests inject capability seams (<see cref="IGalaxyHierarchySource"/>,
/// <see cref="IGalaxyDataReader"/>, <see cref="IGalaxyDataWriter"/>,
/// <see cref="IGalaxySubscriber"/>, <see cref="IGalaxyAlarmAcknowledger"/>,
/// <see cref="IGalaxyAlarmFeed"/>) via the internal ctor so capability flow
/// can be exercised without a real gw round-trip.
/// </para>
/// </remarks>
public sealed class GalaxyDriver
: IDriver, ITagDiscovery, IReadable, IWritable, ISubscribable, IRediscoverable, IHostConnectivityProbe, IAlarmSource, IDisposable, IAsyncDisposable
@@ -171,6 +181,16 @@ public sealed class GalaxyDriver
/// <summary>Test-visible options snapshot.</summary>
internal GalaxyDriverOptions Options => _options;
/// <summary>
/// Test-visible entry into <see cref="ReplayAsync"/>. The supervisor's
/// <see cref="ReconnectSupervisor.ReportTransportFailure"/> drives this on a
/// background task in production; tests prefer to invoke it directly so the
/// <see cref="GalaxyReconnectOptions.ReplayOnSessionLost"/> branch can be
/// asserted deterministically (Driver.Galaxy-013).
/// </summary>
internal Task InvokeReplayForTestAsync(CancellationToken cancellationToken) =>
ReplayAsync(cancellationToken);
/// <inheritdoc />
public async Task InitializeAsync(string driverConfigJson, CancellationToken cancellationToken)
{
@@ -275,6 +295,20 @@ public sealed class GalaxyDriver
var entries = _subscriptions.SnapshotEntries();
if (entries.Count == 0) return;
// Driver.Galaxy-013: honor ReplayOnSessionLost. When operators opt out (false)
// we skip the per-tag SubscribeBulk fan-out — they're delegating to the
// gateway's session-level ReplaySubscriptions or accept post-reconnect tag
// loss. We still restart the EventPump so a future Subscribe call lands on
// a live consumer.
if (!_options.Reconnect.ReplayOnSessionLost)
{
RestartEventPumpForReplay();
_logger.LogInformation(
"GalaxyDriver {InstanceId} reconnect replay skipped — ReplayOnSessionLost=false ({SubCount} subscriptions tracked)",
_driverInstanceId, entries.Count);
return;
}
// The stream-fault that triggered this recovery left the old pump's RunAsync loop
// exited and its channel completed; EventPump.Start() is a no-op on a non-null but
// completed loop. Recreate the pump so the replayed subscriptions have a consumer.
@@ -380,7 +414,7 @@ public sealed class GalaxyDriver
}
/// <summary>
/// Resolves <c>Gateway.ApiKeySecretRef</c> to the actual API-key bytes. Three
/// Resolves <c>Gateway.ApiKeySecretRef</c> to the actual API-key bytes. Four
/// forms supported, evaluated in order:
/// <list type="number">
/// <item><c>env:NAME</c> — reads <c>Environment.GetEnvironmentVariable(NAME)</c>.
@@ -389,13 +423,26 @@ public sealed class GalaxyDriver
/// <item><c>file:PATH</c> — reads UTF-8 text from <c>PATH</c>, trimming
/// whitespace. Lets operators stash the key in an ACL'd file outside the
/// repo (the same pattern as the legacy <c>.local/galaxy-host-secret.txt</c>).</item>
/// <item>Anything else — used as the literal API key. Convenient for dev,
/// and avoids breaking existing configs that pre-date this resolver.</item>
/// <item><c>dev:KEY</c> — explicit cleartext literal. The <c>dev:</c> prefix
/// is a deliberate opt-in signal (dev box, parity rig) so the resolver
/// doesn't emit a warning; production should never use this arm.</item>
/// <item>Anything else — used as the literal API key for back-compat with
/// configs that pre-date this resolver. When a logger is supplied the
/// resolver emits a startup warning so an operator who accidentally
/// committed a cleartext key sees it (Driver.Galaxy-010).</item>
/// </list>
/// A future PR can swap any of these arms for a DPAPI-backed lookup without
/// changing the call site.
/// </summary>
internal static string ResolveApiKey(string secretRef)
internal static string ResolveApiKey(string secretRef) => ResolveApiKey(secretRef, logger: null);
/// <summary>
/// Logger-aware overload. Emits a <see cref="LogLevel.Warning"/> if the secret
/// ref falls through to the back-compat literal arm (an unprefixed cleartext
/// API key in <c>DriverConfig</c> JSON). The <c>dev:</c> prefix is the explicit
/// opt-in path that doesn't warn.
/// </summary>
internal static string ResolveApiKey(string secretRef, ILogger? logger)
{
ArgumentException.ThrowIfNullOrEmpty(secretRef);
@@ -424,13 +471,30 @@ public sealed class GalaxyDriver
$"Galaxy.Gateway.ApiKeySecretRef='{secretRef}' file '{path}' is empty.");
}
if (secretRef.StartsWith("dev:", StringComparison.OrdinalIgnoreCase))
{
// Explicit dev opt-in — no warning, the operator deliberately chose a
// cleartext literal (dev box, parity rig).
return secretRef[4..];
}
// Back-compat literal arm. An unprefixed string is treated as the literal
// API key — but emit a warning so an operator who accidentally committed a
// cleartext key into DriverConfig sees it at startup. Use the dev: prefix
// to suppress this warning when the literal is intentional.
logger?.LogWarning(
"Galaxy.Gateway.ApiKeySecretRef is being treated as a literal cleartext API key. " +
"Prefer env:NAME, file:PATH, or the explicit dev:KEY prefix for dev rigs — " +
"a literal key in DriverConfig JSON is stored in cleartext in the central config DB.");
return secretRef;
}
private static MxGatewayClientOptions BuildClientOptions(GalaxyGatewayOptions gw) => new()
private MxGatewayClientOptions BuildClientOptions(GalaxyGatewayOptions gw) => new()
{
Endpoint = new Uri(gw.Endpoint, UriKind.Absolute),
ApiKey = ResolveApiKey(gw.ApiKeySecretRef),
// Driver.Galaxy-010: pass the logger so the literal-arm cleartext fallback
// surfaces a startup warning rather than silently shipping the key.
ApiKey = ResolveApiKey(gw.ApiKeySecretRef, _logger),
UseTls = gw.UseTls,
CaCertificatePath = gw.CaCertificatePath,
ConnectTimeout = TimeSpan.FromSeconds(gw.ConnectTimeoutSeconds),
@@ -463,15 +527,64 @@ public sealed class GalaxyDriver
}
/// <inheritdoc />
/// <remarks>
/// <para>
/// In-place config reapply. The driver does not currently support
/// hot-swapping <see cref="GalaxyDriverOptions"/> at runtime — changing the
/// gateway endpoint, MxAccess client name, or reconnect policy requires
/// tearing down the gw session, supervisor, event pump, and address space.
/// The host stack handles that via DriverInstance restart, so this method
/// only accepts an equivalent config (no meaningful change) and refreshes
/// health; a non-equivalent reapply throws <see cref="NotSupportedException"/>
/// so the caller knows the change wasn't applied (Driver.Galaxy-013:
/// previously the method silently ignored <c>driverConfigJson</c>).
/// </para>
/// </remarks>
public Task ReinitializeAsync(string driverConfigJson, CancellationToken cancellationToken)
{
// In-place config reapply. PR 4.5's reconnect supervisor will swap the
// gateway-client options under the lock; for the skeleton we just refresh health.
ObjectDisposedException.ThrowIf(_disposed, this);
if (!string.IsNullOrWhiteSpace(driverConfigJson))
{
// Materialise the incoming config and compare against the live options. We
// refuse any change that would require a session teardown rather than
// pretending to apply it.
GalaxyDriverOptions incoming;
try
{
// Reuse the factory's parse pipeline so any missing-required-field
// error surfaces with the same diagnostic text as InitializeAsync.
var transient = GalaxyDriverFactoryExtensions.CreateInstance(_driverInstanceId, driverConfigJson);
incoming = transient.Options;
// The transient instance never started a runtime — disposing is cheap.
transient.Dispose();
}
catch (Exception ex) when (ex is not NotSupportedException and not ObjectDisposedException)
{
throw new NotSupportedException(
$"GalaxyDriver.ReinitializeAsync could not parse the incoming DriverConfig JSON for '{_driverInstanceId}': {ex.Message}",
ex);
}
if (!OptionsAreEquivalent(_options, incoming))
{
throw new NotSupportedException(
"GalaxyDriver.ReinitializeAsync does not support hot-swapping driver options at runtime " +
"(gateway endpoint, MxAccess client name, reconnect policy, etc.). Restart the DriverInstance " +
"through the host stack to apply a config change.");
}
}
_health = new DriverHealth(DriverState.Healthy, DateTime.UtcNow, null);
return Task.CompletedTask;
}
/// <summary>
/// Compare two <see cref="GalaxyDriverOptions"/> for runtime equivalence — every
/// field that drives gw session shape, address space, or reconnect behaviour
/// must match. Records get value-equality from the language, so a direct
/// equality check is enough.
/// </summary>
private static bool OptionsAreEquivalent(GalaxyDriverOptions a, GalaxyDriverOptions b) => a == b;
/// <inheritdoc />
public Task ShutdownAsync(CancellationToken cancellationToken)
{
@@ -637,12 +750,18 @@ public sealed class GalaxyDriver
}
// Register bindings so the pump knows to dispatch events for these handles.
// Driver.Galaxy-012: index the SubscribeBulk results once and correlate to
// references in O(1) instead of FirstOrDefault per element (O(n²) over the
// batch). On the 50k-tag soak path this turns a 2.5G-comparison loop into
// a single Dictionary build + linear scan.
var resultIndex = BuildResultIndex(results);
var bindings = new List<TagBinding>(fullReferences.Count);
for (var i = 0; i < fullReferences.Count; i++)
{
var fullRef = fullReferences[i];
var match = results.FirstOrDefault(r => string.Equals(r.TagAddress, fullRef, StringComparison.OrdinalIgnoreCase));
var itemHandle = match is { WasSuccessful: true } ? match.ItemHandle : 0;
var itemHandle = resultIndex.TryGetValue(fullRef, out var match) && match is { WasSuccessful: true }
? match.ItemHandle
: 0;
bindings.Add(new TagBinding(fullRef, itemHandle));
// Tags the gw rejected up front — complete with Bad status now so the
@@ -774,12 +893,15 @@ public sealed class GalaxyDriver
// recorded with a non-positive ItemHandle so the caller can detect partial failure
// by inspecting the returned handle's diagnostic context — full per-tag error
// surface lands in PR 5.3's parity tests.
// Driver.Galaxy-012: index results once, correlate in O(1) per reference rather
// than FirstOrDefault inside the loop (O(n²) on the 50k-tag path).
var resultIndex = BuildResultIndex(results);
var bindings = new List<TagBinding>(fullReferences.Count);
for (var i = 0; i < fullReferences.Count; i++)
{
var fullRef = fullReferences[i];
var match = results.FirstOrDefault(r => string.Equals(r.TagAddress, fullRef, StringComparison.OrdinalIgnoreCase));
var itemHandle = match is { WasSuccessful: true } ? match.ItemHandle : 0;
var hasMatch = resultIndex.TryGetValue(fullRef, out var match);
var itemHandle = hasMatch && match is { WasSuccessful: true } ? match.ItemHandle : 0;
bindings.Add(new TagBinding(fullRef, itemHandle));
if (match is null || !match.WasSuccessful)
{