fix(driver-galaxy): resolve Low code-review findings (Driver.Galaxy-005,010,012,013)

- Driver.Galaxy-005: rewrite the EventPump BoundedChannelOptions comment
  to honestly describe the Wait+TryWrite pattern.
- Driver.Galaxy-010: ResolveApiKey now warns when a literal API key is
  used in production wiring; added an explicit dev: prefix for known
  cleartext-in-dev cases and rewrote the GalaxyGatewayOptions doc.
- Driver.Galaxy-012: O(1) reverse-lookup for SubscriptionRegistry
  dispatch via per-entry FullRefByItemHandle map; immutable hash-set for
  the cross-binding reverse map; SubscribeAsync / ReadViaSubscribeOnce
  use BuildResultIndex for per-reference correlation.
- Driver.Galaxy-013: ReinitializeAsync now validates the incoming JSON
  against the running options; ReplayOnSessionLost honoured by the
  Replay path; class summary rewritten to describe the shipped surface.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Joseph Doherty
2026-05-23 07:45:08 -04:00
parent 5c513f99fd
commit 9f7ae20995
9 changed files with 444 additions and 58 deletions

View File

@@ -18,9 +18,20 @@ public sealed record GalaxyDriverOptions(
GalaxyReconnectOptions Reconnect);
/// <summary>
/// Connection details for the MxAccess gateway. <see cref="ApiKeySecretRef"/> resolves
/// through the server-side secret store (DPAPI for production, environment override for
/// dev) — the API key never appears in cleartext config.
/// Connection details for the MxAccess gateway. <see cref="ApiKeySecretRef"/> is
/// resolved by <c>GalaxyDriver.ResolveApiKey</c> at InitializeAsync time. Four forms
/// supported, in priority order:
/// <list type="bullet">
/// <item><c>env:NAME</c> — read from an environment variable (recommended for
/// production; the central config DB holds only the indirection, not the key).</item>
/// <item><c>file:PATH</c> — read from an ACL'd file outside the repo.</item>
/// <item><c>dev:KEY</c> — explicit cleartext opt-in for dev rigs / parity tests;
/// no startup warning.</item>
/// <item>Anything else — treated as a literal cleartext API key for back-compat.
/// The resolver emits a <c>Warning</c> at startup so an operator who accidentally
/// committed a cleartext key sees it (Driver.Galaxy-010); production should
/// migrate to <c>env:</c> or <c>file:</c>.</item>
/// </list>
/// </summary>
// PR 6.5 tuning notes:
// ConnectTimeoutSeconds = 10 — cold-start network path comfort margin; soak runs

View File

@@ -11,19 +11,29 @@ using ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Runtime;
namespace ZB.MOM.WW.OtOpcUa.Driver.Galaxy;
/// <summary>
/// In-process .NET 10 Galaxy driver — the v2 replacement for the Galaxy.Host /
/// Galaxy.Proxy pair. PR 4.0 ships the project skeleton with <see cref="IDriver"/>
/// bodies that wire to a future <c>IGalaxyGatewayClient</c> abstraction. Capability
/// interfaces (browse, read, write, subscribe, history routing, host probes) land in
/// PRs 4.14.7; the wiring sequence keeps every intermediate state buildable so the
/// <c>Galaxy:Backend</c> flag (PR 4.W) can flip between legacy-host and mxgateway
/// for parity testing.
/// In-process .NET 10 Galaxy driver — the only Galaxy backend since PR 7.2 retired
/// the legacy <c>Galaxy.Host</c> / <c>Galaxy.Proxy</c> / <c>Galaxy.Shared</c>
/// projects and the <c>OtOpcUaGalaxyHost</c> Windows service. Implements the full
/// capability surface: <see cref="ITagDiscovery"/>, <see cref="IReadable"/>,
/// <see cref="IWritable"/>, <see cref="ISubscribable"/>, <see cref="IRediscoverable"/>,
/// <see cref="IHostConnectivityProbe"/>, and <see cref="IAlarmSource"/>. Galaxy
/// access flows through the in-process driver over gRPC to the separately
/// installed <c>mxaccessgw</c> gateway (sibling repo), which owns the MXAccess
/// COM apartment server-side.
/// </summary>
/// <remarks>
/// This driver is registered as a Tier A in-process driver alongside Modbus / S7 / etc.
/// The legacy <c>GalaxyProxyDriver</c> (Driver.Galaxy.Proxy) coexists until PR 7.2;
/// <see cref="GalaxyDriverFactoryExtensions"/> registers under driver-type name
/// "GalaxyMxGateway" so both paths can be live simultaneously during parity testing.
/// <para>
/// Registered as a Tier A in-process driver alongside Modbus / S7 / etc. via
/// <see cref="GalaxyDriverFactoryExtensions"/> under driver-type name
/// "GalaxyMxGateway".
/// </para>
/// <para>
/// Tests inject capability seams (<see cref="IGalaxyHierarchySource"/>,
/// <see cref="IGalaxyDataReader"/>, <see cref="IGalaxyDataWriter"/>,
/// <see cref="IGalaxySubscriber"/>, <see cref="IGalaxyAlarmAcknowledger"/>,
/// <see cref="IGalaxyAlarmFeed"/>) via the internal ctor so capability flow
/// can be exercised without a real gw round-trip.
/// </para>
/// </remarks>
public sealed class GalaxyDriver
: IDriver, ITagDiscovery, IReadable, IWritable, ISubscribable, IRediscoverable, IHostConnectivityProbe, IAlarmSource, IDisposable, IAsyncDisposable
@@ -171,6 +181,16 @@ public sealed class GalaxyDriver
/// <summary>Test-visible options snapshot.</summary>
internal GalaxyDriverOptions Options => _options;
/// <summary>
/// Test-visible entry into <see cref="ReplayAsync"/>. The supervisor's
/// <see cref="ReconnectSupervisor.ReportTransportFailure"/> drives this on a
/// background task in production; tests prefer to invoke it directly so the
/// <see cref="GalaxyReconnectOptions.ReplayOnSessionLost"/> branch can be
/// asserted deterministically (Driver.Galaxy-013).
/// </summary>
internal Task InvokeReplayForTestAsync(CancellationToken cancellationToken) =>
ReplayAsync(cancellationToken);
/// <inheritdoc />
public async Task InitializeAsync(string driverConfigJson, CancellationToken cancellationToken)
{
@@ -275,6 +295,20 @@ public sealed class GalaxyDriver
var entries = _subscriptions.SnapshotEntries();
if (entries.Count == 0) return;
// Driver.Galaxy-013: honor ReplayOnSessionLost. When operators opt out (false)
// we skip the per-tag SubscribeBulk fan-out — they're delegating to the
// gateway's session-level ReplaySubscriptions or accept post-reconnect tag
// loss. We still restart the EventPump so a future Subscribe call lands on
// a live consumer.
if (!_options.Reconnect.ReplayOnSessionLost)
{
RestartEventPumpForReplay();
_logger.LogInformation(
"GalaxyDriver {InstanceId} reconnect replay skipped — ReplayOnSessionLost=false ({SubCount} subscriptions tracked)",
_driverInstanceId, entries.Count);
return;
}
// The stream-fault that triggered this recovery left the old pump's RunAsync loop
// exited and its channel completed; EventPump.Start() is a no-op on a non-null but
// completed loop. Recreate the pump so the replayed subscriptions have a consumer.
@@ -380,7 +414,7 @@ public sealed class GalaxyDriver
}
/// <summary>
/// Resolves <c>Gateway.ApiKeySecretRef</c> to the actual API-key bytes. Three
/// Resolves <c>Gateway.ApiKeySecretRef</c> to the actual API-key bytes. Four
/// forms supported, evaluated in order:
/// <list type="number">
/// <item><c>env:NAME</c> — reads <c>Environment.GetEnvironmentVariable(NAME)</c>.
@@ -389,13 +423,26 @@ public sealed class GalaxyDriver
/// <item><c>file:PATH</c> — reads UTF-8 text from <c>PATH</c>, trimming
/// whitespace. Lets operators stash the key in an ACL'd file outside the
/// repo (the same pattern as the legacy <c>.local/galaxy-host-secret.txt</c>).</item>
/// <item>Anything else — used as the literal API key. Convenient for dev,
/// and avoids breaking existing configs that pre-date this resolver.</item>
/// <item><c>dev:KEY</c> — explicit cleartext literal. The <c>dev:</c> prefix
/// is a deliberate opt-in signal (dev box, parity rig) so the resolver
/// doesn't emit a warning; production should never use this arm.</item>
/// <item>Anything else — used as the literal API key for back-compat with
/// configs that pre-date this resolver. When a logger is supplied the
/// resolver emits a startup warning so an operator who accidentally
/// committed a cleartext key sees it (Driver.Galaxy-010).</item>
/// </list>
/// A future PR can swap any of these arms for a DPAPI-backed lookup without
/// changing the call site.
/// </summary>
internal static string ResolveApiKey(string secretRef)
internal static string ResolveApiKey(string secretRef) => ResolveApiKey(secretRef, logger: null);
/// <summary>
/// Logger-aware overload. Emits a <see cref="LogLevel.Warning"/> if the secret
/// ref falls through to the back-compat literal arm (an unprefixed cleartext
/// API key in <c>DriverConfig</c> JSON). The <c>dev:</c> prefix is the explicit
/// opt-in path that doesn't warn.
/// </summary>
internal static string ResolveApiKey(string secretRef, ILogger? logger)
{
ArgumentException.ThrowIfNullOrEmpty(secretRef);
@@ -424,13 +471,30 @@ public sealed class GalaxyDriver
$"Galaxy.Gateway.ApiKeySecretRef='{secretRef}' file '{path}' is empty.");
}
if (secretRef.StartsWith("dev:", StringComparison.OrdinalIgnoreCase))
{
// Explicit dev opt-in — no warning, the operator deliberately chose a
// cleartext literal (dev box, parity rig).
return secretRef[4..];
}
// Back-compat literal arm. An unprefixed string is treated as the literal
// API key — but emit a warning so an operator who accidentally committed a
// cleartext key into DriverConfig sees it at startup. Use the dev: prefix
// to suppress this warning when the literal is intentional.
logger?.LogWarning(
"Galaxy.Gateway.ApiKeySecretRef is being treated as a literal cleartext API key. " +
"Prefer env:NAME, file:PATH, or the explicit dev:KEY prefix for dev rigs — " +
"a literal key in DriverConfig JSON is stored in cleartext in the central config DB.");
return secretRef;
}
private static MxGatewayClientOptions BuildClientOptions(GalaxyGatewayOptions gw) => new()
private MxGatewayClientOptions BuildClientOptions(GalaxyGatewayOptions gw) => new()
{
Endpoint = new Uri(gw.Endpoint, UriKind.Absolute),
ApiKey = ResolveApiKey(gw.ApiKeySecretRef),
// Driver.Galaxy-010: pass the logger so the literal-arm cleartext fallback
// surfaces a startup warning rather than silently shipping the key.
ApiKey = ResolveApiKey(gw.ApiKeySecretRef, _logger),
UseTls = gw.UseTls,
CaCertificatePath = gw.CaCertificatePath,
ConnectTimeout = TimeSpan.FromSeconds(gw.ConnectTimeoutSeconds),
@@ -463,15 +527,64 @@ public sealed class GalaxyDriver
}
/// <inheritdoc />
/// <remarks>
/// <para>
/// In-place config reapply. The driver does not currently support
/// hot-swapping <see cref="GalaxyDriverOptions"/> at runtime — changing the
/// gateway endpoint, MxAccess client name, or reconnect policy requires
/// tearing down the gw session, supervisor, event pump, and address space.
/// The host stack handles that via DriverInstance restart, so this method
/// only accepts an equivalent config (no meaningful change) and refreshes
/// health; a non-equivalent reapply throws <see cref="NotSupportedException"/>
/// so the caller knows the change wasn't applied (Driver.Galaxy-013:
/// previously the method silently ignored <c>driverConfigJson</c>).
/// </para>
/// </remarks>
public Task ReinitializeAsync(string driverConfigJson, CancellationToken cancellationToken)
{
// In-place config reapply. PR 4.5's reconnect supervisor will swap the
// gateway-client options under the lock; for the skeleton we just refresh health.
ObjectDisposedException.ThrowIf(_disposed, this);
if (!string.IsNullOrWhiteSpace(driverConfigJson))
{
// Materialise the incoming config and compare against the live options. We
// refuse any change that would require a session teardown rather than
// pretending to apply it.
GalaxyDriverOptions incoming;
try
{
// Reuse the factory's parse pipeline so any missing-required-field
// error surfaces with the same diagnostic text as InitializeAsync.
var transient = GalaxyDriverFactoryExtensions.CreateInstance(_driverInstanceId, driverConfigJson);
incoming = transient.Options;
// The transient instance never started a runtime — disposing is cheap.
transient.Dispose();
}
catch (Exception ex) when (ex is not NotSupportedException and not ObjectDisposedException)
{
throw new NotSupportedException(
$"GalaxyDriver.ReinitializeAsync could not parse the incoming DriverConfig JSON for '{_driverInstanceId}': {ex.Message}",
ex);
}
if (!OptionsAreEquivalent(_options, incoming))
{
throw new NotSupportedException(
"GalaxyDriver.ReinitializeAsync does not support hot-swapping driver options at runtime " +
"(gateway endpoint, MxAccess client name, reconnect policy, etc.). Restart the DriverInstance " +
"through the host stack to apply a config change.");
}
}
_health = new DriverHealth(DriverState.Healthy, DateTime.UtcNow, null);
return Task.CompletedTask;
}
/// <summary>
/// Compare two <see cref="GalaxyDriverOptions"/> for runtime equivalence — every
/// field that drives gw session shape, address space, or reconnect behaviour
/// must match. Records get value-equality from the language, so a direct
/// equality check is enough.
/// </summary>
private static bool OptionsAreEquivalent(GalaxyDriverOptions a, GalaxyDriverOptions b) => a == b;
/// <inheritdoc />
public Task ShutdownAsync(CancellationToken cancellationToken)
{
@@ -637,12 +750,18 @@ public sealed class GalaxyDriver
}
// Register bindings so the pump knows to dispatch events for these handles.
// Driver.Galaxy-012: index the SubscribeBulk results once and correlate to
// references in O(1) instead of FirstOrDefault per element (O(n²) over the
// batch). On the 50k-tag soak path this turns a 2.5G-comparison loop into
// a single Dictionary build + linear scan.
var resultIndex = BuildResultIndex(results);
var bindings = new List<TagBinding>(fullReferences.Count);
for (var i = 0; i < fullReferences.Count; i++)
{
var fullRef = fullReferences[i];
var match = results.FirstOrDefault(r => string.Equals(r.TagAddress, fullRef, StringComparison.OrdinalIgnoreCase));
var itemHandle = match is { WasSuccessful: true } ? match.ItemHandle : 0;
var itemHandle = resultIndex.TryGetValue(fullRef, out var match) && match is { WasSuccessful: true }
? match.ItemHandle
: 0;
bindings.Add(new TagBinding(fullRef, itemHandle));
// Tags the gw rejected up front — complete with Bad status now so the
@@ -774,12 +893,15 @@ public sealed class GalaxyDriver
// recorded with a non-positive ItemHandle so the caller can detect partial failure
// by inspecting the returned handle's diagnostic context — full per-tag error
// surface lands in PR 5.3's parity tests.
// Driver.Galaxy-012: index results once, correlate in O(1) per reference rather
// than FirstOrDefault inside the loop (O(n²) on the 50k-tag path).
var resultIndex = BuildResultIndex(results);
var bindings = new List<TagBinding>(fullReferences.Count);
for (var i = 0; i < fullReferences.Count; i++)
{
var fullRef = fullReferences[i];
var match = results.FirstOrDefault(r => string.Equals(r.TagAddress, fullRef, StringComparison.OrdinalIgnoreCase));
var itemHandle = match is { WasSuccessful: true } ? match.ItemHandle : 0;
var hasMatch = resultIndex.TryGetValue(fullRef, out var match);
var itemHandle = hasMatch && match is { WasSuccessful: true } ? match.ItemHandle : 0;
bindings.Add(new TagBinding(fullRef, itemHandle));
if (match is null || !match.WasSuccessful)
{

View File

@@ -85,9 +85,14 @@ internal sealed class EventPump : IAsyncDisposable
}
_channel = Channel.CreateBounded<MxEvent>(new BoundedChannelOptions(channelCapacity)
{
// Newest-dropped policy: when full, the producer's TryWrite returns false
// and we account for the drop. We do this manually rather than relying on
// BoundedChannelFullMode.DropWrite so we can count drops without polling.
// Newest-dropped semantics: we use FullMode.Wait but never call the
// awaitable WriteAsync — only the synchronous TryWrite below in
// RunAsync. With Wait + TryWrite, a full channel makes TryWrite return
// false immediately, which we account for via the EventsDropped counter.
// We deliberately do NOT use BoundedChannelFullMode.DropWrite — that
// would silently discard the new event inside Channel<T> without
// surfacing the drop on a counter (Driver.Galaxy-005: keep the comment
// and the FullMode value consistent).
FullMode = BoundedChannelFullMode.Wait,
SingleReader = true,
SingleWriter = true,

View File

@@ -1,4 +1,5 @@
using System.Collections.Concurrent;
using System.Collections.Immutable;
namespace ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Runtime;
@@ -18,7 +19,11 @@ namespace ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Runtime;
internal sealed class SubscriptionRegistry
{
private readonly ConcurrentDictionary<long, SubscriptionEntry> _bySubscriptionId = new();
private readonly ConcurrentDictionary<int, ConcurrentBag<long>> _subscribersByItemHandle = new();
// Driver.Galaxy-012: use ImmutableHashSet<long> for the reverse map so removal is
// O(log n) instead of "rebuild the entire ConcurrentBag from a LINQ filter on every
// unsubscribe"; reads are lock-free because the immutable snapshot is published
// atomically via ConcurrentDictionary AddOrUpdate.
private readonly ConcurrentDictionary<int, ImmutableHashSet<long>> _subscribersByItemHandle = new();
private long _nextSubscriptionId;
public int TrackedSubscriptionCount => _bySubscriptionId.Count;
@@ -42,7 +47,7 @@ internal sealed class SubscriptionRegistry
_subscribersByItemHandle.AddOrUpdate(
binding.ItemHandle,
_ => [subscriptionId],
(_, bag) => { bag.Add(subscriptionId); return bag; });
(_, set) => set.Add(subscriptionId));
}
}
@@ -57,12 +62,10 @@ internal sealed class SubscriptionRegistry
foreach (var binding in entry.Bindings)
{
if (binding.ItemHandle <= 0) continue;
if (!_subscribersByItemHandle.TryGetValue(binding.ItemHandle, out var bag)) continue;
// Filter the bag to drop this subscription id. ConcurrentBag has no Remove —
// rebuild it from the remaining entries. The contention here is bounded by
// the number of tags in the dropped subscription.
var remaining = new ConcurrentBag<long>(bag.Where(id => id != subscriptionId));
// Driver.Galaxy-012: ImmutableHashSet.Remove is O(log n) and the result is
// published atomically — no need to rebuild from a LINQ filter.
if (!_subscribersByItemHandle.TryGetValue(binding.ItemHandle, out var set)) continue;
var remaining = set.Remove(subscriptionId);
if (remaining.IsEmpty) _subscribersByItemHandle.TryRemove(binding.ItemHandle, out _);
else _subscribersByItemHandle[binding.ItemHandle] = remaining;
}
@@ -74,18 +77,23 @@ internal sealed class SubscriptionRegistry
/// Look up the (subscription id, full reference) pairs that should receive an
/// OnDataChange for the given gw item handle. Returns empty when nobody subscribes.
/// </summary>
/// <remarks>
/// Driver.Galaxy-012: O(1) per subscriber via the per-entry
/// <c>FullRefByItemHandle</c> index, rather than a <c>FirstOrDefault</c> linear
/// scan of the binding list. At 50k tags / 1Hz this turns each dispatch from a
/// 50k-element scan into a single dictionary lookup.
/// </remarks>
public IReadOnlyList<(long SubscriptionId, string FullReference)> ResolveSubscribers(int itemHandle)
{
if (!_subscribersByItemHandle.TryGetValue(itemHandle, out var bag)) return [];
// Each subscription may include the tag once. Walk every active subscription that
// claims this handle and pull the full ref from its binding list.
// claims this handle and pull the full ref from its index in O(1).
var result = new List<(long, string)>();
foreach (var subId in bag.Distinct())
{
if (!_bySubscriptionId.TryGetValue(subId, out var entry)) continue;
var binding = entry.Bindings.FirstOrDefault(b => b.ItemHandle == itemHandle);
if (binding is { FullReference: { } fullRef })
if (entry.FullRefByItemHandle.TryGetValue(itemHandle, out var fullRef))
result.Add((subId, fullRef));
}
return result;
@@ -113,14 +121,14 @@ internal sealed class SubscriptionRegistry
{
if (!_bySubscriptionId.TryGetValue(subscriptionId, out var oldEntry)) return;
// Drop this subscription from every reverse-map bag it currently appears in. The
// Drop this subscription from every reverse-map set it currently appears in. The
// pre-reconnect item handles are stale once the gw re-issues fresh ones.
// Driver.Galaxy-012: ImmutableHashSet.Remove is O(log n) — no LINQ rebuild.
foreach (var binding in oldEntry.Bindings)
{
if (binding.ItemHandle <= 0) continue;
if (!_subscribersByItemHandle.TryGetValue(binding.ItemHandle, out var bag)) continue;
var remaining = new ConcurrentBag<long>(bag.Where(id => id != subscriptionId));
if (!_subscribersByItemHandle.TryGetValue(binding.ItemHandle, out var set)) continue;
var remaining = set.Remove(subscriptionId);
if (remaining.IsEmpty) _subscribersByItemHandle.TryRemove(binding.ItemHandle, out _);
else _subscribersByItemHandle[binding.ItemHandle] = remaining;
}
@@ -132,11 +140,38 @@ internal sealed class SubscriptionRegistry
_subscribersByItemHandle.AddOrUpdate(
binding.ItemHandle,
_ => [subscriptionId],
(_, bag) => { bag.Add(subscriptionId); return bag; });
(_, set) => set.Add(subscriptionId));
}
}
private sealed record SubscriptionEntry(long SubscriptionId, IReadOnlyList<TagBinding> Bindings);
/// <summary>
/// Per-subscription bookkeeping. <see cref="FullRefByItemHandle"/> is an index
/// over <see cref="Bindings"/> keyed by item handle so <c>ResolveSubscribers</c>
/// is O(1) per subscriber instead of a linear scan of every binding
/// (Driver.Galaxy-012). Failed bindings (item handle ≤ 0) are excluded from the
/// index because the EventPump only dispatches for positive handles.
/// </summary>
private sealed class SubscriptionEntry
{
public long SubscriptionId { get; }
public IReadOnlyList<TagBinding> Bindings { get; }
public IReadOnlyDictionary<int, string> FullRefByItemHandle { get; }
public SubscriptionEntry(long subscriptionId, IReadOnlyList<TagBinding> bindings)
{
SubscriptionId = subscriptionId;
Bindings = bindings;
var index = new Dictionary<int, string>(bindings.Count);
foreach (var binding in bindings)
{
if (binding.ItemHandle <= 0) continue; // failed gw subscribe — no events expected
// Last-write-wins on duplicates; the driver doesn't double-register a handle
// within a single subscription, but be defensive.
index[binding.ItemHandle] = binding.FullReference;
}
FullRefByItemHandle = index;
}
}
}
/// <summary>