fix(core): resolve Low code-review findings (Core-004,008,009,010,011,012)

- Core-004: add ConfigureAwait(false) to DriverHost.RegisterAsync /
  UnregisterAsync / DisposeAsync.
- Core-008: rewrite the BuildAddressSpaceAsync XML doc to correctly name
  the caller (OpcUaApplicationHost.PopulateAddressSpaces) that owns the
  per-driver isolation.
- Core-009: snapshot DriverResilienceOptions once per non-idempotent write
  in CapabilityInvoker.ExecuteWriteAsync.
- Core-010: switch DriverResilienceOptions.Resolve to TryGetValue with a
  diagnostic error message when a tier table is missing a capability.
- Core-011: add an optional diagnostic callback to PermissionTrieBuilder
  so production callers can surface scope-path mismatches.
- Core-012: correct the stale WedgeDetector ctor summary and add the
  Reconnecting row to DriverHealthReport's state matrix.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Joseph Doherty
2026-05-23 05:38:09 -04:00
parent ff2e75ab98
commit 8be6afbda4
15 changed files with 656 additions and 28 deletions

View File

@@ -26,11 +26,27 @@ public static class PermissionTrieBuilder
/// Build a trie for one cluster/generation from the supplied rows. The caller is
/// responsible for pre-filtering rows to the target generation + cluster.
/// </summary>
/// <param name="clusterId">Cluster the trie is being built for; rows for other clusters are skipped.</param>
/// <param name="generationId">Config-generation the rows belong to; stamped on the returned trie.</param>
/// <param name="rows">ACL rows for this cluster + generation.</param>
/// <param name="scopePaths">
/// Optional <c>ScopeId</c> → multi-level trie-path lookup. When supplied, sub-cluster rows
/// descend to their structurally-correct trie node. When null, sub-cluster rows fall back
/// to a direct child of the trie root keyed on <c>ScopeId</c> — deterministic-test mode.
/// </param>
/// <param name="diagnostic">
/// Optional callback invoked when a sub-cluster row's <c>ScopeId</c> cannot be located
/// in <paramref name="scopePaths"/>. Production callers should wire a logger here so
/// orphaned grants surface — silently dropping them under the wrong trie level was the
/// Core-011 production hazard. The callback fires only when <paramref name="scopePaths"/>
/// is non-null (a null lookup is the explicit deterministic-test fallback mode).
/// </param>
public static PermissionTrie Build(
string clusterId,
long generationId,
IReadOnlyList<NodeAcl> rows,
IReadOnlyDictionary<string, NodeAclPath>? scopePaths = null)
IReadOnlyDictionary<string, NodeAclPath>? scopePaths = null,
Action<PermissionTrieBuildDiagnostic>? diagnostic = null)
{
ArgumentException.ThrowIfNullOrWhiteSpace(clusterId);
ArgumentNullException.ThrowIfNull(rows);
@@ -45,7 +61,7 @@ public static class PermissionTrieBuilder
var node = row.ScopeKind switch
{
NodeAclScopeKind.Cluster => trie.Root,
_ => Descend(trie.Root, row, scopePaths),
_ => Descend(trie.Root, row, scopePaths, diagnostic),
};
if (node is not null)
@@ -55,16 +71,30 @@ public static class PermissionTrieBuilder
return trie;
}
private static PermissionTrieNode? Descend(PermissionTrieNode root, NodeAcl row, IReadOnlyDictionary<string, NodeAclPath>? scopePaths)
private static PermissionTrieNode? Descend(
PermissionTrieNode root,
NodeAcl row,
IReadOnlyDictionary<string, NodeAclPath>? scopePaths,
Action<PermissionTrieBuildDiagnostic>? diagnostic)
{
if (string.IsNullOrEmpty(row.ScopeId)) return null;
// For sub-cluster scopes the caller supplies a path lookup so we know the containing
// namespace / UnsArea / UnsLine ids. Without a path lookup we fall back to putting the
// row directly under the root using its ScopeId — works for deterministic tests, not
// for production where the hierarchy must be honored.
// for production where the hierarchy must be honored. If a scopePaths lookup IS
// provided but is missing the row's ScopeId, surface a diagnostic so the caller can
// log the orphan instead of silently dropping the grant under an unreachable node.
if (scopePaths is null || !scopePaths.TryGetValue(row.ScopeId, out var path))
{
if (scopePaths is not null)
{
diagnostic?.Invoke(new PermissionTrieBuildDiagnostic(
NodeAclId: row.NodeAclId,
ScopeKind: row.ScopeKind,
ScopeId: row.ScopeId,
Reason: PermissionTrieBuildDiagnosticReason.MissingScopePath));
}
return EnsureChild(root, row.ScopeId);
}
@@ -95,3 +125,30 @@ public static class PermissionTrieBuilder
/// applicable; or (for SystemPlatform kind) NamespaceId / FolderSegment / .../TagId.
/// </param>
public sealed record NodeAclPath(IReadOnlyList<string> Segments);
/// <summary>
/// Diagnostic emitted by <see cref="PermissionTrieBuilder.Build"/> when a row could not be
/// placed at its structurally-correct trie node. Production callers should log these so
/// orphaned grants surface instead of being silently dropped under an unreachable node
/// (Core-011).
/// </summary>
/// <remarks>
/// Delivered through the optional <c>diagnostic</c> callback of
/// <see cref="PermissionTrieBuilder.Build"/>; when no callback is supplied nothing is
/// reported. The diagnostic is purely informational — after reporting, the builder still
/// attaches the row as a direct child of the trie root keyed on its <c>ScopeId</c>.
/// </remarks>
/// <param name="NodeAclId">The offending row's logical id (copied from the row's <c>NodeAclId</c>).</param>
/// <param name="ScopeKind">The row's <see cref="NodeAclScopeKind"/>.</param>
/// <param name="ScopeId">The row's <c>ScopeId</c> that could not be located in the path lookup.</param>
/// <param name="Reason">Why the diagnostic fired; see <see cref="PermissionTrieBuildDiagnosticReason"/>.</param>
public sealed record PermissionTrieBuildDiagnostic(
string NodeAclId,
NodeAclScopeKind ScopeKind,
string ScopeId,
PermissionTrieBuildDiagnosticReason Reason);
/// <summary>Reasons <see cref="PermissionTrieBuildDiagnostic"/> can be emitted.</summary>
public enum PermissionTrieBuildDiagnosticReason
{
/// <summary>
/// The row's <c>ScopeId</c> was not present in the supplied <c>scopePaths</c> lookup.
/// The grant is placed as a direct child of the trie root keyed on <c>ScopeId</c> — a
/// position the production trie walker cannot reach for multi-level scopes.
/// Reported only when a lookup was actually supplied; a null lookup is the explicit
/// deterministic-test fallback and does not raise this reason.
/// </summary>
MissingScopePath,
}

View File

@@ -52,7 +52,7 @@ public sealed class DriverHost : IAsyncDisposable
_drivers[id] = driver;
}
try { await driver.InitializeAsync(driverConfigJson, ct); }
try { await driver.InitializeAsync(driverConfigJson, ct).ConfigureAwait(false); }
catch
{
// Keep the driver registered — operator will see Faulted state and can reinitialize.
@@ -69,7 +69,7 @@ public sealed class DriverHost : IAsyncDisposable
_drivers.Remove(driverInstanceId);
}
try { await driver.ShutdownAsync(ct); }
try { await driver.ShutdownAsync(ct).ConfigureAwait(false); }
catch { /* shutdown is best-effort; logs elsewhere */ }
}
@@ -84,7 +84,7 @@ public sealed class DriverHost : IAsyncDisposable
foreach (var driver in snapshot)
{
try { await driver.ShutdownAsync(CancellationToken.None); } catch { /* ignore */ }
try { await driver.ShutdownAsync(CancellationToken.None).ConfigureAwait(false); } catch { /* ignore */ }
(driver as IDisposable)?.Dispose();
}
}

View File

@@ -15,11 +15,13 @@ namespace ZB.MOM.WW.OtOpcUa.Core.Observability;
/// → /readyz 503 (not yet ready).</item>
/// <item><see cref="DriverState.Healthy"/> → /readyz 200.</item>
/// <item><see cref="DriverState.Degraded"/> → /readyz 200 with flagged driver IDs.</item>
/// <item><see cref="DriverState.Reconnecting"/> → /readyz 200 with flagged driver IDs
/// (driver alive but not serving live data; same verdict as Degraded).</item>
/// <item><see cref="DriverState.Faulted"/> → /readyz 503.</item>
/// </list>
/// The overall verdict is computed across the fleet: any Faulted → Faulted; any
/// Unknown/Initializing → NotReady; any Degraded → Degraded; else Healthy. An empty fleet
/// is Healthy (nothing to degrade).
/// Unknown/Initializing → NotReady; any Degraded or Reconnecting → Degraded; else
/// Healthy. An empty fleet is Healthy (nothing to degrade).
/// </remarks>
public static class DriverHealthReport
{

View File

@@ -39,8 +39,11 @@ public class GenericDriverNodeManager(IDriver driver) : IDisposable
/// If called a second time (e.g. Galaxy redeploy via <c>IRediscoverable.OnRediscoveryNeeded</c>)
/// the previous alarm subscription is torn down and the sink registry is cleared before
/// re-walking, preventing double delivery of alarm transitions.
/// Exception isolation (marking the driver's subtree Faulted) is the caller's responsibility —
/// exceptions from <see cref="ITagDiscovery.DiscoverAsync"/> propagate to the caller.
/// Exception isolation (per decision #12 — marking the driver's subtree Faulted while other
/// drivers stay available) is the caller's responsibility; exceptions from
/// <see cref="ITagDiscovery.DiscoverAsync"/> propagate unhandled to the caller. The Server
/// project's <c>OpcUaApplicationHost.PopulateAddressSpaces</c> wraps this call in a per-driver
/// try/catch that logs + leaves the driver's subtree empty until a Reinitialize succeeds.
/// </summary>
public async Task BuildAddressSpaceAsync(IAddressSpaceBuilder builder, CancellationToken ct)
{

View File

@@ -118,11 +118,15 @@ public sealed class CapabilityInvoker
if (!isIdempotent)
{
var noRetryOptions = _optionsAccessor() with
// Snapshot the options exactly once per call — invoking _optionsAccessor twice can
// (a) observe two different snapshots if an Admin edit lands between them and
// (b) waste an allocation on the per-write hot path (Phase 6.1 1% pipeline budget).
var snapshot = _optionsAccessor();
var noRetryOptions = snapshot with
{
CapabilityPolicies = new Dictionary<DriverCapability, CapabilityPolicy>
{
[DriverCapability.Write] = _optionsAccessor().Resolve(DriverCapability.Write) with { RetryCount = 0 },
[DriverCapability.Write] = snapshot.Resolve(DriverCapability.Write) with { RetryCount = 0 },
},
};
var pipeline = _builder.GetOrCreate(_driverInstanceId, $"{hostName}::non-idempotent", DriverCapability.Write, noRetryOptions);

View File

@@ -42,13 +42,27 @@ public sealed record DriverResilienceOptions
/// Look up the effective policy for a capability, falling back to tier defaults when no
/// override is configured. Never returns null.
/// </summary>
/// <exception cref="KeyNotFoundException">
/// Thrown when neither the override map nor the tier defaults carry an entry for the
/// requested capability. The <c>TierDefaults_Cover_EveryCapability</c> invariant test
/// in <c>DriverResilienceOptionsTests</c> guarantees every defined enum value is present
/// in each tier's table, so this only fires when a caller passes an out-of-range value
/// or someone adds a <see cref="DriverCapability"/> member without updating
/// <see cref="GetTierDefaults"/>. The message names the missing capability and tier.
/// </exception>
public CapabilityPolicy Resolve(DriverCapability capability)
{
// An explicit per-capability override takes precedence over the tier's defaults.
if (CapabilityPolicies.TryGetValue(capability, out var policy))
return policy;
// No override configured: consult the default table for this instance's Tier.
var defaults = GetTierDefaults(Tier);
// NOTE(review): this unconditional indexer return sits directly above its TryGetValue
// replacement and looks like the pre-change line of this hunk shown without a removal
// marker — confirm. If it were live code, everything below it would be unreachable and
// a missing entry would surface as the indexer's generic KeyNotFoundException.
return defaults[capability];
// Probe the tier table rather than indexing it, so a missing entry falls through to
// the descriptive exception below.
if (defaults.TryGetValue(capability, out var fallback))
return fallback;
// Neither the overrides nor the tier defaults carry the capability: fail with a message
// that names both the capability and the tier.
throw new KeyNotFoundException(
$"No policy defined for capability '{capability}' under tier '{Tier}'. " +
$"This indicates a {nameof(DriverCapability)} enum value missing from {nameof(GetTierDefaults)} — " +
"add the capability to every tier's default table.");
}
/// <summary>

View File

@@ -23,7 +23,15 @@ public sealed class WedgeDetector
/// <summary>Wedge-detection threshold; values below 60 s are clamped to 60 s.</summary>
public TimeSpan Threshold { get; }
/// <summary>Whether the driver reported itself <see cref="DriverState.Healthy"/> at construction.</summary>
/// <summary>
/// Construct with the wedge-detection threshold; values below 60 s clamp to 60 s so
/// the detector never fires below the documented floor.
/// </summary>
/// <param name="threshold">
/// Time without a successful unit of work after which a Healthy driver with pending
/// work is considered Faulted. Clamped to a minimum of 60 s per the plan-default of
/// 5 × PublishingInterval.
/// </param>
public WedgeDetector(TimeSpan threshold)
{
Threshold = threshold < TimeSpan.FromSeconds(60) ? TimeSpan.FromSeconds(60) : threshold;