fix(core): resolve Low code-review findings (Core-004,008,009,010,011,012)
- Core-004: add ConfigureAwait(false) to DriverHost.RegisterAsync / UnregisterAsync / DisposeAsync. - Core-008: rewrite the BuildAddressSpaceAsync XML doc to correctly name the caller (OpcUaApplicationHost.PopulateAddressSpaces) that owns the per-driver isolation. - Core-009: snapshot DriverResilienceOptions once per non-idempotent write in CapabilityInvoker.ExecuteWriteAsync. - Core-010: switch DriverResilienceOptions.Resolve to TryGetValue with a diagnostic error message when a tier table is missing a capability. - Core-011: add an optional diagnostic callback to PermissionTrieBuilder so production callers can surface scope-path mismatches. - Core-012: correct the stale WedgeDetector ctor summary and add the Reconnecting row to DriverHealthReport's state matrix. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -26,11 +26,27 @@ public static class PermissionTrieBuilder
|
||||
/// Build a trie for one cluster/generation from the supplied rows. The caller is
|
||||
/// responsible for pre-filtering rows to the target generation + cluster.
|
||||
/// </summary>
|
||||
/// <param name="clusterId">Cluster the trie is being built for; rows for other clusters are skipped.</param>
|
||||
/// <param name="generationId">Config-generation the rows belong to; stamped on the returned trie.</param>
|
||||
/// <param name="rows">ACL rows for this cluster + generation.</param>
|
||||
/// <param name="scopePaths">
|
||||
/// Optional <c>ScopeId</c> → multi-level trie-path lookup. When supplied, sub-cluster rows
|
||||
/// descend to their structurally-correct trie node. When null, sub-cluster rows fall back
|
||||
/// to a direct child of the trie root keyed on <c>ScopeId</c> — deterministic-test mode.
|
||||
/// </param>
|
||||
/// <param name="diagnostic">
|
||||
/// Optional callback invoked when a sub-cluster row's <c>ScopeId</c> cannot be located
|
||||
/// in <paramref name="scopePaths"/>. Production callers should wire a logger here so
|
||||
/// orphaned grants surface — silently dropping them under the wrong trie level was the
|
||||
/// Core-011 production hazard. The callback fires only when <paramref name="scopePaths"/>
|
||||
/// is non-null (a null lookup is the explicit deterministic-test fallback mode).
|
||||
/// </param>
|
||||
public static PermissionTrie Build(
|
||||
string clusterId,
|
||||
long generationId,
|
||||
IReadOnlyList<NodeAcl> rows,
|
||||
IReadOnlyDictionary<string, NodeAclPath>? scopePaths = null)
|
||||
IReadOnlyDictionary<string, NodeAclPath>? scopePaths = null,
|
||||
Action<PermissionTrieBuildDiagnostic>? diagnostic = null)
|
||||
{
|
||||
ArgumentException.ThrowIfNullOrWhiteSpace(clusterId);
|
||||
ArgumentNullException.ThrowIfNull(rows);
|
||||
@@ -45,7 +61,7 @@ public static class PermissionTrieBuilder
|
||||
var node = row.ScopeKind switch
|
||||
{
|
||||
NodeAclScopeKind.Cluster => trie.Root,
|
||||
_ => Descend(trie.Root, row, scopePaths),
|
||||
_ => Descend(trie.Root, row, scopePaths, diagnostic),
|
||||
};
|
||||
|
||||
if (node is not null)
|
||||
@@ -55,16 +71,30 @@ public static class PermissionTrieBuilder
|
||||
return trie;
|
||||
}
|
||||
|
||||
private static PermissionTrieNode? Descend(PermissionTrieNode root, NodeAcl row, IReadOnlyDictionary<string, NodeAclPath>? scopePaths)
|
||||
private static PermissionTrieNode? Descend(
|
||||
PermissionTrieNode root,
|
||||
NodeAcl row,
|
||||
IReadOnlyDictionary<string, NodeAclPath>? scopePaths,
|
||||
Action<PermissionTrieBuildDiagnostic>? diagnostic)
|
||||
{
|
||||
if (string.IsNullOrEmpty(row.ScopeId)) return null;
|
||||
|
||||
// For sub-cluster scopes the caller supplies a path lookup so we know the containing
|
||||
// namespace / UnsArea / UnsLine ids. Without a path lookup we fall back to putting the
|
||||
// row directly under the root using its ScopeId — works for deterministic tests, not
|
||||
// for production where the hierarchy must be honored.
|
||||
// for production where the hierarchy must be honored. If a scopePaths lookup IS
|
||||
// provided but is missing the row's ScopeId, surface a diagnostic so the caller can
|
||||
// log the orphan instead of silently dropping the grant under an unreachable node.
|
||||
if (scopePaths is null || !scopePaths.TryGetValue(row.ScopeId, out var path))
|
||||
{
|
||||
if (scopePaths is not null)
|
||||
{
|
||||
diagnostic?.Invoke(new PermissionTrieBuildDiagnostic(
|
||||
NodeAclId: row.NodeAclId,
|
||||
ScopeKind: row.ScopeKind,
|
||||
ScopeId: row.ScopeId,
|
||||
Reason: PermissionTrieBuildDiagnosticReason.MissingScopePath));
|
||||
}
|
||||
return EnsureChild(root, row.ScopeId);
|
||||
}
|
||||
|
||||
@@ -95,3 +125,30 @@ public static class PermissionTrieBuilder
|
||||
/// applicable; or (for SystemPlatform kind) NamespaceId / FolderSegment / .../TagId.
|
||||
/// </param>
|
||||
public sealed record NodeAclPath(IReadOnlyList<string> Segments);
|
||||
|
||||
/// <summary>
/// Describes a single ACL row that <see cref="PermissionTrieBuilder.Build"/> was unable to
/// attach at its structurally-correct trie node. Intended to be logged by production callers
/// so an orphaned grant is visible rather than silently parked under an unreachable node
/// (Core-011).
/// </summary>
/// <param name="NodeAclId">Logical id of the row that triggered the diagnostic.</param>
/// <param name="ScopeKind">The <see cref="NodeAclScopeKind"/> carried by that row.</param>
/// <param name="ScopeId">The <c>ScopeId</c> value the builder failed to locate.</param>
/// <param name="Reason">Classifies why the diagnostic was raised.</param>
public sealed record PermissionTrieBuildDiagnostic(
    string NodeAclId, NodeAclScopeKind ScopeKind, string ScopeId, PermissionTrieBuildDiagnosticReason Reason);
|
||||
|
||||
/// <summary>
/// Enumerates the causes for which a <see cref="PermissionTrieBuildDiagnostic"/> is raised.
/// </summary>
public enum PermissionTrieBuildDiagnosticReason
{
    /// <summary>
    /// The supplied <c>scopePaths</c> lookup had no entry for the row's <c>ScopeId</c>.
    /// The grant ends up as a direct child of the trie root, keyed on <c>ScopeId</c> — a
    /// location the production trie walker cannot reach for multi-level scopes.
    /// </summary>
    MissingScopePath = 0,
}
|
||||
|
||||
@@ -52,7 +52,7 @@ public sealed class DriverHost : IAsyncDisposable
|
||||
_drivers[id] = driver;
|
||||
}
|
||||
|
||||
try { await driver.InitializeAsync(driverConfigJson, ct); }
|
||||
try { await driver.InitializeAsync(driverConfigJson, ct).ConfigureAwait(false); }
|
||||
catch
|
||||
{
|
||||
// Keep the driver registered — operator will see Faulted state and can reinitialize.
|
||||
@@ -69,7 +69,7 @@ public sealed class DriverHost : IAsyncDisposable
|
||||
_drivers.Remove(driverInstanceId);
|
||||
}
|
||||
|
||||
try { await driver.ShutdownAsync(ct); }
|
||||
try { await driver.ShutdownAsync(ct).ConfigureAwait(false); }
|
||||
catch { /* shutdown is best-effort; logs elsewhere */ }
|
||||
}
|
||||
|
||||
@@ -84,7 +84,7 @@ public sealed class DriverHost : IAsyncDisposable
|
||||
|
||||
foreach (var driver in snapshot)
|
||||
{
|
||||
try { await driver.ShutdownAsync(CancellationToken.None); } catch { /* ignore */ }
|
||||
try { await driver.ShutdownAsync(CancellationToken.None).ConfigureAwait(false); } catch { /* ignore */ }
|
||||
(driver as IDisposable)?.Dispose();
|
||||
}
|
||||
}
|
||||
|
||||
@@ -15,11 +15,13 @@ namespace ZB.MOM.WW.OtOpcUa.Core.Observability;
|
||||
/// → /readyz 503 (not yet ready).</item>
|
||||
/// <item><see cref="DriverState.Healthy"/> → /readyz 200.</item>
|
||||
/// <item><see cref="DriverState.Degraded"/> → /readyz 200 with flagged driver IDs.</item>
|
||||
/// <item><see cref="DriverState.Reconnecting"/> → /readyz 200 with flagged driver IDs
|
||||
/// (driver alive but not serving live data; same verdict as Degraded).</item>
|
||||
/// <item><see cref="DriverState.Faulted"/> → /readyz 503.</item>
|
||||
/// </list>
|
||||
/// The overall verdict is computed across the fleet: any Faulted → Faulted; any
|
||||
/// Unknown/Initializing → NotReady; any Degraded → Degraded; else Healthy. An empty fleet
|
||||
/// is Healthy (nothing to degrade).
|
||||
/// Unknown/Initializing → NotReady; any Degraded or Reconnecting → Degraded; else
|
||||
/// Healthy. An empty fleet is Healthy (nothing to degrade).
|
||||
/// </remarks>
|
||||
public static class DriverHealthReport
|
||||
{
|
||||
|
||||
@@ -39,8 +39,11 @@ public class GenericDriverNodeManager(IDriver driver) : IDisposable
|
||||
/// If called a second time (e.g. Galaxy redeploy via <c>IRediscoverable.OnRediscoveryNeeded</c>)
|
||||
/// the previous alarm subscription is torn down and the sink registry is cleared before
|
||||
/// re-walking, preventing double delivery of alarm transitions.
|
||||
/// Exception isolation (marking the driver's subtree Faulted) is the caller's responsibility —
|
||||
/// exceptions from <see cref="ITagDiscovery.DiscoverAsync"/> propagate to the caller.
|
||||
/// Exception isolation (per decision #12 — marking the driver's subtree Faulted while other
|
||||
/// drivers stay available) is the caller's responsibility; exceptions from
|
||||
/// <see cref="ITagDiscovery.DiscoverAsync"/> propagate unhandled to the caller. The Server
|
||||
/// project's <c>OpcUaApplicationHost.PopulateAddressSpaces</c> wraps this call in a per-driver
|
||||
/// try/catch that logs + leaves the driver's subtree empty until a Reinitialize succeeds.
|
||||
/// </summary>
|
||||
public async Task BuildAddressSpaceAsync(IAddressSpaceBuilder builder, CancellationToken ct)
|
||||
{
|
||||
|
||||
@@ -118,11 +118,15 @@ public sealed class CapabilityInvoker
|
||||
|
||||
if (!isIdempotent)
|
||||
{
|
||||
var noRetryOptions = _optionsAccessor() with
|
||||
// Snapshot the options exactly once per call — invoking _optionsAccessor twice can
|
||||
// (a) observe two different snapshots if an Admin edit lands between them and
|
||||
// (b) wastes an allocation on the per-write hot path (Phase 6.1 1% pipeline budget).
|
||||
var snapshot = _optionsAccessor();
|
||||
var noRetryOptions = snapshot with
|
||||
{
|
||||
CapabilityPolicies = new Dictionary<DriverCapability, CapabilityPolicy>
|
||||
{
|
||||
[DriverCapability.Write] = _optionsAccessor().Resolve(DriverCapability.Write) with { RetryCount = 0 },
|
||||
[DriverCapability.Write] = snapshot.Resolve(DriverCapability.Write) with { RetryCount = 0 },
|
||||
},
|
||||
};
|
||||
var pipeline = _builder.GetOrCreate(_driverInstanceId, $"{hostName}::non-idempotent", DriverCapability.Write, noRetryOptions);
|
||||
|
||||
@@ -42,13 +42,27 @@ public sealed record DriverResilienceOptions
|
||||
/// Look up the effective policy for a capability, falling back to tier defaults when no
|
||||
/// override is configured. Never returns null.
|
||||
/// </summary>
|
||||
/// <exception cref="KeyNotFoundException">
|
||||
/// Thrown when neither the override map nor the tier defaults carry an entry for the
|
||||
/// requested capability. The <c>TierDefaults_Cover_EveryCapability</c> invariant test
|
||||
/// in <c>DriverResilienceOptionsTests</c> guarantees every defined enum value is present
|
||||
/// in each tier's table, so this only fires when a caller passes an out-of-range value
|
||||
/// or someone adds a <see cref="DriverCapability"/> member without updating
|
||||
/// <see cref="GetTierDefaults"/>. The message names the missing capability and tier.
|
||||
/// </exception>
|
||||
public CapabilityPolicy Resolve(DriverCapability capability)
{
    // An explicit per-capability override always wins over the tier table.
    if (CapabilityPolicies.TryGetValue(capability, out var policy))
        return policy;

    // Fall back to the tier defaults. TryGetValue is used instead of the indexer so that a
    // missing entry reaches the descriptive exception below rather than surfacing the
    // dictionary's generic "key not present" error, which names neither capability nor tier.
    var defaults = GetTierDefaults(Tier);
    if (defaults.TryGetValue(capability, out var fallback))
        return fallback;

    // Same exception type the indexer would have thrown, so existing callers that catch
    // KeyNotFoundException keep working; only the message is more specific.
    throw new KeyNotFoundException(
        $"No policy defined for capability '{capability}' under tier '{Tier}'. " +
        $"This indicates a {nameof(DriverCapability)} enum value missing from {nameof(GetTierDefaults)} — " +
        "add the capability to every tier's default table.");
}
|
||||
|
||||
/// <summary>
|
||||
|
||||
@@ -23,7 +23,15 @@ public sealed class WedgeDetector
|
||||
/// <summary>Wedge-detection threshold; values below 60 s are clamped to 60 s.</summary>
|
||||
public TimeSpan Threshold { get; }
|
||||
|
||||
/// <summary>Whether the driver reported itself <see cref="DriverState.Healthy"/> at construction.</summary>
|
||||
/// <summary>
|
||||
/// Construct with the wedge-detection threshold; values below 60 s clamp to 60 s so
|
||||
/// the detector never fires below the documented floor.
|
||||
/// </summary>
|
||||
/// <param name="threshold">
|
||||
/// Time without a successful unit of work after which a Healthy driver with pending
|
||||
/// work is considered Faulted. Clamped to a minimum of 60 s per the plan-default of
|
||||
/// 5 × PublishingInterval.
|
||||
/// </param>
|
||||
public WedgeDetector(TimeSpan threshold)
|
||||
{
|
||||
Threshold = threshold < TimeSpan.FromSeconds(60) ? TimeSpan.FromSeconds(60) : threshold;
|
||||
|
||||
Reference in New Issue
Block a user