fix(server): resolve Low code-review findings (Server-004,006,008,012,014,015)

- Server-004: pass the role-derived display name to UserIdentity's base
  ctor (the SDK's DisplayName has no public setter) and drop the dead
  Display property; make RoleBasedIdentity internal sealed.
- Server-006: derive a bounded CancellationToken from the SDK's
  OperationContext.OperationDeadline in OnReadValue / OnWriteValue so a
  stalled driver call can no longer pin the request thread.
- Server-008: mark handled slots via CallMethodRequest.Processed = true
  in RouteScriptedAlarmMethodCalls (the SDK skips on Processed, not on a
  Good error slot).
- Server-012: PeerHttpProbeLoop.ProbeAsync stops mutating client.Timeout
  per call; uses a per-request CancellationTokenSource linked to the
  shutdown token instead.
- Server-014: wire SealedBootstrap into Program.cs via AddSealedBootstrap
  + OpcUaServerService so the generation-sealed cache + stale-config flag
  + resilient reader actually run; /healthz now reflects cache-fallback
  state.
- Server-015: replace the stale 'PR 16 / PR 17 minimum-viable scope'
  class summaries on OtOpcUaServer and OpcUaServerOptions with the
  shipped LDAP + anonymous-role + configurable security-profile prose.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Joseph Doherty
2026-05-23 07:24:20 -04:00
parent 2b33b64a58
commit 6134050ceb
14 changed files with 698 additions and 40 deletions

View File

@@ -73,11 +73,20 @@ public sealed class PeerHttpProbeLoop(
{
var url = $"http://{peer.Host}:{peer.DashboardPort}/healthz";
var healthy = false;
// Server-012: bound the request via a linked CTS rather than mutating
// client.Timeout on a factory-vended HttpClient. IHttpClientFactory may pool /
// recycle the underlying handler, so mutating client.Timeout per call races with
// the factory's lifecycle and crosses ownership boundaries. A per-call CTS is the
// canonical way to enforce a per-request deadline.
using var timeoutCts = new CancellationTokenSource(_options.HttpProbeTimeout);
using var linkedCts = CancellationTokenSource.CreateLinkedTokenSource(
cancellationToken, timeoutCts.Token);
try
{
using var client = httpClientFactory.CreateClient(HttpClientName);
client.Timeout = _options.HttpProbeTimeout;
using var response = await client.GetAsync(url, cancellationToken).ConfigureAwait(false);
using var response = await client.GetAsync(url, linkedCts.Token).ConfigureAwait(false);
healthy = response.IsSuccessStatusCode;
}
catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested)
@@ -86,9 +95,9 @@ public sealed class PeerHttpProbeLoop(
}
catch (Exception ex) when (ex is HttpRequestException or TaskCanceledException or OperationCanceledException)
{
// Any transport-level failure counts as unhealthy — connection refused, timeout,
// DNS fail, TLS fail. Swallow + mark unhealthy; don't log every tick, only when
// state transitions.
// Any transport-level failure counts as unhealthy — connection refused, timeout
// (linked CTS expired), DNS fail, TLS fail. Swallow + mark unhealthy; don't log
// every tick, only when state transitions.
healthy = false;
}

View File

@@ -462,6 +462,63 @@ public sealed class DriverNodeManager : CustomNodeManager2, IAddressSpaceBuilder
}
}
/// <summary>
/// Server-006 fallback for the synchronous OnRead/OnWrite hooks when the stack hasn't
/// plumbed an <see cref="IOperationContext.OperationDeadline"/> through. Bounds the
/// <c>.GetAwaiter().GetResult()</c> wait so a stalled driver can't pin a request
/// thread indefinitely (default <c>MaxRequestThreadCount = 100</c>).
/// </summary>
internal static readonly TimeSpan DefaultSynchronousHookTimeout = TimeSpan.FromSeconds(30);
/// <summary>
/// Derives a <see cref="CancellationTokenSource"/> bounded by the OPC UA stack's
/// <see cref="IOperationContext.OperationDeadline"/>, falling back to
/// <paramref name="fallback"/> when no deadline has been plumbed. The synchronous
/// <c>OnReadValue</c> / <c>OnWriteValue</c> stack hooks consume the returned token so
/// a stuck driver call can be abandoned instead of pinning a request thread for the
/// full pipeline timeout (Server-006).
/// </summary>
/// <remarks>
/// The OPC UA stack's <c>SessionSystemContext.OperationContext</c> exposes the
/// per-request deadline computed from the client's <c>TimeoutHint</c> +
/// <see cref="TransportQuotas.OperationTimeout"/>. An expired deadline produces an
/// immediately-cancelled token so the invoker call short-circuits without dispatching
/// to the driver. A <see cref="DateTime.MinValue"/> deadline (the SDK's "not plumbed"
/// sentinel) is treated as missing and falls back to <paramref name="fallback"/>.
/// </remarks>
internal static CancellationTokenSource DeriveOperationCancellation(ISystemContext context, TimeSpan fallback)
{
// Cast: ISystemContext is read-only; SystemContext (the concrete base) exposes
// OperationContext as IOperationContext. Server-side, every ISystemContext the
// stack passes is a SessionSystemContext (subclass of SystemContext), so the cast
// succeeds in practice. The null guard keeps the helper safe for tests / future
// overrides that don't follow that contract.
var opCtx = (context as SystemContext)?.OperationContext;
var deadline = opCtx?.OperationDeadline ?? DateTime.MinValue;
// DateTime.MinValue is the legacy "no deadline plumbed" sentinel; DateTime.MaxValue
// is the SDK's "client didn't supply a TimeoutHint" default (OperationContext's
// initial value when RequestHeader.TimeoutHint == 0). Both must collapse to the
// fallback timeout — otherwise MinValue compares as already-expired and MaxValue
// overflows CancellationTokenSource(TimeSpan)'s Int32.MaxValue-ms ceiling.
if (deadline == DateTime.MinValue || deadline == DateTime.MaxValue)
return new CancellationTokenSource(fallback);
var remaining = deadline - DateTime.UtcNow;
if (remaining <= TimeSpan.Zero)
{
var cts = new CancellationTokenSource();
cts.Cancel();
return cts;
}
// CancellationTokenSource(TimeSpan) caps at ~24.86 days (Int32.MaxValue ms). A
// legitimate per-request deadline far in the future (e.g. a TimeoutHint of hours)
// exceeds that limit; clamp to the fallback so we never throw ArgumentOutOfRangeException
// from inside the read path.
if (remaining.TotalMilliseconds > int.MaxValue) return new CancellationTokenSource(fallback);
return new CancellationTokenSource(remaining);
}
private ServiceResult OnReadValue(ISystemContext context, NodeState node, NumericRange indexRange,
QualifiedName dataEncoding, ref object? value, ref StatusCode statusCode, ref DateTime timestamp)
{
@@ -490,11 +547,15 @@ public sealed class DriverNodeManager : CustomNodeManager2, IAddressSpaceBuilder
}
}
// Server-006: bound the synchronous .GetAwaiter().GetResult() wait by the stack's
// OperationDeadline so a stalled driver can't pin a request thread for the full
// pipeline timeout. The previous CancellationToken.None left every read uncancellable.
using var cts = DeriveOperationCancellation(context, DefaultSynchronousHookTimeout);
var result = _invoker.ExecuteAsync(
DriverCapability.Read,
ResolveHostFor(fullRef),
async ct => (IReadOnlyList<DataValueSnapshot>)await readable.ReadAsync([fullRef], ct).ConfigureAwait(false),
CancellationToken.None).AsTask().GetAwaiter().GetResult();
cts.Token).AsTask().GetAwaiter().GetResult();
if (result.Count == 0)
{
statusCode = StatusCodes.BadNoData;
@@ -505,6 +566,12 @@ public sealed class DriverNodeManager : CustomNodeManager2, IAddressSpaceBuilder
statusCode = snap.StatusCode;
timestamp = snap.ServerTimestampUtc;
}
catch (OperationCanceledException)
{
// The deadline expired or the client cancelled. Surface BadTimeout so the client
// sees the actual outcome (the pre-fix uncancellable path would have hung instead).
statusCode = StatusCodes.BadTimeout;
}
catch (Exception ex)
{
_logger.LogWarning(ex, "OnReadValue failed for {NodeId}", node.NodeId);
@@ -745,11 +812,15 @@ public sealed class DriverNodeManager : CustomNodeManager2, IAddressSpaceBuilder
engine.AddCommentAsync(alarmId, user, comment ?? string.Empty, CancellationToken.None)
.GetAwaiter().GetResult();
// Mark the slot as handled so base.Call skips it. A pre-populated Good
// result (not null and not Bad) is the signal the base class uses to
// skip per-slot dispatch — set StatusCode to Good explicitly.
// Mark the slot as handled so base.Call skips it. Server-008: the SDK's
// CustomNodeManager2.Call (and CallInternalAsync) skip slots whose
// CallMethodRequest.Processed flag is true — the errors[i] value is the
// per-slot status, not the skip signal. Without Processed=true the stack's
// built-in Part 9 Acknowledge/Confirm handler would also fire and the engine
// would observe a double transition.
results[i] = new CallMethodResult { StatusCode = StatusCodes.Good };
errors[i] = ServiceResult.Good;
request.Processed = true;
}
catch (ArgumentException ex)
{
@@ -757,11 +828,15 @@ public sealed class DriverNodeManager : CustomNodeManager2, IAddressSpaceBuilder
// as BadInvalidArgument so the OPC UA client sees a meaningful status.
errors[i] = new ServiceResult(StatusCodes.BadInvalidArgument,
ex.Message, ex.Message);
// The engine rejected the call, but we still routed it — base.Call must
// not re-run the method on the stack's built-in handler. Server-008.
request.Processed = true;
}
catch (Exception ex)
{
errors[i] = new ServiceResult(StatusCodes.BadInternalError,
ex.Message, ex.Message);
request.Processed = true;
}
}
}
@@ -1354,13 +1429,16 @@ public sealed class DriverNodeManager : CustomNodeManager2, IAddressSpaceBuilder
{
var isIdempotent = _writeIdempotentByFullRef.GetValueOrDefault(fullRef!, false);
var capturedValue = value;
// Server-006: same deadline-derived cancellation as OnReadValue so a stalled
// driver write can't pin a request thread for the full pipeline timeout.
using var cts = DeriveOperationCancellation(context, DefaultSynchronousHookTimeout);
var results = _invoker.ExecuteWriteAsync(
ResolveHostFor(fullRef!),
isIdempotent,
async ct => (IReadOnlyList<WriteResult>)await _writable.WriteAsync(
[new DriverWriteRequest(fullRef!, capturedValue)],
ct).ConfigureAwait(false),
CancellationToken.None).AsTask().GetAwaiter().GetResult();
cts.Token).AsTask().GetAwaiter().GetResult();
if (results.Count > 0 && results[0].StatusCode != 0)
{
statusCode = results[0].StatusCode;
@@ -1368,6 +1446,12 @@ public sealed class DriverNodeManager : CustomNodeManager2, IAddressSpaceBuilder
}
return ServiceResult.Good;
}
catch (OperationCanceledException)
{
// Deadline expired or client cancelled — surface BadTimeout instead of the
// generic BadInternalError so the client sees the actual outcome.
return new ServiceResult(StatusCodes.BadTimeout);
}
catch (Exception ex)
{
_logger.LogWarning(ex, "Write failed for {FullRef}", fullRef);

View File

@@ -20,10 +20,15 @@ public enum OpcUaSecurityProfile
/// <summary>
/// OPC UA server endpoint + application-identity configuration. Bound from the
/// <c>OpcUaServer</c> section of <c>appsettings.json</c>. PR 17 minimum-viable scope: no LDAP,
/// no security profiles beyond None — those wire in alongside a future deployment-policy PR
/// that reads from the central config DB instead of appsettings.
/// <c>OpcUaServer</c> section of <c>appsettings.json</c>.
/// </summary>
/// <remarks>
/// Covers the endpoint URL + application identity, the PKI store root + auto-trust toggle,
/// the optional <c>/healthz</c> health-endpoints listener, the configurable transport
/// <see cref="SecurityProfile"/>, the <see cref="Ldap"/> binding for UserName token
/// validation, and the optional <see cref="AnonymousRoles"/> set that grants anonymous
/// sessions a configurable role list. See <c>docs/security.md</c> for the full guide.
/// </remarks>
public sealed class OpcUaServerOptions
{
public const string SectionName = "OpcUaServer";

View File

@@ -15,10 +15,27 @@ namespace ZB.MOM.WW.OtOpcUa.Server.OpcUa;
/// <summary>
/// <see cref="StandardServer"/> subclass that wires one <see cref="DriverNodeManager"/> per
/// registered driver from <see cref="DriverHost"/>. Anonymous endpoint on
/// <c>opc.tcp://0.0.0.0:4840</c>, no security — PR 16 minimum-viable scope; LDAP + security
/// profiles are deferred to their own PR on top of this.
/// registered driver from <see cref="DriverHost"/>. Endpoint URL, transport security
/// profile (<see cref="OpcUaSecurityProfile"/>), and LDAP-backed UserName authentication
/// are all driven by <see cref="OpcUaServerOptions"/>.
/// </summary>
/// <remarks>
/// <para>
/// <see cref="OnImpersonateUser"/> accepts the stack's <c>AnonymousIdentityToken</c>
/// and <c>UserNameIdentityToken</c>. Anonymous sessions either receive the stack's
/// default empty identity (production default) or a configurable role set via
/// <see cref="OpcUaServerOptions.AnonymousRoles"/>. UserName sessions are validated
/// against the injected <see cref="IUserAuthenticator"/> — the production binding is
/// the LDAP authenticator under <see cref="OpcUaServerOptions.Ldap"/>; failed binds
/// throw <see cref="ServiceResultException"/> with
/// <see cref="StatusCodes.BadUserAccessDenied"/>.
/// </para>
/// <para>
/// Authenticated identities are wrapped in <see cref="RoleBasedIdentity"/> which
/// carries the LDAP-resolved roles + groups so <see cref="DriverNodeManager"/>'s
/// server-layer authorization gate can evaluate per-tag ACLs without re-reading LDAP.
/// </para>
/// </remarks>
public sealed class OtOpcUaServer : StandardServer
{
private readonly DriverHost _driverHost;
@@ -184,16 +201,31 @@ public sealed class OtOpcUaServer : StandardServer
/// <see cref="ILdapGroupsBearer"/> (data-plane: <see cref="AuthorizationGate"/> evaluator).
/// Anonymous identity (no roles configured) still uses the stack's default UserIdentity.
/// </summary>
private sealed class RoleBasedIdentity : UserIdentity, IRoleBearer, ILdapGroupsBearer
/// <remarks>
/// Server-004 — the LDAP-resolved display name is stamped onto the base
/// <see cref="UserIdentity.DisplayName"/> property (settable on the stack's
/// <c>UserIdentity</c>) so <c>DriverNodeManager.ResolveCallUser</c> picks it up
/// through the <see cref="IUserIdentity"/> interface. Without that assignment,
/// <c>DisplayName</c> resolves to the username and scripted-alarm
/// Ack/Confirm/Shelve audit entries record the raw username instead of the
/// human-readable LDAP <c>cn</c>.
/// </remarks>
internal sealed class RoleBasedIdentity : UserIdentity, IRoleBearer, ILdapGroupsBearer
{
public IReadOnlyList<string> Roles { get; }
public IReadOnlyList<string> LdapGroups { get; }
public string? Display { get; }
public RoleBasedIdentity(string userName, string? displayName, IReadOnlyList<string> roles, IReadOnlyList<string> ldapGroups)
: base(userName, "")
: base(displayName ?? userName, "")
{
Display = displayName;
// The base UserIdentity(string, string) ctor seeds DisplayName from its first arg
// (the stack's UserIdentity.DisplayName getter is sealed-virtual on this pinned
// SDK version, so it cannot be overridden or re-assigned). Pass the LDAP-resolved
// display name as that first arg so DriverNodeManager.ResolveCallUser — which
// reads IUserIdentity.DisplayName — stamps the human-readable name onto
// scripted-alarm Ack / Confirm / Shelve audit entries. When LDAP returns no
// display name, fall back to the username so every audit row still carries an
// identity (matches the pre-fix observable behaviour).
Roles = roles;
LdapGroups = ldapGroups;
}

View File

@@ -15,7 +15,11 @@ namespace ZB.MOM.WW.OtOpcUa.Server;
/// runs until stopped.
/// </summary>
public sealed class OpcUaServerService(
NodeBootstrap bootstrap,
// Server-014 — consume SealedBootstrap so the Phase 6.1 Stream D resilient-reader +
// GenerationSealedCache + StaleConfigFlag chain actually runs on every boot. The
// legacy NodeBootstrap is still registered for integration tests that construct it
// directly, but the production service uses the generation-sealed path.
SealedBootstrap bootstrap,
DriverHost driverHost,
OpcUaApplicationHost applicationHost,
DriverEquipmentContentRegistry equipmentContentRegistry,

View File

@@ -116,6 +116,12 @@ builder.Services.AddSingleton<ILocalConfigCache>(_ => new LiteDbConfigCache(opti
builder.Services.AddSingleton<DriverHost>();
builder.Services.AddSingleton<NodeBootstrap>();
// Server-014: register the Phase 6.1 Stream D generation-sealed bootstrap chain. Production
// uses SealedBootstrap; NodeBootstrap stays registered for back-compat with the integration
// tests that depend on it directly. StaleConfigFlag is resolved into OpcUaApplicationHost so
// /healthz surfaces the stale-config signal when the cache fallback path serves a request.
builder.Services.AddSealedBootstrap(options);
// Task #248 — driver-instance bootstrap pipeline. DriverFactoryRegistry is the
// type-name → factory map; each driver project's static Register call pre-loads
// its factory so the bootstrapper can materialise DriverInstance rows from the
@@ -217,7 +223,11 @@ builder.Services.AddSingleton<OpcUaApplicationHost>(sp =>
equipmentContentLookup: registry.Get,
historyRouter: sp.GetRequiredService<IHistoryRouter>(),
alarmConditionService: sp.GetRequiredService<AlarmConditionService>(),
configDbHealthy: () => dbHealthCache.IsHealthy);
configDbHealthy: () => dbHealthCache.IsHealthy,
// Server-014: surface StaleConfigFlag into /healthz so a cache-fallback bootstrap
// (central DB unreachable + sealed snapshot served) flips usingStaleConfig=true on
// the next probe. Without this wire-up the flag was inert.
staleConfigFlag: sp.GetRequiredService<ZB.MOM.WW.OtOpcUa.Configuration.LocalCache.StaleConfigFlag>());
});
builder.Services.AddHostedService<OpcUaServerService>();

View File

@@ -12,14 +12,17 @@ namespace ZB.MOM.WW.OtOpcUa.Server;
/// sealed snapshot to fall back to.
/// </summary>
/// <remarks>
/// <para>Alongside the original <see cref="NodeBootstrap"/> (which uses the single-file
/// <see cref="ILocalConfigCache"/>). Program.cs can switch to this one once operators are
/// ready for the generation-sealed semantics. The original stays for backward compat
/// with the three integration tests that construct <see cref="NodeBootstrap"/> directly.</para>
/// <para>Server-014 — registered in DI via <c>ServerWiring.AddSealedBootstrap</c> and
/// consumed by <c>OpcUaServerService</c>. The legacy <see cref="NodeBootstrap"/> stays
/// registered alongside for the three integration tests that construct it directly, but
/// production boots through this wrapper so <see cref="GenerationSealedCache"/> +
/// <see cref="ResilientConfigReader"/> + <see cref="StaleConfigFlag"/> run on every
/// start-up and <c>/healthz</c>'s <c>usingStaleConfig</c> reflects the cache-fallback
/// state.</para>
///
/// <para>Closes release blocker #2 in <c>docs/v2/v2-release-readiness.md</c> — the
/// generation-sealed cache + resilient reader + stale-config flag ship as unit-tested
/// primitives in PR #81 but no production path consumed them until this wrapper.</para>
/// primitives in PR #81; this wrapper is the production consumer that wires them in.</para>
/// </remarks>
public sealed class SealedBootstrap
{

View File

@@ -0,0 +1,57 @@
using Microsoft.Extensions.DependencyInjection;
using ZB.MOM.WW.OtOpcUa.Configuration.LocalCache;
namespace ZB.MOM.WW.OtOpcUa.Server;
/// <summary>
/// DI registration helpers consumed by <c>Program.cs</c>. Extracted so tests can assert
/// the production wire-up actually composes without spinning up the full <c>Host</c>.
/// </summary>
public static class ServerWiring
{
/// <summary>
/// Server-014 — registers the Phase 6.1 Stream D generation-sealed bootstrap chain:
/// <see cref="GenerationSealedCache"/>, <see cref="StaleConfigFlag"/>,
/// <see cref="ResilientConfigReader"/>, and <see cref="SealedBootstrap"/>. Without these
/// registrations <c>OpcUaServerService</c> cannot consume the sealed bootstrap and the
/// <see cref="StaleConfigFlag"/> stays inert — <c>/healthz</c>'s <c>usingStaleConfig</c>
/// never flips on a DB outage with a warm cache.
/// </summary>
/// <remarks>
/// The cache root is sourced from <see cref="NodeOptions.LocalCachePath"/> — same path
/// the legacy <see cref="NodeBootstrap"/> uses for its LiteDB cache, so both bootstrap
/// paths persist alongside each other while the migration completes.
/// </remarks>
public static IServiceCollection AddSealedBootstrap(this IServiceCollection services, NodeOptions options)
{
// Use a sibling directory off LocalCachePath so the LiteDB file and the
// GenerationSealedCache snapshots don't clash. The cache root is a directory;
// LocalCachePath is canonically the LiteDB file path.
var cacheRoot = ResolveCacheRoot(options.LocalCachePath);
// Register NodeOptions only if the caller hasn't already done so — Program.cs
// registers it earlier in its DI chain, but the wiring helper supports standalone
// unit tests that want to compose just the SealedBootstrap chain.
if (!services.Any(d => d.ServiceType == typeof(NodeOptions)))
services.AddSingleton(options);
services.AddSingleton(new GenerationSealedCache(cacheRoot));
services.AddSingleton<StaleConfigFlag>();
services.AddSingleton<ResilientConfigReader>();
services.AddSingleton<SealedBootstrap>();
return services;
}
private static string ResolveCacheRoot(string localCachePath)
{
// LocalCachePath is the LiteDB file (e.g. "config_cache.db"); the sealed cache is a
// directory. Pick a sibling folder so the two don't share a path.
if (string.IsNullOrWhiteSpace(localCachePath))
return Path.Combine(Path.GetTempPath(), "otopcua-sealed-cache");
var dir = Path.GetDirectoryName(localCachePath);
var name = Path.GetFileNameWithoutExtension(localCachePath);
var root = string.IsNullOrEmpty(dir)
? $"{name}.sealed"
: Path.Combine(dir, $"{name}.sealed");
return root;
}
}