fix(server): resolve Low code-review findings (Server-004,006,008,012,014,015)

- Server-004: pass the role-derived display name to UserIdentity's base
  ctor (the SDK's DisplayName has no public setter) and drop the dead
  Display property; make RoleBasedIdentity internal sealed.
- Server-006: derive a bounded CancellationToken from the SDK's
  OperationContext.OperationDeadline in OnReadValue / OnWriteValue so a
  stalled driver call can no longer pin the request thread.
- Server-008: mark handled slots via CallMethodRequest.Processed = true
  in RouteScriptedAlarmMethodCalls (the SDK skips on Processed, not on a
  Good error slot).
- Server-012: PeerHttpProbeLoop.ProbeAsync stops mutating client.Timeout
  per call; uses a per-request CancellationTokenSource linked to the
  shutdown token instead.
- Server-014: wire SealedBootstrap into Program.cs via AddSealedBootstrap
  + OpcUaServerService so the generation-sealed cache + stale-config flag
  + resilient reader actually run; /healthz now reflects cache-fallback
  state.
- Server-015: replace the stale 'PR 16 / PR 17 minimum-viable scope'
  class summaries on OtOpcUaServer and OpcUaServerOptions with the
  shipped LDAP + anonymous-role + configurable security-profile prose.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Joseph Doherty
2026-05-23 07:24:20 -04:00
parent 2b33b64a58
commit 6134050ceb
14 changed files with 698 additions and 40 deletions

View File

@@ -0,0 +1,116 @@
using Opc.Ua;
using Shouldly;
using Xunit;
using ZB.MOM.WW.OtOpcUa.Server.OpcUa;
namespace ZB.MOM.WW.OtOpcUa.Server.Tests;
/// <summary>
/// Regression for Server-006 — synchronous OnReadValue / OnWriteValue stack hooks must
/// derive a <see cref="CancellationToken"/> from the operation deadline so a stalled
/// driver call doesn't pin a request thread for the full pipeline timeout. The shared
/// helper <see cref="DriverNodeManager.DeriveOperationCancellation"/> turns the
/// <see cref="ISystemContext"/>'s <c>OperationDeadline</c> into a linked CTS.
/// </summary>
[Trait("Category", "Unit")]
public sealed class DriverNodeManagerCancellationTests
{
/// <summary>
/// Build a SystemContext bound to the supplied IOperationContext. SystemContext's
/// OperationContext setter is protected, so we use the public <c>Copy</c> method
/// which clones the context onto the supplied operation context.
/// </summary>
private static ISystemContext ContextWithDeadline(DateTime deadline)
=> new SystemContext().Copy(new StubOperationContext(deadline));
[Fact]
public void Future_deadline_produces_uncancelled_token()
{
var ctx = ContextWithDeadline(DateTime.UtcNow.AddSeconds(30));
using var cts = DriverNodeManager.DeriveOperationCancellation(ctx, fallback: TimeSpan.FromSeconds(10));
cts.Token.IsCancellationRequested.ShouldBeFalse();
}
[Fact]
public void Past_deadline_produces_already_cancelled_token()
{
var ctx = ContextWithDeadline(DateTime.UtcNow.AddSeconds(-5));
using var cts = DriverNodeManager.DeriveOperationCancellation(ctx, fallback: TimeSpan.FromSeconds(10));
cts.Token.IsCancellationRequested.ShouldBeTrue(
"an expired OperationDeadline must surface as an immediately-cancelled token so the "
+ "stalled driver call returns without burning a request thread");
}
[Fact]
public void Missing_deadline_uses_fallback_timeout()
{
// No OperationContext attached → no deadline plumbed; helper falls back to the
// supplied timeout so an OnReadValue hook into a stalled driver can't hang the
// request thread indefinitely.
var ctx = new SystemContext();
using var cts = DriverNodeManager.DeriveOperationCancellation(ctx, fallback: TimeSpan.FromMilliseconds(20));
cts.Token.WaitHandle.WaitOne(TimeSpan.FromMilliseconds(500)).ShouldBeTrue(
"fallback timeout must fire so a missing deadline cannot hang the request thread");
cts.Token.IsCancellationRequested.ShouldBeTrue();
}
[Fact]
public void DateTime_MinValue_deadline_uses_fallback_timeout()
{
// IOperationContext.OperationDeadline is `DateTime.MinValue` when the stack hasn't
// plumbed a deadline through. The helper treats that as "no deadline" and falls
// back to the supplied timeout, otherwise an MinValue would surface as
// already-cancelled and short-circuit every read.
var ctx = ContextWithDeadline(DateTime.MinValue);
using var cts = DriverNodeManager.DeriveOperationCancellation(ctx, fallback: TimeSpan.FromMilliseconds(20));
cts.Token.WaitHandle.WaitOne(TimeSpan.FromMilliseconds(500)).ShouldBeTrue();
cts.Token.IsCancellationRequested.ShouldBeTrue();
}
[Fact]
public void DateTime_MaxValue_deadline_uses_fallback_timeout_not_overflow()
{
// OperationContext sets OperationDeadline = DateTime.MaxValue when the client's
// RequestHeader.TimeoutHint is zero (the default). DateTime.MaxValue - UtcNow
// overflows CancellationTokenSource(TimeSpan)'s Int32.MaxValue-ms cap, so the
// helper must collapse it to the fallback — otherwise the read throws
// ArgumentOutOfRangeException from inside DeriveOperationCancellation and surfaces
// as BadInternalError on every read (regression that broke OpcUaServerIntegrationTests).
var ctx = ContextWithDeadline(DateTime.MaxValue);
using var cts = DriverNodeManager.DeriveOperationCancellation(ctx, fallback: TimeSpan.FromSeconds(30));
cts.Token.IsCancellationRequested.ShouldBeFalse("MaxValue deadline + 30 s fallback must produce a live token");
}
[Fact]
public void Null_context_returns_uncancelled_token_with_fallback()
{
// Defensive — OnReadValue receives an ISystemContext from the stack so the helper
// shouldn't NRE if a future override passes through a null context.
using var cts = DriverNodeManager.DeriveOperationCancellation(context: null!, fallback: TimeSpan.FromSeconds(30));
cts.Token.IsCancellationRequested.ShouldBeFalse();
}
/// <summary>Minimal IOperationContext for deadline testing.</summary>
private sealed class StubOperationContext(DateTime deadline) : IOperationContext
{
public DateTime OperationDeadline { get; } = deadline;
public NodeId? SessionId => null;
public IUserIdentity? UserIdentity => null;
public IList<string>? PreferredLocales => null;
public DiagnosticsMasks DiagnosticsMask => DiagnosticsMasks.None;
public StringTable StringTable { get; } = new StringTable();
public StatusCode OperationStatus => StatusCodes.Good;
public string? AuditEntryId => null;
}
}

View File

@@ -115,6 +115,34 @@ public sealed class PeerHttpProbeLoopTests : IDisposable
tracker.Get("B").HttpHealthy.ShouldBeFalse();
}
[Fact]
public async Task Tick_does_not_mutate_factory_vended_client_Timeout()
{
// Server-012: timeouts belong on the named-client registration or a per-request CTS,
// NOT on a factory-vended HttpClient (which IHttpClientFactory may pool/recycle).
// Mutating client.Timeout per tick is at minimum a bad smell and races with
// IHttpClientFactory's lifecycle expectations.
var coordinator = await SeedAndInitializeAsync("A",
("A", RedundancyRole.Primary, "urn:A"),
("B", RedundancyRole.Secondary, "urn:B"));
var tracker = new PeerReachabilityTracker();
var factoryInitialTimeout = TimeSpan.FromMinutes(2);
var factory = new RecordingHttpClientFactory(
_ => new HttpResponseMessage(HttpStatusCode.OK),
factoryInitialTimeout);
var loop = new PeerHttpProbeLoop(coordinator, tracker, factory, NullLogger<PeerHttpProbeLoop>.Instance,
options: new PeerProbeOptions { HttpProbeTimeout = TimeSpan.FromSeconds(3) });
await loop.TickAsync(CancellationToken.None);
factory.LastCreatedClient.ShouldNotBeNull();
factory.LastCreatedClient.Timeout.ShouldBe(factoryInitialTimeout,
"the probe loop must not mutate the factory-vended HttpClient's Timeout — "
+ "per-call timeout should be enforced via a CancellationToken or via "
+ "AddHttpClient.ConfigureHttpClient on the named registration.");
}
// ---- fixture helpers ---------------------------------------------------
private async Task<RedundancyCoordinator> SeedAndInitializeAsync(string selfNodeId, params (string id, RedundancyRole role, string appUri)[] nodes)
@@ -158,4 +186,30 @@ public sealed class PeerHttpProbeLoopTests : IDisposable
=> Task.FromResult(respond(request));
}
}
/// <summary>
/// Server-012 — captures the most-recently-vended <see cref="HttpClient"/> so the
/// test can assert the probe loop didn't mutate its <see cref="HttpClient.Timeout"/>.
/// </summary>
private sealed class RecordingHttpClientFactory(
Func<HttpRequestMessage, HttpResponseMessage> respond,
TimeSpan initialTimeout) : IHttpClientFactory
{
public HttpClient? LastCreatedClient { get; private set; }
public HttpClient CreateClient(string name)
{
var client = new HttpClient(new RecordingHandler(respond), disposeHandler: true)
{
Timeout = initialTimeout,
};
LastCreatedClient = client;
return client;
}
private sealed class RecordingHandler(Func<HttpRequestMessage, HttpResponseMessage> respond) : HttpMessageHandler
{
protected override Task<HttpResponseMessage> SendAsync(HttpRequestMessage request, CancellationToken cancellationToken)
=> Task.FromResult(respond(request));
}
}
}

View File

@@ -0,0 +1,176 @@
using Opc.Ua;
using Serilog;
using Shouldly;
using Xunit;
using ZB.MOM.WW.OtOpcUa.Core.Abstractions;
using ZB.MOM.WW.OtOpcUa.Core.AlarmHistorian;
using ZB.MOM.WW.OtOpcUa.Core.ScriptedAlarms;
using ZB.MOM.WW.OtOpcUa.Core.Scripting;
using ZB.MOM.WW.OtOpcUa.Server.OpcUa;
using ZB.MOM.WW.OtOpcUa.Server.Phase7;
namespace ZB.MOM.WW.OtOpcUa.Server.Tests.Phase7;
/// <summary>
/// Regression for Server-008 — <c>RouteScriptedAlarmMethodCalls</c> must mark a handled
/// <see cref="CallMethodRequest"/> slot as <c>Processed = true</c> so the stack's
/// <c>CustomNodeManager2.Call</c> skips it. The pre-fix code relied on the slot's
/// <c>errors[i]</c> being <c>ServiceResult.Good</c>, but the SDK's actual skip predicate is
/// <see cref="CallMethodRequest.Processed"/>; without setting it, the stack's built-in
/// Part 9 Acknowledge / Confirm handler would also fire, producing a double transition.
/// </summary>
[Trait("Category", "Unit")]
public sealed class ScriptedAlarmMethodRoutingProcessedFlagTests
{
private static ScriptedAlarmEngine BuildActiveEngine(string alarmId)
{
var upstream = new CachedTagUpstreamSource();
var logger = new LoggerConfiguration().CreateLogger();
var factory = new ScriptLoggerFactory(logger);
var engine = new ScriptedAlarmEngine(upstream, new InMemoryAlarmStateStore(), factory, logger);
var defs = new List<ScriptedAlarmDefinition>
{
new(AlarmId: alarmId,
EquipmentPath: "/eq",
AlarmName: alarmId,
Kind: AlarmKind.LimitAlarm,
Severity: AlarmSeverity.Medium,
MessageTemplate: "msg",
PredicateScriptSource: "return true;"),
};
engine.LoadAsync(defs, CancellationToken.None).GetAwaiter().GetResult();
return engine;
}
private static ScriptedAlarmEngine BuildEngine(string alarmId)
{
var upstream = new CachedTagUpstreamSource();
var logger = new LoggerConfiguration().CreateLogger();
var factory = new ScriptLoggerFactory(logger);
var engine = new ScriptedAlarmEngine(upstream, new InMemoryAlarmStateStore(), factory, logger);
var defs = new List<ScriptedAlarmDefinition>
{
new(AlarmId: alarmId,
EquipmentPath: "/eq",
AlarmName: alarmId,
Kind: AlarmKind.LimitAlarm,
Severity: AlarmSeverity.Medium,
MessageTemplate: "msg",
PredicateScriptSource: "return false;"),
};
engine.LoadAsync(defs, CancellationToken.None).GetAwaiter().GetResult();
return engine;
}
private static CallMethodRequest AcknowledgeRequest(string conditionNodeId)
=> new()
{
ObjectId = new NodeId(conditionNodeId, 2),
MethodId = MethodIds.AcknowledgeableConditionType_Acknowledge,
InputArguments =
{
new Variant(new byte[] { 1, 2, 3 }),
new Variant(new LocalizedText("ack-comment")),
},
};
private static CallMethodRequest AddCommentRequest(string conditionNodeId)
=> new()
{
ObjectId = new NodeId(conditionNodeId, 2),
MethodId = MethodIds.ConditionType_AddComment,
InputArguments =
{
new Variant(new byte[] { 1, 2, 3 }),
new Variant(new LocalizedText("comment-text")),
},
};
[Fact]
public void Handled_Acknowledge_marks_Processed_true_so_baseCall_skips_the_slot()
{
using var engine = BuildActiveEngine("al-1");
var calls = new List<CallMethodRequest> { AcknowledgeRequest("al-1.Condition") };
var results = new List<CallMethodResult> { new CallMethodResult() };
var errors = new List<ServiceResult> { null! };
var index = new Dictionary<string, string>(StringComparer.OrdinalIgnoreCase)
{
["al-1.Condition"] = "al-1",
};
DriverNodeManager.RouteScriptedAlarmMethodCalls(
new NamedIdentity("ops-user"), calls, results, errors, engine, index);
calls[0].Processed.ShouldBeTrue(
"CustomNodeManager2.Call/CallInternalAsync skips slots with Processed=true. "
+ "Without this flag, base.Call would re-dispatch the Acknowledge to the stack's "
+ "built-in Part 9 handler and the engine would observe a double transition.");
}
[Fact]
public void Handled_AddComment_marks_Processed_true()
{
using var engine = BuildEngine("al-1");
var calls = new List<CallMethodRequest> { AddCommentRequest("al-1.Condition") };
var results = new List<CallMethodResult> { new CallMethodResult() };
var errors = new List<ServiceResult> { null! };
var index = new Dictionary<string, string>(StringComparer.OrdinalIgnoreCase)
{
["al-1.Condition"] = "al-1",
};
DriverNodeManager.RouteScriptedAlarmMethodCalls(
new NamedIdentity("ops-user"), calls, results, errors, engine, index);
calls[0].Processed.ShouldBeTrue("AddComment handled by the engine must not re-dispatch via base.Call");
}
[Fact]
public void Engine_error_path_also_marks_Processed_so_baseCall_does_not_re_run_the_method()
{
using var engine = BuildEngine("al-1");
var calls = new List<CallMethodRequest>
{
// Index maps to an alarm id the engine doesn't know — engine throws
// ArgumentException, helper sets errors[i] = BadInvalidArgument.
AcknowledgeRequest("al-1.Condition"),
};
var results = new List<CallMethodResult> { new CallMethodResult() };
var errors = new List<ServiceResult> { null! };
var index = new Dictionary<string, string>(StringComparer.OrdinalIgnoreCase)
{
["al-1.Condition"] = "al-NOT-IN-ENGINE",
};
DriverNodeManager.RouteScriptedAlarmMethodCalls(
new NamedIdentity("ops-user"), calls, results, errors, engine, index);
ServiceResult.IsBad(errors[0]).ShouldBeTrue("engine error path");
calls[0].Processed.ShouldBeTrue(
"even when the engine returns Bad, the slot was handled — base.Call must not "
+ "re-dispatch the method against the OPC UA built-in handler.");
}
[Fact]
public void Unhandled_slot_leaves_Processed_false_so_baseCall_drives_it()
{
using var engine = BuildActiveEngine("al-1");
var genericMethod = new CallMethodRequest
{
ObjectId = new NodeId("some-driver-method", 2),
MethodId = new NodeId("driver-method", 2),
};
var calls = new List<CallMethodRequest> { genericMethod };
var results = new List<CallMethodResult> { new CallMethodResult() };
var errors = new List<ServiceResult> { null! };
DriverNodeManager.RouteScriptedAlarmMethodCalls(
new NamedIdentity("ops-user"), calls, results, errors, engine,
conditionIdToAlarmId: new Dictionary<string, string>());
calls[0].Processed.ShouldBeFalse("non-alarm methods must fall through to base.Call");
errors[0].ShouldBeNull("unhandled slot's error must stay null for the base implementation");
}
private sealed class NamedIdentity(string displayName) : UserIdentity(displayName, "") { }
}

View File

@@ -0,0 +1,56 @@
using Opc.Ua;
using Shouldly;
using Xunit;
using ZB.MOM.WW.OtOpcUa.Server.OpcUa;
namespace ZB.MOM.WW.OtOpcUa.Server.Tests;
/// <summary>
/// Regression for Server-004 — the production
/// <see cref="OtOpcUaServer.RoleBasedIdentity"/> must surface the LDAP-resolved display
/// name through <see cref="IUserIdentity.DisplayName"/>, since
/// <c>DriverNodeManager.ResolveCallUser</c> reads the base interface property when stamping
/// audit identities on scripted-alarm Acknowledge / Confirm / Shelve calls.
/// </summary>
[Trait("Category", "Unit")]
public sealed class RoleBasedIdentityTests
{
[Fact]
public void DisplayName_returns_LDAP_resolved_display_name_when_present()
{
IUserIdentity identity = new OtOpcUaServer.RoleBasedIdentity(
userName: "alice",
displayName: "Alice Smith",
roles: new[] { "WriteOperate" },
ldapGroups: new[] { "ot_operators" });
identity.DisplayName.ShouldBe("Alice Smith",
"DriverNodeManager.ResolveCallUser reads IUserIdentity.DisplayName for audit entries; "
+ "RoleBasedIdentity must surface the LDAP-resolved name, not just the username.");
}
[Fact]
public void DisplayName_falls_back_to_userName_when_LDAP_display_name_is_null()
{
IUserIdentity identity = new OtOpcUaServer.RoleBasedIdentity(
userName: "alice",
displayName: null,
roles: [],
ldapGroups: []);
identity.DisplayName.ShouldBe("alice",
"absent an LDAP display name, audit entries should still carry the username.");
}
[Fact]
public void ResolveCallUser_yields_LDAP_resolved_display_name()
{
IUserIdentity identity = new OtOpcUaServer.RoleBasedIdentity(
userName: "alice",
displayName: "Alice Smith",
roles: [],
ldapGroups: []);
DriverNodeManager.ResolveCallUser(identity).ShouldBe("Alice Smith");
}
}

View File

@@ -0,0 +1,52 @@
using Microsoft.Extensions.DependencyInjection;
using Microsoft.Extensions.Logging.Abstractions;
using Shouldly;
using Xunit;
using ZB.MOM.WW.OtOpcUa.Configuration.LocalCache;
namespace ZB.MOM.WW.OtOpcUa.Server.Tests;
/// <summary>
/// Regression for Server-014 — <see cref="SealedBootstrap"/> exists in the source tree and
/// is referenced by <c>docs/v2/v2-release-readiness.md</c> as the closed release blocker for
/// generation-sealed config plumbing, but it was never registered in the production DI
/// container. The release blocker remained de-facto open. This test asserts the DI
/// registrations (which <c>Program.cs</c> performs at startup) actually compose: every
/// dependency <see cref="SealedBootstrap"/> needs — <see cref="GenerationSealedCache"/>,
/// <see cref="ResilientConfigReader"/>, <see cref="StaleConfigFlag"/> — must be resolvable
/// so the production wire-up doesn't fail with a missing-service exception at startup.
/// </summary>
[Trait("Category", "Unit")]
public sealed class SealedBootstrapWiringTests
{
[Fact]
public void SealedBootstrap_and_its_dependencies_are_registered_in_DI()
{
var tempRoot = Path.Combine(Path.GetTempPath(), $"otopcua-sealed-bootstrap-wiring-{Guid.NewGuid():N}");
try
{
// Mirror Program.cs's registrations of NodeOptions + the SealedBootstrap chain.
var services = new ServiceCollection();
ZB.MOM.WW.OtOpcUa.Server.ServerWiring.AddSealedBootstrap(services, new NodeOptions
{
NodeId = "test-node",
ClusterId = "test-cluster",
ConfigDbConnectionString = "Server=fake;Database=fake;Integrated Security=true;",
LocalCachePath = tempRoot,
});
services.AddSingleton(NullLoggerFactory.Instance);
services.AddLogging();
using var sp = services.BuildServiceProvider();
sp.GetRequiredService<GenerationSealedCache>().ShouldNotBeNull();
sp.GetRequiredService<ResilientConfigReader>().ShouldNotBeNull();
sp.GetRequiredService<StaleConfigFlag>().ShouldNotBeNull();
sp.GetRequiredService<SealedBootstrap>().ShouldNotBeNull();
}
finally
{
try { if (Directory.Exists(tempRoot)) Directory.Delete(tempRoot, recursive: true); } catch { }
}
}
}