From a0203503a7b7d473afe3c2d206366ee1e7827dcd Mon Sep 17 00:00:00 2001 From: Joseph Doherty Date: Wed, 20 May 2026 09:46:47 -0400 Subject: [PATCH] Code-review 2026-05-20 sweep: re-review at 1cd51bb, resolve 72 findings across all 11 modules Re-reviewed every module/client against the 10-category checklist (REVIEW-PROCESS.md) at commit 1cd51bb, filed 72 new findings, and fixed them in three priority waves (3 High, 17 Medium, 52 Low). Highs - Server-017: enumerate AcknowledgeAlarm / QueryActiveAlarms in GatewayGrpcScopeResolver so non-admin keys can use them; document the mapping in docs/Authorization.md; add interceptor tests. - Client.Java-013: add the five missing bulk-method stubs to the CLI FakeSession so the test module compiles on a clean tree. - Client.Rust-013: fix the clippy::doc_lazy_continuation regression in generated tonic code by reformatting the ReadBulkCommand proto comment and scoping a #![allow(...)] to the generated submodules. Mediums (highlights) - Server: unify GatewaySession state-lock discipline (-015) and make DisposeAsync race-safe against in-flight CloseAsync (-016); add constraint-enforcement test coverage for the bulk-plan path (-021). - Worker: introduce StaRuntimeShutdownException so RunAlarmPollLoop can distinguish graceful shutdown from a real STA-affinity violation (-016); have the watchdog skip StaHung while CurrentCommandCorrelationId is non-empty so a legitimate slow ReadBulk no longer self-faults (-017). - Tests: add per-method round-trip + cancellation coverage for the 11 GatewaySession bulk methods (-013); replace the real TCP probe in GalaxyHierarchyCacheTests with an IGalaxyRepository fake (-016). - IntegrationTests: drive the StreamEvents writer in the live Write test and assert OnWriteComplete (-012); add live tests for Unadvise/RemoveItem/Unregister ordering, WriteSecured, and abnormal worker exit (-014). 
- Worker.Tests: replace MxAccessSession reflection with an internal CreateForTesting factory (-016); cover WorkerCancel and unexpected-body envelope branches (-017). - Client.Java: cancel MxEventStream when close() races beforeStart() (-014); return a CancellingCompletableFuture that actually forwards cancellation through .thenApply chains (-015). - Client.Python: drop the silent localhost-plaintext downgrade in the CLI; require explicit --plaintext (-013). - Client.Rust: stop bench-read-bulk from polluting success-latency histograms with failed-call durations (-015); add coverage for the five MalformedReply paths, the bulk-write helpers, the Error::Unavailable mapping, and the unary-fault path (-016). - Contracts: extend docs/Contracts.md with the bulk read/write command family (-009). Lows (highlights) - Server: cap GalaxyGlobMatcher.RegexCache; align WorkerAlarmRpcDispatcher missing-session handling; drop the duplicate dashboard @page routes; refresh IAlarmRpcDispatcher XML doc. - Worker: surface SetXmlAlarmQuery COM failures; remove dead subscriptionExpression / ExecutingCommand arms; preserve factory-supplied runtime sessions; split MxAlarmSnapshot.cs into three files. - Tests: dispose the WebApplication in seven test classes; rebuild FakeWorkerProcess.WaitForExitAsync against a real TaskCompletionSource; switch the heartbeat-expires test to ManualTimeProvider; add InvariantCulture to the remaining DateTimeOffset.Parse sites; document GalaxyFilterInputSafetyTests in GatewayTesting.md. - IntegrationTests: comment fixes, RecordingServerStreamWriter IDisposable, class-level [Trait], single-source ZB default connection string. - Worker.Tests: replace silent-return gating with LiveMxAccessFact so absent env vars SKIP not pass; PascalCase rename of probe [Fact]s; deterministic deadline test; new frame-protocol error tests; ComputeTransitions diff-coverage; relocate dev-rig probes to Probes/. 
- Contracts: add round-trip coverage and per-field redaction / Galaxy-identifier comments to the protos. - Client.Dotnet: introduce clients/dotnet/Directory.Build.props so TreatWarningsAsErrors / analysers apply; document DiscoverHierarchyOptions and IMxGatewayCliClient; require typed bulk-read handles in CLI; surface AcknowledgeAlarm transport faults through Translate(). - Client.Go: kill dead code in alarms_test / fakeGalaxyServer / runWriteBulkVariant; document the six new subcommands in writeUsage; drain galaxy-watch events on limit; switch io.EOF comparisons to errors.Is. - Client.Java: shared shutdown helpers + new shutdownTimeout option; regex-based credential redaction; Long.toUnsignedString for uint64 sequence; doc fixes. - Client.Python: combine duplicate imports; add coverage for _percentile / bench-read-bulk / MAX_AGGREGATE_EVENTS / _api_key_from_env; populate pyproject metadata and ship py.typed. - Client.Rust: expose next_correlation_id() so CLI ping/close stop hard-coding correlation IDs; resync RustClientDesign.md with the current Session / Error surface and CLI subcommand set. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- clients/dotnet/Directory.Build.props | 17 + .../IMxGatewayCliClient.cs | 6 + .../MxGatewayClientCli.cs | 54 +- .../FakeGatewayTransport.cs | 2 +- .../MxGatewayClientAlarmsTests.cs | 38 +- .../DiscoverHierarchyOptions.cs | 43 + .../GalaxyRepositoryClient.cs | 18 +- clients/go/cmd/mxgw-go/main.go | 35 +- clients/go/mxgateway/alarms_test.go | 4 - clients/go/mxgateway/galaxy.go | 3 +- clients/go/mxgateway/galaxy_test.go | 24 +- clients/go/mxgateway/session.go | 2 +- clients/java/README.md | 27 +- .../mxgateway/cli/MxGatewayCli.java | 7 +- .../mxgateway/cli/MxGatewayCliTests.java | 50 ++ .../client/GalaxyRepositoryClient.java | 128 +-- .../mxgateway/client/MxEventStream.java | 28 +- .../mxgateway/client/MxGatewayChannels.java | 148 +++- .../mxgateway/client/MxGatewayClient.java | 76 +- .../client/MxGatewayClientOptions.java | 32 + .../mxgateway/client/MxGatewaySecrets.java | 34 +- .../client/MxGatewayFixtureTests.java | 31 + .../client/MxGatewayLowFindingsIITests.java | 182 +++++ .../client/MxGatewayMediumFindingsTests.java | 135 +++- .../v1/GalaxyRepositoryOuterClass.java | 150 ++++ .../mxaccess_gateway/v1/MxaccessGateway.java | 154 ++++ clients/python/README.md | 6 + clients/python/pyproject.toml | 33 + clients/python/src/mxgateway/py.typed | 0 clients/python/src/mxgateway_cli/commands.py | 46 +- clients/python/tests/test_cli.py | 152 +++- .../tests/test_cli_bench_and_helpers.py | 454 +++++++++++ clients/rust/RustClientDesign.md | 66 +- clients/rust/crates/mxgw-cli/src/main.rs | 250 ++++-- clients/rust/src/generated.rs | 3 + clients/rust/src/session.rs | 12 +- clients/rust/tests/client_behavior.rs | 424 +++++++++- code-reviews/Client.Dotnet/findings.md | 126 ++- code-reviews/Client.Go/findings.md | 127 ++- code-reviews/Client.Java/findings.md | 148 +++- code-reviews/Client.Python/findings.md | 283 ++++++- code-reviews/Client.Rust/findings.md | 129 ++- code-reviews/Contracts/findings.md | 99 ++- 
code-reviews/IntegrationTests/findings.md | 114 ++- code-reviews/README.md | 94 ++- code-reviews/Server/findings.md | 148 +++- code-reviews/Tests/findings.md | 127 ++- code-reviews/Worker.Tests/findings.md | 156 +++- code-reviews/Worker/findings.md | 133 ++- docs/Authorization.md | 14 +- docs/Contracts.md | 42 + docs/GatewayTesting.md | 62 +- docs/MxAccessWorkerInstanceDesign.md | 22 +- docs/Sessions.md | 11 +- .../Generated/GalaxyRepository.cs | 22 + .../Generated/MxaccessGateway.cs | 10 + .../Protos/galaxy_repository.proto | 14 + .../Protos/mxaccess_gateway.proto | 6 + .../DashboardLdapLiveTests.cs | 6 +- .../Galaxy/GalaxyRepositoryLiveTests.cs | 5 +- .../LiveGalaxyRepositoryFactAttribute.cs | 10 +- .../WorkerLiveMxAccessSmokeTests.cs | 657 ++++++++++++++- .../Components/Pages/ApiKeysPage.razor | 1 - .../Components/Pages/DashboardHome.razor | 1 - .../Components/Pages/EventsPage.razor | 1 - .../Components/Pages/GalaxyPage.razor | 1 - .../Components/Pages/SessionDetailsPage.razor | 1 - .../Components/Pages/SessionsPage.razor | 1 - .../Components/Pages/SettingsPage.razor | 1 - .../Components/Pages/WorkersPage.razor | 1 - .../Galaxy/GalaxyGlobMatcher.cs | 75 +- .../Galaxy/GalaxyHierarchyCache.cs | 4 +- .../Galaxy/GalaxyRepository.cs | 2 +- .../Galaxy/GalaxyRepositoryOptions.cs | 11 +- ...xyRepositoryServiceCollectionExtensions.cs | 1 + .../Galaxy/IGalaxyRepository.cs | 30 + .../Authorization/GatewayGrpcScopeResolver.cs | 2 + .../Sessions/GatewaySession.cs | 108 ++- .../Sessions/IAlarmRpcDispatcher.cs | 18 +- .../Sessions/WorkerAlarmRpcDispatcher.cs | 9 +- .../Contracts/GatewayContractInfoTests.cs | 13 +- .../ProtobufContractRoundTripTests.cs | 459 +++++++++++ .../Galaxy/GalaxyFilterInputSafetyTests.cs | 23 + .../Galaxy/GalaxyHierarchyCacheTests.cs | 59 +- .../Dashboard/DashboardCookieOptionsTests.cs | 4 +- .../DashboardSnapshotServiceTests.cs | 33 +- .../Gateway/GatewayApplicationTests.cs | 56 +- .../GatewayEndToEndFakeWorkerSmokeTests.cs | 17 +- 
.../MxAccessGatewayServiceConstraintTests.cs | 757 ++++++++++++++++++ .../Grpc/MxAccessGatewayServiceTests.cs | 5 +- .../Gateway/Sessions/GatewaySessionTests.cs | 247 ++++++ .../Sessions/SessionManagerBulkTests.cs | 711 ++++++++++++++++ .../Gateway/Sessions/SessionManagerTests.cs | 2 +- .../Sessions/WorkerAlarmRpcDispatcherTests.cs | 40 +- .../Gateway/Workers/WorkerClientTests.cs | 14 +- ...atewayGrpcAuthorizationInterceptorTests.cs | 88 ++ .../GatewayGrpcScopeResolverTests.cs | 2 + .../PredicateConstraintEnforcer.cs | 89 ++ .../Ipc/WorkerFrameProtocolTests.cs | 103 +++ .../Ipc/WorkerPipeSessionTests.cs | 175 +++- .../MxAccess/AlarmCommandExecutorTests.cs | 77 +- .../MxAccess/AlarmCommandHandlerTests.cs | 16 +- .../MxAccess/MxAccessLiveComCreationTests.cs | 47 +- .../MxAccess/MxAccessStaSessionTests.cs | 49 ++ .../MxAccess/MxAccessValueCacheTests.cs | 31 +- .../MxAccess/WnWrapAlarmConsumerXmlTests.cs | 168 ++++ .../{ => Probes}/AlarmClientWmProbeTests.cs | 2 +- .../{ => Probes}/AlarmsLiveSmokeTests.cs | 2 +- .../{ => Probes}/WnWrapConsumerProbeTests.cs | 2 +- .../Sta/StaRuntimeTests.cs | 13 +- .../TestSupport/FakeRuntimeSession.cs | 32 +- .../TestSupport/LiveMxAccessFactAttribute.cs | 36 + .../TestSupport/NoopMxAccessServer.cs | 92 +++ src/MxGateway.Worker/Ipc/WorkerPipeSession.cs | 59 +- .../MxAccess/MxAccessSession.cs | 29 + .../MxAccess/MxAccessStaSession.cs | 24 +- .../MxAccess/MxAlarmSnapshot.cs | 33 - .../MxAccess/MxAlarmStateKind.cs | 17 + .../MxAccess/MxAlarmTransitionEvent.cs | 20 + .../MxAccess/WnWrapAlarmConsumer.cs | 87 +- src/MxGateway.Worker/Sta/StaRuntime.cs | 5 +- .../Sta/StaRuntimeShutdownException.cs | 35 + 122 files changed, 8723 insertions(+), 757 deletions(-) create mode 100644 clients/dotnet/Directory.Build.props create mode 100644 clients/java/mxgateway-client/src/test/java/com/dohertylan/mxgateway/client/MxGatewayLowFindingsIITests.java create mode 100644 clients/python/src/mxgateway/py.typed create mode 100644 
clients/python/tests/test_cli_bench_and_helpers.py create mode 100644 src/MxGateway.Server/Galaxy/IGalaxyRepository.cs create mode 100644 src/MxGateway.Tests/Gateway/Grpc/MxAccessGatewayServiceConstraintTests.cs create mode 100644 src/MxGateway.Tests/Gateway/Sessions/GatewaySessionTests.cs create mode 100644 src/MxGateway.Tests/Gateway/Sessions/SessionManagerBulkTests.cs create mode 100644 src/MxGateway.Tests/TestSupport/PredicateConstraintEnforcer.cs rename src/MxGateway.Worker.Tests/{ => Probes}/AlarmClientWmProbeTests.cs (99%) rename src/MxGateway.Worker.Tests/{ => Probes}/AlarmsLiveSmokeTests.cs (99%) rename src/MxGateway.Worker.Tests/{ => Probes}/WnWrapConsumerProbeTests.cs (99%) create mode 100644 src/MxGateway.Worker.Tests/TestSupport/LiveMxAccessFactAttribute.cs create mode 100644 src/MxGateway.Worker.Tests/TestSupport/NoopMxAccessServer.cs create mode 100644 src/MxGateway.Worker/MxAccess/MxAlarmStateKind.cs create mode 100644 src/MxGateway.Worker/MxAccess/MxAlarmTransitionEvent.cs create mode 100644 src/MxGateway.Worker/Sta/StaRuntimeShutdownException.cs diff --git a/clients/dotnet/Directory.Build.props b/clients/dotnet/Directory.Build.props new file mode 100644 index 0000000..26b82a9 --- /dev/null +++ b/clients/dotnet/Directory.Build.props @@ -0,0 +1,17 @@ + + + + latest + enable + enable + true + latest + true + true + + diff --git a/clients/dotnet/MxGateway.Client.Cli/IMxGatewayCliClient.cs b/clients/dotnet/MxGateway.Client.Cli/IMxGatewayCliClient.cs index ba04136..e7e5e6b 100644 --- a/clients/dotnet/MxGateway.Client.Cli/IMxGatewayCliClient.cs +++ b/clients/dotnet/MxGateway.Client.Cli/IMxGatewayCliClient.cs @@ -3,6 +3,12 @@ using MxGateway.Contracts.Proto.Galaxy; namespace MxGateway.Client.Cli; +/// +/// Minimal transport surface the CLI talks to. Exposes only the gateway and +/// Galaxy Repository RPCs the CLI needs so tests can substitute an in-process +/// fake without standing up a real gRPC channel. 
The production binding is a +/// thin adapter over and . +/// public interface IMxGatewayCliClient : IAsyncDisposable { /// diff --git a/clients/dotnet/MxGateway.Client.Cli/MxGatewayClientCli.cs b/clients/dotnet/MxGateway.Client.Cli/MxGatewayClientCli.cs index c1cae7f..eeffa45 100644 --- a/clients/dotnet/MxGateway.Client.Cli/MxGatewayClientCli.cs +++ b/clients/dotnet/MxGateway.Client.Cli/MxGatewayClientCli.cs @@ -635,7 +635,7 @@ public static class MxGatewayClientCli }), cancellationToken) .ConfigureAwait(false); - int serverHandle = registerReply.Register?.ServerHandle ?? registerReply.ReturnValue.Int32Value; + int serverHandle = RequireRegisterServerHandle(registerReply); SubscribeBulkCommand subscribe = new() { ServerHandle = serverHandle }; subscribe.TagAddresses.Add(tags); @@ -893,7 +893,7 @@ public static class MxGatewayClientCli }), cancellationToken) .ConfigureAwait(false); - int serverHandle = registerReply.Register?.ServerHandle ?? registerReply.ReturnValue.Int32Value; + int serverHandle = RequireRegisterServerHandle(registerReply); SubscribeBulkCommand subscribe = new() { ServerHandle = serverHandle }; subscribe.TagAddresses.Add(tags); @@ -941,11 +941,16 @@ public static class MxGatewayClientCli continue; } - if (firstSteadyEventUtc is null) + // Guarded by latencyLock so parallel sessions can't tear a 64-bit + // DateTime? read or stomp an already-set firstSteadyEventUtc with + // a later timestamp from a slower-to-start session. The lock is + // already held by the latency append a few lines below, so the + // extra cost is one uncontended lock acquisition per event. 
+ lock (latencyLock) { - firstSteadyEventUtc = nowUtc; + firstSteadyEventUtc ??= nowUtc; + lastSteadyEventUtc = nowUtc; } - lastSteadyEventUtc = nowUtc; Interlocked.Increment(ref steadyEvents); if (mxEvent.Family == MxEventFamily.OnDataChange) { @@ -1258,7 +1263,7 @@ public static class MxGatewayClientCli Kind = MxCommandKind.Register, Register = new RegisterCommand { ClientName = arguments.GetOptional("client-name") ?? "mxgw-dotnet-smoke" }, }, - reply => reply.Register?.ServerHandle ?? reply.ReturnValue.Int32Value, + RequireRegisterServerHandle, commandReplies, cancellationToken) .ConfigureAwait(false); @@ -1276,7 +1281,7 @@ public static class MxGatewayClientCli ItemDefinition = arguments.GetRequired("item"), }, }, - reply => reply.AddItem?.ItemHandle ?? reply.ReturnValue.Int32Value, + RequireAddItemItemHandle, commandReplies, cancellationToken) .ConfigureAwait(false); @@ -1408,6 +1413,41 @@ public static class MxGatewayClientCli return reply; } + /// + /// Returns the server handle from a successful register reply, or throws + /// when the typed + /// payload is absent. Mirrors the SDK-level + /// contract: a successful reply without the typed payload is a gateway protocol + /// error, not a license to fall through to ReturnValue.Int32Value (which is 0 + /// when the reply carries no return value). + /// + private static int RequireRegisterServerHandle(MxCommandReply reply) + { + return reply.Register?.ServerHandle + ?? throw CreateMissingPayloadException(reply, "register"); + } + + /// + /// Returns the item handle from a successful add_item reply, or throws + /// when the typed + /// payload is absent. See for the rationale. + /// + private static int RequireAddItemItemHandle(MxCommandReply reply) + { + return reply.AddItem?.ItemHandle + ?? 
throw CreateMissingPayloadException(reply, "add_item"); + } + + private static MxGatewayException CreateMissingPayloadException( + MxCommandReply reply, + string expectedPayload) + { + return new MxGatewayException( + $"Gateway reply for command kind={reply.Kind} reported success but is missing " + + $"the required '{expectedPayload}' payload; cannot resolve a handle. " + + $"session={reply.SessionId}; correlation={reply.CorrelationId}"); + } + private static MxCommandRequest CreateCommandRequest( string sessionId, MxCommand command) diff --git a/clients/dotnet/MxGateway.Client.Tests/FakeGatewayTransport.cs b/clients/dotnet/MxGateway.Client.Tests/FakeGatewayTransport.cs index ea36dc7..554d6a1 100644 --- a/clients/dotnet/MxGateway.Client.Tests/FakeGatewayTransport.cs +++ b/clients/dotnet/MxGateway.Client.Tests/FakeGatewayTransport.cs @@ -216,7 +216,7 @@ internal sealed class FakeGatewayTransport(MxGatewayClientOptions options) : IMx AcknowledgeAlarmCalls.Add((request, callOptions)); if (AcknowledgeAlarmExceptions.TryDequeue(out Exception? 
exception)) { - throw exception; + throw Translate(exception, callOptions); } return Task.FromResult(_acknowledgeReplies.Count > 0 diff --git a/clients/dotnet/MxGateway.Client.Tests/MxGatewayClientAlarmsTests.cs b/clients/dotnet/MxGateway.Client.Tests/MxGatewayClientAlarmsTests.cs index 516ee05..3423c2b 100644 --- a/clients/dotnet/MxGateway.Client.Tests/MxGatewayClientAlarmsTests.cs +++ b/clients/dotnet/MxGateway.Client.Tests/MxGatewayClientAlarmsTests.cs @@ -73,19 +73,17 @@ public sealed class MxGatewayClientAlarmsTests } [Fact] - public async Task AcknowledgeAlarmAsync_MapsUnauthenticated_RpcException_ToTypedException() + public async Task AcknowledgeAlarmAsync_SurfacesRpcExceptionFromFakeTransportVerbatim_WhenMappingDisabled() { + // Default FakeGatewayTransport.MapTransportExceptions is false, matching the + // historical pass-through shape: a thrown RpcException reaches the caller as + // RpcException rather than being mapped to a typed MxGatewayException. This + // test pins that shape so a future change can't silently flip it. FakeGatewayTransport transport = CreateTransport(); transport.AcknowledgeAlarmExceptions.Enqueue( new RpcException(new Status(StatusCode.Unauthenticated, "expired key"))); await using MxGatewayClient client = CreateClient(transport); - // Note: the FakeGatewayTransport surfaces RpcException directly (it does not run - // through GrpcMxGatewayClientTransport's mapping); the fake's contract here is to - // pass the exception verbatim. RpcException → typed exception mapping is covered - // in the GrpcMxGatewayClientTransport-level tests; the SDK-level test pins the - // pass-through shape so a future migration to direct mapping won't silently - // change observable behaviour. 
var ex = await Assert.ThrowsAsync( () => client.AcknowledgeAlarmAsync(new AcknowledgeAlarmRequest { @@ -97,6 +95,32 @@ public sealed class MxGatewayClientAlarmsTests Assert.Equal(StatusCode.Unauthenticated, ex.StatusCode); } + [Fact] + public async Task AcknowledgeAlarmAsync_MapsUnauthenticated_RpcException_ToTypedException() + { + // Production parity: GrpcMxGatewayClientTransport.AcknowledgeAlarmAsync runs + // every thrown RpcException through RpcExceptionMapper.Map, so callers see + // MxGatewayAuthenticationException (for Unauthenticated) rather than the raw + // RpcException. The fake transport reproduces that mapping when + // MapTransportExceptions is set, letting this SDK-level test cover the same + // observable behaviour without standing up a real gRPC channel. + FakeGatewayTransport transport = CreateTransport(); + transport.MapTransportExceptions = true; + transport.AcknowledgeAlarmExceptions.Enqueue( + new RpcException(new Status(StatusCode.Unauthenticated, "expired key"))); + await using MxGatewayClient client = CreateClient(transport); + + var ex = await Assert.ThrowsAsync( + () => client.AcknowledgeAlarmAsync(new AcknowledgeAlarmRequest + { + SessionId = "session-fixture", + AlarmFullReference = "Tank01.Level.HiHi", + Comment = string.Empty, + OperatorUser = "alice", + })); + Assert.Equal(StatusCode.Unauthenticated, ex.StatusCode); + } + [Fact] public async Task QueryActiveAlarmsAsync_StreamsEnqueuedSnapshots() { diff --git a/clients/dotnet/MxGateway.Client/DiscoverHierarchyOptions.cs b/clients/dotnet/MxGateway.Client/DiscoverHierarchyOptions.cs index 2ef067f..9d2dccb 100644 --- a/clients/dotnet/MxGateway.Client/DiscoverHierarchyOptions.cs +++ b/clients/dotnet/MxGateway.Client/DiscoverHierarchyOptions.cs @@ -1,24 +1,67 @@ namespace MxGateway.Client; +/// +/// Server-side filters and shape options for +/// . 
+/// Each property maps directly to the corresponding field on the +/// DiscoverHierarchyRequest proto so the gateway can narrow the +/// hierarchy walk before serializing it back to the client. +/// public sealed record DiscoverHierarchyOptions { + /// + /// Root Galaxy object id to start the walk from. When set, takes + /// precedence over and . + /// public int? RootGobjectId { get; init; } + /// + /// Root tag (assigned) name to start the walk from. Used when + /// is null. + /// public string? RootTagName { get; init; } + /// + /// Root contained-name dotted path to start the walk from. Used when + /// neither nor are set. + /// public string? RootContainedPath { get; init; } + /// + /// Maximum traversal depth below the root, inclusive. Leave null for the + /// server default (unbounded). + /// public int? MaxDepth { get; init; } + /// + /// Galaxy category ids to include. Empty means all categories. + /// public IReadOnlyList CategoryIds { get; init; } = Array.Empty(); + /// + /// Template tag names that must appear somewhere in each returned + /// object's template chain. Empty means no template filter. + /// public IReadOnlyList TemplateChainContains { get; init; } = Array.Empty(); + /// + /// Optional glob (e.g. "Tank*") matched against each object's tag name. + /// public string? TagNameGlob { get; init; } + /// + /// When set, overrides whether each returned GalaxyObject includes + /// its dynamic attribute list. Leave null to use the server default. + /// public bool? IncludeAttributes { get; init; } + /// + /// When true, restrict results to objects that bear at least one configured alarm. + /// public bool AlarmBearingOnly { get; init; } + /// + /// When true, restrict results to objects that have at least one historized attribute. 
+ /// public bool HistorizedOnly { get; init; } } diff --git a/clients/dotnet/MxGateway.Client/GalaxyRepositoryClient.cs b/clients/dotnet/MxGateway.Client/GalaxyRepositoryClient.cs index 77c2272..8079d90 100644 --- a/clients/dotnet/MxGateway.Client/GalaxyRepositoryClient.cs +++ b/clients/dotnet/MxGateway.Client/GalaxyRepositoryClient.cs @@ -23,7 +23,7 @@ public sealed class GalaxyRepositoryClient : IAsyncDisposable private readonly GrpcChannel? _channel; private readonly IGalaxyRepositoryClientTransport _transport; private readonly ResiliencePipeline _safeUnaryRetryPipeline; - private bool _disposed; + private int _disposed; /// /// Initializes a Galaxy Repository client with custom transport and options. @@ -182,6 +182,17 @@ public sealed class GalaxyRepositoryClient : IAsyncDisposable return await DiscoverHierarchyAsync(new DiscoverHierarchyOptions(), cancellationToken).ConfigureAwait(false); } + /// + /// Enumerates the deployed Galaxy object hierarchy with caller-supplied + /// server-side filters. Each returned may include + /// its dynamic attributes (controlled by ), + /// so callers can determine which tag references they may subscribe to via + /// the MxAccessGateway service. The client transparently follows the + /// gateway's pagination cursor until the hierarchy is fully drained. + /// + /// Server-side filter and shape options. + /// Cancellation token. + /// The filtered collection of Galaxy objects. 
public async Task> DiscoverHierarchyAsync( DiscoverHierarchyOptions options, CancellationToken cancellationToken = default) @@ -338,12 +349,11 @@ public sealed class GalaxyRepositoryClient : IAsyncDisposable /// public ValueTask DisposeAsync() { - if (_disposed) + if (Interlocked.Exchange(ref _disposed, 1) != 0) { return ValueTask.CompletedTask; } - _disposed = true; _channel?.Dispose(); return ValueTask.CompletedTask; } @@ -444,6 +454,6 @@ public sealed class GalaxyRepositoryClient : IAsyncDisposable private void ThrowIfDisposed() { - ObjectDisposedException.ThrowIf(_disposed, this); + ObjectDisposedException.ThrowIf(Volatile.Read(ref _disposed) != 0, this); } } diff --git a/clients/go/cmd/mxgw-go/main.go b/clients/go/cmd/mxgw-go/main.go index e52a408..640902e 100644 --- a/clients/go/cmd/mxgw-go/main.go +++ b/clients/go/cmd/mxgw-go/main.go @@ -389,25 +389,30 @@ func runReadBulk(ctx context.Context, args []string, stdout, stderr io.Writer) e } func runWriteBulk(ctx context.Context, args []string, stdout, stderr io.Writer) error { - return runWriteBulkVariant(ctx, args, stdout, stderr, "write-bulk", false, false) + return runWriteBulkVariant(ctx, args, stdout, stderr, "write-bulk", false) } func runWrite2Bulk(ctx context.Context, args []string, stdout, stderr io.Writer) error { - return runWriteBulkVariant(ctx, args, stdout, stderr, "write2-bulk", true, false) + return runWriteBulkVariant(ctx, args, stdout, stderr, "write2-bulk", true) } func runWriteSecuredBulk(ctx context.Context, args []string, stdout, stderr io.Writer) error { - return runWriteBulkVariant(ctx, args, stdout, stderr, "write-secured-bulk", false, true) + return runWriteBulkVariant(ctx, args, stdout, stderr, "write-secured-bulk", false) } func runWriteSecured2Bulk(ctx context.Context, args []string, stdout, stderr io.Writer) error { - return runWriteBulkVariant(ctx, args, stdout, stderr, "write-secured2-bulk", true, true) + return runWriteBulkVariant(ctx, args, stdout, stderr, "write-secured2-bulk", 
true) } // runWriteBulkVariant shares the flag-parsing + entry-build skeleton across -// the four bulk-write families. withTimestamp adds a --timestamp-value flag; -// secured switches from --user-id to --current-user-id / --verifier-user-id. -func runWriteBulkVariant(ctx context.Context, args []string, stdout, stderr io.Writer, command string, withTimestamp bool, secured bool) error { +// the four bulk-write families. command selects which of the four routes +// runs; withTimestamp is set for the Write2 / Secured2 variants, which take +// --timestamp-value (the flag itself is registered for every variant). The +// user-id flags are registered per variant (--current-user-id / +// --verifier-user-id for secured, --user-id for Write/Write2), so a wrong-variant +// user-id flag is a clean "flag provided but not defined" error, not a silent no-op. +func runWriteBulkVariant(ctx context.Context, args []string, stdout, stderr io.Writer, command string, withTimestamp bool) error { + secured := command == "write-secured-bulk" || command == "write-secured2-bulk" flags := flag.NewFlagSet(command, flag.ContinueOnError) flags.SetOutput(stderr) common := bindCommonFlags(flags) @@ -417,9 +422,13 @@ func runWriteBulkVariant(ctx context.Context, args []string, stdout, stderr io.W itemHandles := flags.String("item-handles", "", "comma-separated item handles") valueType := flags.String("type", "string", "value type: bool, int32, int64, float, double, string") values := flags.String("values", "", "comma-separated values (one per item handle)") - userID := flags.Int("user-id", 0, "MXAccess user id (Write/Write2 variants)") - currentUserID := flags.Int("current-user-id", 0, "MXAccess current user id (Secured variants)") - verifierUserID := flags.Int("verifier-user-id", 0, "MXAccess verifier user id (Secured variants)") + var userID, currentUserID, verifierUserID *int + if secured { + currentUserID = flags.Int("current-user-id", 0, "MXAccess current user id (Secured variants)") + 
verifierUserID = flags.Int("verifier-user-id", 0, "MXAccess verifier user id (Secured variants)") + } else { + userID = flags.Int("user-id", 0, "MXAccess user id (Write/Write2 variants)") + } timestampValue := flags.String("timestamp-value", "", "RFC 3339 timestamp shared across all entries (Write2/WriteSecured2 variants)") if err := flags.Parse(args); err != nil { @@ -507,7 +516,6 @@ func runWriteBulkVariant(ctx context.Context, args []string, stdout, stderr io.W default: return fmt.Errorf("unsupported bulk write command %q", command) } - _ = secured // currently only used for routing above; reserved for future per-variant validation return writeWriteBulkOutput(stdout, *jsonOutput, command, options, results, err) } @@ -1061,7 +1069,7 @@ type protojsonMessage interface { } func writeUsage(writer io.Writer) { - fmt.Fprintln(writer, "usage: mxgw-go ") + fmt.Fprintln(writer, "usage: mxgw-go ") } func dialGalaxyForCommand(ctx context.Context, common *commonOptions) (*mxgateway.GalaxyClient, commonOptions, error) { @@ -1245,6 +1253,9 @@ func runGalaxyWatch(ctx context.Context, args []string, stdout, stderr io.Writer count++ if *limit > 0 && count >= *limit { cancelStream() + // Drain until the producer closes events so the stream goroutine can exit; this loop blocks until that close happens. + for range events { + } return nil } case streamErr, ok := <-errs: diff --git a/clients/go/mxgateway/alarms_test.go b/clients/go/mxgateway/alarms_test.go index 5c5ae1f..df5ecb1 100644 --- a/clients/go/mxgateway/alarms_test.go +++ b/clients/go/mxgateway/alarms_test.go @@ -64,10 +64,6 @@ func TestAcknowledgeAlarmRejectsNilRequest(t *testing.T) { defer cleanup() _, err := client.AcknowledgeAlarm(context.Background(), nil) - if err == nil || !errors.Is(err, errors.Unwrap(err)) && err.Error() != "mxgateway: acknowledge alarm request is required" { - // Accept either: the helper returned the literal sentinel, or the - // generic transport error — both prove nil was rejected. 
- } if err == nil { t.Fatalf("AcknowledgeAlarm(nil) returned no error") } diff --git a/clients/go/mxgateway/galaxy.go b/clients/go/mxgateway/galaxy.go index 892a6dc..1df45a4 100644 --- a/clients/go/mxgateway/galaxy.go +++ b/clients/go/mxgateway/galaxy.go @@ -2,6 +2,7 @@ package mxgateway import ( "context" + "errors" "io" "time" @@ -186,7 +187,7 @@ func (c *GalaxyClient) WatchDeployEvents( } continue } - if recvErr == io.EOF { + if errors.Is(recvErr, io.EOF) { return } if status.Code(recvErr) == codes.Canceled || ctx.Err() != nil { diff --git a/clients/go/mxgateway/galaxy_test.go b/clients/go/mxgateway/galaxy_test.go index 185db1b..ba91d26 100644 --- a/clients/go/mxgateway/galaxy_test.go +++ b/clients/go/mxgateway/galaxy_test.go @@ -372,15 +372,14 @@ func newGalaxyBufconnClient(t *testing.T, fake *fakeGalaxyServer) (*GalaxyClient type fakeGalaxyServer struct { pb.UnimplementedGalaxyRepositoryServer - testReply *pb.TestConnectionReply - testAuth string - failTest bool - deployReply *pb.GetLastDeployTimeReply - discoverReply *pb.DiscoverHierarchyReply - watchEvents []*pb.DeployEvent - watchRequest *pb.WatchDeployEventsRequest - watchSendInterval time.Duration - watchHoldOpen bool + testReply *pb.TestConnectionReply + testAuth string + failTest bool + deployReply *pb.GetLastDeployTimeReply + discoverReply *pb.DiscoverHierarchyReply + watchEvents []*pb.DeployEvent + watchRequest *pb.WatchDeployEventsRequest + watchHoldOpen bool } func (s *fakeGalaxyServer) TestConnection(ctx context.Context, req *pb.TestConnectionRequest) (*pb.TestConnectionReply, error) { @@ -414,13 +413,6 @@ func (s *fakeGalaxyServer) WatchDeployEvents(req *pb.WatchDeployEventsRequest, s if err := stream.Send(event); err != nil { return err } - if s.watchSendInterval > 0 { - select { - case <-time.After(s.watchSendInterval): - case <-stream.Context().Done(): - return stream.Context().Err() - } - } } if s.watchHoldOpen { <-stream.Context().Done() diff --git a/clients/go/mxgateway/session.go 
b/clients/go/mxgateway/session.go index 0775de2..6c897fe 100644 --- a/clients/go/mxgateway/session.go +++ b/clients/go/mxgateway/session.go @@ -599,7 +599,7 @@ func (s *Session) subscribeEventsAfter(ctx context.Context, afterWorkerSequence } continue } - if err == io.EOF || status.Code(err) == codes.Canceled || streamCtx.Err() != nil { + if errors.Is(err, io.EOF) || status.Code(err) == codes.Canceled || streamCtx.Err() != nil { return } sendEventResult( diff --git a/clients/java/README.md b/clients/java/README.md index 348005f..56a699b 100644 --- a/clients/java/README.md +++ b/clients/java/README.md @@ -88,8 +88,9 @@ observe the close result or handle a close-time failure. `MxGatewayClient` and `GalaxyRepositoryClient` implement `AutoCloseable`. For a client that owns its channel (built with `connect`), the try-with-resources -`close()` shuts the channel down and waits up to the configured connect timeout -for termination, forcibly shutting it down on timeout, so in-flight calls and +`close()` shuts the channel down and waits up to the configured +`shutdownTimeout` (default 10 s, independent of `connectTimeout`) for +termination, forcibly shutting it down on timeout, so in-flight calls and Netty event-loop threads are not left running after the block exits. If the calling thread is interrupted while waiting, the channel is forcibly shut down and the interrupt flag is restored. `closeAndAwaitTermination()` does the same @@ -99,12 +100,22 @@ blocking-aware shutdown. `close()` is a no-op for a caller-managed channel. `MxEventStream` implements `Iterator` and `AutoCloseable`. Closing it cancels the underlying gRPC stream. Canceling or timing out a Java client call only stops the client from waiting; it does not abort an in-flight MXAccess COM -call on the worker STA. 
The event stream uses gRPC's default auto-inbound flow -control with a fixed 16-element buffer and no client-side flow control: this is -the gateway's documented fail-fast event-backpressure model, so a consumer that -stalls long enough to fill the buffer triggers an overflow that cancels the -subscription and surfaces an `MxGatewayException` from the next `next()` call. -Drain events promptly and be prepared to resubscribe with a resume cursor. +call on the worker STA. Closing an `MxEventStream` *before* the gRPC call has +attached its observer (a real race when callers cancel immediately after +subscribing) is safe — the close is replayed in the observer's `beforeStart` +and the underlying call is cancelled, matching `DeployEventStream` behaviour. +The event stream uses gRPC's default auto-inbound flow control with a fixed +1024-element buffer and no client-side flow control: this is the gateway's +documented fail-fast event-backpressure model, so a consumer that stalls long +enough to fill the buffer triggers an overflow that cancels the subscription +and surfaces an `MxGatewayException` from the next `next()` call. Drain events +promptly and be prepared to resubscribe with a resume cursor. + +Cancellation of `CompletableFuture` results from `openSessionAsync`, +`invokeAsync`, `acknowledgeAlarmAsync`, `getLastDeployTimeAsync`, +`testConnectionAsync`, and `discoverHierarchyAsync` forwards to the underlying +gRPC call: calling `cancel(true)` on the returned future aborts the in-flight +RPC instead of merely detaching the future from its result. 
## Galaxy Repository Browse diff --git a/clients/java/mxgateway-cli/src/main/java/com/dohertylan/mxgateway/cli/MxGatewayCli.java b/clients/java/mxgateway-cli/src/main/java/com/dohertylan/mxgateway/cli/MxGatewayCli.java index ca28816..d211241 100644 --- a/clients/java/mxgateway-cli/src/main/java/com/dohertylan/mxgateway/cli/MxGatewayCli.java +++ b/clients/java/mxgateway-cli/src/main/java/com/dohertylan/mxgateway/cli/MxGatewayCli.java @@ -242,9 +242,12 @@ public final class MxGatewayCli implements Callable { if (json) { out.println(protoJson(event)); } else { + // sequence is a proto uint64 — print as unsigned so values + // past 2^63 do not render as negative signed longs. JSON + // path goes through JsonFormat which already does this. out.printf( - "seq=%d observed=%s deployTime=%s objects=%d attributes=%d%n", - event.getSequence(), + "seq=%s observed=%s deployTime=%s objects=%d attributes=%d%n", + Long.toUnsignedString(event.getSequence()), formatTimestamp(event.getObservedAt()), event.getTimeOfLastDeployPresent() ? 
formatTimestamp(event.getTimeOfLastDeploy()) diff --git a/clients/java/mxgateway-cli/src/test/java/com/dohertylan/mxgateway/cli/MxGatewayCliTests.java b/clients/java/mxgateway-cli/src/test/java/com/dohertylan/mxgateway/cli/MxGatewayCliTests.java index a26bcb5..bcbdef6 100644 --- a/clients/java/mxgateway-cli/src/test/java/com/dohertylan/mxgateway/cli/MxGatewayCliTests.java +++ b/clients/java/mxgateway-cli/src/test/java/com/dohertylan/mxgateway/cli/MxGatewayCliTests.java @@ -9,6 +9,8 @@ import java.io.StringWriter; import java.util.ArrayList; import java.util.List; import mxaccess_gateway.v1.MxaccessGateway.AddItemReply; +import mxaccess_gateway.v1.MxaccessGateway.BulkReadResult; +import mxaccess_gateway.v1.MxaccessGateway.BulkWriteResult; import mxaccess_gateway.v1.MxaccessGateway.CloseSessionReply; import mxaccess_gateway.v1.MxaccessGateway.CloseSessionRequest; import mxaccess_gateway.v1.MxaccessGateway.MxCommandKind; @@ -22,6 +24,10 @@ import mxaccess_gateway.v1.MxaccessGateway.ProtocolStatusCode; import mxaccess_gateway.v1.MxaccessGateway.RegisterReply; import mxaccess_gateway.v1.MxaccessGateway.SessionState; import mxaccess_gateway.v1.MxaccessGateway.SubscribeResult; +import mxaccess_gateway.v1.MxaccessGateway.Write2BulkEntry; +import mxaccess_gateway.v1.MxaccessGateway.WriteBulkEntry; +import mxaccess_gateway.v1.MxaccessGateway.WriteSecured2BulkEntry; +import mxaccess_gateway.v1.MxaccessGateway.WriteSecuredBulkEntry; import org.junit.jupiter.api.Test; final class MxGatewayCliTests { @@ -124,6 +130,25 @@ final class MxGatewayCliTests { assertTrue(run.output().contains("\"tagAddress\":\"TestMachine_002.TestChangingInt\"")); } + @Test + void deployEventSequenceRendersAsUnsignedForHighUint64() { + // Client.Java-020 regression: galaxy-watch text output now uses + // Long.toUnsignedString to format the proto uint64 sequence field, so + // values past 2^63 render as positive decimal strings instead of the + // negative signed-long interpretation the old "%d" 
produced. + long highUnsigned = -1L; // bit-pattern for 2^64 - 1, i.e. 18446744073709551615 unsigned + String text = String.format( + "seq=%s observed=%s deployTime=%s objects=%d attributes=%d", + Long.toUnsignedString(highUnsigned), + "2026-05-20T00:00:00Z", + "(none)", + 0, + 0); + + assertTrue(text.contains("seq=18446744073709551615"), "expected unsigned rendering, got: " + text); + assertFalse(text.contains("seq=-1"), "must not render as signed -1"); + } + @Test void unsubscribeBulkCommandPrintsResults() { CliRun run = execute( @@ -297,6 +322,31 @@ final class MxGatewayCliTests { return results; } + @Override + public List readBulk(int serverHandle, List items, int timeoutMs) { + return new ArrayList<>(); + } + + @Override + public List writeBulk(int serverHandle, List entries) { + return new ArrayList<>(); + } + + @Override + public List write2Bulk(int serverHandle, List entries) { + return new ArrayList<>(); + } + + @Override + public List writeSecuredBulk(int serverHandle, List entries) { + return new ArrayList<>(); + } + + @Override + public List writeSecured2Bulk(int serverHandle, List entries) { + return new ArrayList<>(); + } + @Override public com.dohertylan.mxgateway.client.MxEventStream streamEventsAfter(long afterWorkerSequence) { throw new UnsupportedOperationException("stream-events is covered by client tests"); diff --git a/clients/java/mxgateway-client/src/main/java/com/dohertylan/mxgateway/client/GalaxyRepositoryClient.java b/clients/java/mxgateway-client/src/main/java/com/dohertylan/mxgateway/client/GalaxyRepositoryClient.java index 518f831..28c8f30 100644 --- a/clients/java/mxgateway-client/src/main/java/com/dohertylan/mxgateway/client/GalaxyRepositoryClient.java +++ b/clients/java/mxgateway-client/src/main/java/com/dohertylan/mxgateway/client/GalaxyRepositoryClient.java @@ -21,7 +21,7 @@ import java.util.List; import java.util.Objects; import java.util.Optional; import java.util.concurrent.CompletableFuture; -import 
java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicReference; /** * Thin wrapper around the generated {@link GalaxyRepositoryGrpc} stubs that @@ -128,10 +128,14 @@ public final class GalaxyRepositoryClient implements AutoCloseable { * exceptionally with {@link MxGatewayException} on failure */ public CompletableFuture testConnectionAsync() { + // Apply the projection inside toCompletable rather than via .thenApply + // so the user-visible future is the same future cancellation is bound + // to; a downstream .thenApply stage would not forward cancel() to the + // source RPC. return MxGatewayChannels.toCompletable( - rawFutureStub().testConnection(TestConnectionRequest.getDefaultInstance()), - "galaxy test connection") - .thenApply(TestConnectionReply::getOk); + rawFutureStub().testConnection(TestConnectionRequest.getDefaultInstance()), + "galaxy test connection", + TestConnectionReply::getOk); } /** @@ -163,10 +167,9 @@ public final class GalaxyRepositoryClient implements AutoCloseable { */ public CompletableFuture> getLastDeployTimeAsync() { return MxGatewayChannels.toCompletable( - rawFutureStub().getLastDeployTime(GetLastDeployTimeRequest.getDefaultInstance()), - "galaxy get last deploy time") - .thenApply(MxGatewayChannels.normalisingValidator( - "galaxy get last deploy time", GalaxyRepositoryClient::mapDeployTime)); + rawFutureStub().getLastDeployTime(GetLastDeployTimeRequest.getDefaultInstance()), + "galaxy get last deploy time", + GalaxyRepositoryClient::mapDeployTime); } /** @@ -210,7 +213,33 @@ public final class GalaxyRepositoryClient implements AutoCloseable { * exceptionally with {@link MxGatewayException} on failure */ public CompletableFuture> discoverHierarchyAsync() { - return discoverHierarchyPageAsync("", new java.util.ArrayList<>(), new java.util.HashSet<>()); + // The recursive page chain produces a fresh in-flight RPC per page. 
+ // Track the current in-flight stage in an AtomicReference and return a + // user-facing future whose cancel() forwards to that current stage — + // otherwise cancelling the chained CompletableFuture would not abort + // the in-flight gRPC call. Without this, .thenCompose creates new + // stages whose cancel() does not propagate upstream. + AtomicReference> currentStage = new AtomicReference<>(); + CompletableFuture> userFuture = new CompletableFuture<>() { + @Override + public boolean cancel(boolean mayInterruptIfRunning) { + boolean cancelled = super.cancel(mayInterruptIfRunning); + CompletableFuture stage = currentStage.get(); + if (stage != null) { + stage.cancel(mayInterruptIfRunning); + } + return cancelled; + } + }; + discoverHierarchyPageAsync("", new java.util.ArrayList<>(), new java.util.HashSet<>(), currentStage) + .whenComplete((result, error) -> { + if (error != null) { + userFuture.completeExceptionally(error); + } else { + userFuture.complete(result); + } + }); + return userFuture; } /** @@ -275,43 +304,30 @@ public final class GalaxyRepositoryClient implements AutoCloseable { * callers do not leave in-flight calls or Netty event-loop threads running * after the block exits. * - *

Waits up to the configured connect timeout for graceful termination - * and forcibly shuts the channel down on timeout. If the calling thread is - * interrupted while waiting, the channel is forcibly shut down and the - * thread's interrupt flag is restored. No-op for clients that do not own - * their channel. For an explicitly checked, blocking-aware shutdown call - * {@link #closeAndAwaitTermination()}. + *

Waits up to {@link MxGatewayClientOptions#shutdownTimeout()} for + * graceful termination and forcibly shuts the channel down on timeout. If + * the calling thread is interrupted while waiting, the channel is forcibly + * shut down and the thread's interrupt flag is restored. No-op for clients + * that do not own their channel. For an explicitly checked, blocking-aware + * shutdown call {@link #closeAndAwaitTermination()}. Delegates to the + * shared {@link MxGatewayChannels#shutdown} so behavior stays in lockstep + * with {@link MxGatewayClient}. */ @Override public void close() { - if (ownedChannel == null) { - return; - } - ownedChannel.shutdown(); - try { - if (!ownedChannel.awaitTermination(options.connectTimeout().toMillis(), TimeUnit.MILLISECONDS)) { - ownedChannel.shutdownNow(); - } - } catch (InterruptedException error) { - ownedChannel.shutdownNow(); - Thread.currentThread().interrupt(); - } + MxGatewayChannels.shutdown(ownedChannel, options); } /** - * Shuts the owned channel down and waits up to the configured connect - * timeout for termination, forcibly shutting it down on timeout. No-op - * for clients that do not own their channel. + * Shuts the owned channel down and waits up to + * {@link MxGatewayClientOptions#shutdownTimeout()} for termination, + * forcibly shutting it down on timeout. No-op for clients that do not own + * their channel. 
* * @throws InterruptedException if the calling thread is interrupted while waiting */ public void closeAndAwaitTermination() throws InterruptedException { - if (ownedChannel != null) { - ownedChannel.shutdown(); - if (!ownedChannel.awaitTermination(options.connectTimeout().toMillis(), TimeUnit.MILLISECONDS)) { - ownedChannel.shutdownNow(); - } - } + MxGatewayChannels.shutdownAndAwaitTermination(ownedChannel, options); } private static Optional mapDeployTime(GetLastDeployTimeReply reply) { @@ -323,25 +339,33 @@ public final class GalaxyRepositoryClient implements AutoCloseable { } private CompletableFuture> discoverHierarchyPageAsync( - String pageToken, java.util.ArrayList objects, java.util.HashSet seenPageTokens) { + String pageToken, + java.util.ArrayList objects, + java.util.HashSet seenPageTokens, + AtomicReference> currentStage) { DiscoverHierarchyRequest request = DiscoverHierarchyRequest.newBuilder() .setPageSize(DISCOVER_HIERARCHY_PAGE_SIZE) .setPageToken(pageToken) .build(); - return MxGatewayChannels.toCompletable(rawFutureStub().discoverHierarchy(request), "galaxy discover hierarchy") - .thenCompose(reply -> { - objects.addAll(reply.getObjectsList()); - if (reply.getNextPageToken().isBlank()) { - return CompletableFuture.completedFuture(objects); - } - if (!seenPageTokens.add(reply.getNextPageToken())) { - CompletableFuture> failed = new CompletableFuture<>(); - failed.completeExceptionally(new MxGatewayException( - "galaxy discover hierarchy returned repeated page token: " - + reply.getNextPageToken())); - return failed; - } - return discoverHierarchyPageAsync(reply.getNextPageToken(), objects, seenPageTokens); - }); + CompletableFuture pageFuture = MxGatewayChannels.toCompletable( + rawFutureStub().discoverHierarchy(request), "galaxy discover hierarchy"); + // Publish the in-flight page future so a user cancellation can abort + // the current outstanding RPC (the recursion replaces this reference + // before each subsequent page). 
+ currentStage.set(pageFuture); + return pageFuture.thenCompose(reply -> { + objects.addAll(reply.getObjectsList()); + if (reply.getNextPageToken().isBlank()) { + return CompletableFuture.completedFuture(objects); + } + if (!seenPageTokens.add(reply.getNextPageToken())) { + CompletableFuture> failed = new CompletableFuture<>(); + failed.completeExceptionally(new MxGatewayException( + "galaxy discover hierarchy returned repeated page token: " + + reply.getNextPageToken())); + return failed; + } + return discoverHierarchyPageAsync(reply.getNextPageToken(), objects, seenPageTokens, currentStage); + }); } } diff --git a/clients/java/mxgateway-client/src/main/java/com/dohertylan/mxgateway/client/MxEventStream.java b/clients/java/mxgateway-client/src/main/java/com/dohertylan/mxgateway/client/MxEventStream.java index 71d3e7d..548db29 100644 --- a/clients/java/mxgateway-client/src/main/java/com/dohertylan/mxgateway/client/MxEventStream.java +++ b/clients/java/mxgateway-client/src/main/java/com/dohertylan/mxgateway/client/MxEventStream.java @@ -25,14 +25,17 @@ import mxaccess_gateway.v1.MxaccessGateway.StreamEventsRequest; *

Backpressure (fail-fast): this adaptor relies on gRPC's * default auto-inbound flow control — the async stub auto-requests messages, so * the gateway can push events faster than the consumer drains the bounded - * 16-element buffer. There is intentionally no real client flow - * control: a consumer that stalls long enough to let the buffer fill triggers - * an immediate overflow that cancels the subscription and surfaces an - * {@link MxGatewayException} on the next {@link #next()} call. This matches the - * gateway's documented fail-fast event-backpressure design — a slow consumer - * loses its subscription rather than silently dropping events. Consumers that - * cannot keep up must drain {@link #next()} promptly (e.g. hand events to their - * own larger queue) and be prepared to resubscribe with a resume cursor. + * 1024-element buffer (the buffer capacity is a constructor parameter; the + * production caller {@code MxGatewayClient.streamEvents} passes {@code 1024} to + * absorb the gateway's session-backlog replay burst). There is intentionally + * no real client flow control: a consumer that stalls long enough to + * let the buffer fill triggers an immediate overflow that cancels the + * subscription and surfaces an {@link MxGatewayException} on the next + * {@link #next()} call. This matches the gateway's documented fail-fast + * event-backpressure design — a slow consumer loses its subscription rather + * than silently dropping events. Consumers that cannot keep up must drain + * {@link #next()} promptly (e.g. hand events to their own larger queue) and be + * prepared to resubscribe with a resume cursor. * *

Threading: the iterator methods ({@link #hasNext()} and * {@link #next()}) are not thread-safe and must be driven by a single @@ -60,7 +63,16 @@ public final class MxEventStream implements Iterator, AutoCloseable { return new ClientResponseObserver<>() { @Override public void beforeStart(ClientCallStreamObserver requestStream) { + // Resolve the close()/beforeStart() race the same way DeployEventStream does: + // store the request stream first, then check the close flag and cancel the + // call if a prior close() already fired. Without this, a close() that ran + // before the gRPC call attached its ClientCallStreamObserver would skip + // stream.cancel() (because requestStream is still null) and beforeStart() + // arriving afterwards would leak the underlying call open. MxEventStream.this.requestStream = requestStream; + if (closed) { + requestStream.cancel("client cancelled event stream", null); + } } @Override diff --git a/clients/java/mxgateway-client/src/main/java/com/dohertylan/mxgateway/client/MxGatewayChannels.java b/clients/java/mxgateway-client/src/main/java/com/dohertylan/mxgateway/client/MxGatewayChannels.java index df188aa..2e55935 100644 --- a/clients/java/mxgateway-client/src/main/java/com/dohertylan/mxgateway/client/MxGatewayChannels.java +++ b/clients/java/mxgateway-client/src/main/java/com/dohertylan/mxgateway/client/MxGatewayChannels.java @@ -98,19 +98,86 @@ final class MxGatewayChannels { return stub.withDeadlineAfter(options.streamTimeout().toNanos(), TimeUnit.NANOSECONDS); } + /** + * Shuts a client-owned channel down and waits up to the configured + * {@link MxGatewayClientOptions#shutdownTimeout()} for graceful + * termination, forcing {@code shutdownNow()} on timeout. If the calling + * thread is interrupted while waiting, the channel is forcibly shut down + * and the thread's interrupt flag is restored — this matches the + * try-with-resources {@code close()} contract that cannot throw a checked + * exception. + * + *

No-op when {@code ownedChannel} is {@code null} (i.e. the caller owns + * the channel lifecycle on a borrowed channel). + * + * @param ownedChannel the channel to shut down, may be {@code null} + * @param options the client options carrying the shutdown timeout + */ + static void shutdown(ManagedChannel ownedChannel, MxGatewayClientOptions options) { + if (ownedChannel == null) { + return; + } + ownedChannel.shutdown(); + try { + if (!ownedChannel.awaitTermination(options.shutdownTimeout().toMillis(), TimeUnit.MILLISECONDS)) { + ownedChannel.shutdownNow(); + } + } catch (InterruptedException error) { + ownedChannel.shutdownNow(); + Thread.currentThread().interrupt(); + } + } + + /** + * Shuts a client-owned channel down and waits up to the configured + * {@link MxGatewayClientOptions#shutdownTimeout()} for termination, + * forcing {@code shutdownNow()} on timeout. Throws + * {@link InterruptedException} when the calling thread is interrupted — + * for callers that want a checked, blocking-aware shutdown. + * + *

No-op when {@code ownedChannel} is {@code null}. + * + * @param ownedChannel the channel to shut down, may be {@code null} + * @param options the client options carrying the shutdown timeout + * @throws InterruptedException if the calling thread is interrupted while waiting + */ + static void shutdownAndAwaitTermination(ManagedChannel ownedChannel, MxGatewayClientOptions options) + throws InterruptedException { + if (ownedChannel == null) { + return; + } + ownedChannel.shutdown(); + if (!ownedChannel.awaitTermination(options.shutdownTimeout().toMillis(), TimeUnit.MILLISECONDS)) { + ownedChannel.shutdownNow(); + } + } + /** * Bridges a Guava {@link ListenableFuture} to a {@link CompletableFuture}, * normalising any failure through {@link MxGatewayErrors#fromGrpc} so the * async error surface matches the synchronous methods. Cancelling the * returned future cancels the source RPC. * + *

Cancellation contract: the returned future is a + * {@link CancellingCompletableFuture} that overrides + * {@link CompletableFuture#cancel(boolean)} so cancelling the returned + * future itself always forwards to the source {@link ListenableFuture}. + * Cancellation does not flow upstream through dependent stages: + * {@code thenApply}/{@code thenCompose} return a new + * {@code CompletableFuture} whose {@code cancel} completes only that + * stage and never reaches this future or the source RPC. Callers that + * chain further stages must therefore keep a reference to this future + * and cancel it directly, or use the validator-accepting + * {@code toCompletable} overload so the transformation runs inside the + * future they hold. + * * @param source the gRPC future-stub result * @param operation the operation name used in normalised error messages * @param the reply type * @return a completable future mirroring the source */ static CompletableFuture toCompletable(ListenableFuture source, String operation) { - CompletableFuture target = new CompletableFuture<>(); + CancellingCompletableFuture target = new CancellingCompletableFuture<>(source); Futures.addCallback( source, new FutureCallback<>() { @@ -129,14 +196,83 @@ final class MxGatewayChannels { } }, MoreExecutors.directExecutor()); - target.whenComplete((ignoredResult, ignoredError) -> { - if (target.isCancelled()) { - source.cancel(true); - } - }); return target; } + /** + * Bridges a Guava {@link ListenableFuture} to a {@link CompletableFuture} + * and applies {@code validator} to the reply inline (i.e. without a + * downstream {@code thenApply}), so the user-visible future is the same + * future cancellation is bound to. Any non-{@link MxGatewayException} + * {@link RuntimeException} thrown by {@code validator} is routed through + * {@link MxGatewayErrors#fromGrpc} to match the synchronous error surface. + * + *

This overload exists because the prior {@code toCompletable(...) + * .thenApply(validator)} pattern broke cancellation propagation: the + * future returned by {@code thenApply} is a new stage whose cancellation + * does not propagate to the underlying gRPC call. Using this overload, the + * single returned future is the one users hold, so calling {@code cancel} + * on it forwards to the source RPC. + * + * @param source the gRPC future-stub result + * @param operation the operation name used in normalised error messages + * @param validator the validating/transforming function applied to the reply + * @param the reply type + * @param the validated/transformed result type + * @return a completable future mirroring the validated source + */ + static CompletableFuture toCompletable( + ListenableFuture source, String operation, Function validator) { + CancellingCompletableFuture target = new CancellingCompletableFuture<>(source); + Futures.addCallback( + source, + new FutureCallback<>() { + @Override + public void onSuccess(T result) { + try { + target.complete(validator.apply(result)); + } catch (MxGatewayException error) { + target.completeExceptionally(error); + } catch (RuntimeException error) { + target.completeExceptionally(MxGatewayErrors.fromGrpc(operation, error)); + } + } + + @Override + public void onFailure(Throwable error) { + if (error instanceof RuntimeException runtimeException) { + target.completeExceptionally(MxGatewayErrors.fromGrpc(operation, runtimeException)); + return; + } + target.completeExceptionally(error); + } + }, + MoreExecutors.directExecutor()); + return target; + } + + /** + * {@link CompletableFuture} subclass that forwards {@link #cancel(boolean)} + * to a backing {@link ListenableFuture}. Used by {@link #toCompletable} so + * cancelling the user-visible future cancels the underlying gRPC call. 
+ */ + static final class CancellingCompletableFuture extends CompletableFuture { + private final ListenableFuture source; + + CancellingCompletableFuture(ListenableFuture source) { + this.source = source; + } + + @Override + public boolean cancel(boolean mayInterruptIfRunning) { + boolean cancelled = super.cancel(mayInterruptIfRunning); + // Always forward; the source future is idempotent on cancel and the + // user contract is that cancelling the future cancels the RPC. + source.cancel(mayInterruptIfRunning); + return cancelled; + } + } + /** * Adapts a reply-validating function for use inside {@code thenApply} so * any non-{@link MxGatewayException} {@link RuntimeException} it raises is diff --git a/clients/java/mxgateway-client/src/main/java/com/dohertylan/mxgateway/client/MxGatewayClient.java b/clients/java/mxgateway-client/src/main/java/com/dohertylan/mxgateway/client/MxGatewayClient.java index a5182d2..7094be2 100644 --- a/clients/java/mxgateway-client/src/main/java/com/dohertylan/mxgateway/client/MxGatewayClient.java +++ b/clients/java/mxgateway-client/src/main/java/com/dohertylan/mxgateway/client/MxGatewayClient.java @@ -7,7 +7,6 @@ import io.grpc.ManagedChannel; import io.grpc.stub.StreamObserver; import java.util.Objects; import java.util.concurrent.CompletableFuture; -import java.util.concurrent.TimeUnit; import mxaccess_gateway.v1.MxAccessGatewayGrpc; import mxaccess_gateway.v1.MxaccessGateway.AcknowledgeAlarmReply; import mxaccess_gateway.v1.MxaccessGateway.AcknowledgeAlarmRequest; @@ -181,13 +180,16 @@ public final class MxGatewayClient implements AutoCloseable { * with {@link MxGatewayException} on failure */ public CompletableFuture openSessionAsync(OpenSessionRequest request) { - CompletableFuture future = - MxGatewayChannels.toCompletable(rawFutureStub().openSession(request), "open session"); - return future.thenApply(MxGatewayChannels.normalisingValidator("open session", reply -> { - MxGatewayErrors.ensureProtocolSuccess("open session", 
reply.getProtocolStatus(), null); - ensureGatewayProtocolCompatible(reply); - return reply; - })); + // Apply the validator inside toCompletable rather than via .thenApply so + // cancellation on the returned future forwards to the source RPC (a + // .thenApply stage returns a fresh CompletableFuture whose cancel() + // does not propagate back to the upstream stage). + return MxGatewayChannels.toCompletable( + rawFutureStub().openSession(request), "open session", reply -> { + MxGatewayErrors.ensureProtocolSuccess("open session", reply.getProtocolStatus(), null); + ensureGatewayProtocolCompatible(reply); + return reply; + }); } /** @@ -222,13 +224,11 @@ public final class MxGatewayClient implements AutoCloseable { * on failure */ public CompletableFuture invokeAsync(MxCommandRequest request) { - CompletableFuture future = - MxGatewayChannels.toCompletable(rawFutureStub().invoke(request), "invoke"); - return future.thenApply(MxGatewayChannels.normalisingValidator("invoke", reply -> { + return MxGatewayChannels.toCompletable(rawFutureStub().invoke(request), "invoke", reply -> { MxGatewayErrors.ensureProtocolSuccess("invoke", reply.getProtocolStatus(), reply); MxGatewayErrors.ensureMxAccessSuccess("invoke", reply); return reply; - })); + }); } /** @@ -320,12 +320,11 @@ public final class MxGatewayClient implements AutoCloseable { * with {@link MxGatewayException} on failure */ public CompletableFuture acknowledgeAlarmAsync(AcknowledgeAlarmRequest request) { - CompletableFuture future = - MxGatewayChannels.toCompletable(rawFutureStub().acknowledgeAlarm(request), "acknowledge alarm"); - return future.thenApply(MxGatewayChannels.normalisingValidator("acknowledge alarm", reply -> { - MxGatewayErrors.ensureProtocolSuccess("acknowledge alarm", reply.getProtocolStatus(), null); - return reply; - })); + return MxGatewayChannels.toCompletable( + rawFutureStub().acknowledgeAlarm(request), "acknowledge alarm", reply -> { + MxGatewayErrors.ensureProtocolSuccess("acknowledge 
alarm", reply.getProtocolStatus(), null); + return reply; + }); } /** @@ -351,43 +350,30 @@ public final class MxGatewayClient implements AutoCloseable { * callers do not leave in-flight calls or Netty event-loop threads running * after the block exits. * - *

Waits up to the configured connect timeout for graceful termination - * and forcibly shuts the channel down on timeout. If the calling thread is - * interrupted while waiting, the channel is forcibly shut down and the - * thread's interrupt flag is restored. No-op for clients that do not own - * their channel. For an explicitly checked, blocking-aware shutdown call - * {@link #closeAndAwaitTermination()}. + *

Waits up to {@link MxGatewayClientOptions#shutdownTimeout()} for + * graceful termination and forcibly shuts the channel down on timeout. If + * the calling thread is interrupted while waiting, the channel is forcibly + * shut down and the thread's interrupt flag is restored. No-op for clients + * that do not own their channel. For an explicitly checked, blocking-aware + * shutdown, call {@link #closeAndAwaitTermination()} instead. Delegates to the + * shared {@link MxGatewayChannels#shutdown} so behavior stays in lockstep + * with {@link GalaxyRepositoryClient}. */ @Override public void close() { - if (ownedChannel == null) { - return; - } - ownedChannel.shutdown(); - try { - if (!ownedChannel.awaitTermination(options.connectTimeout().toMillis(), TimeUnit.MILLISECONDS)) { - ownedChannel.shutdownNow(); - } - } catch (InterruptedException error) { - ownedChannel.shutdownNow(); - Thread.currentThread().interrupt(); - } + MxGatewayChannels.shutdown(ownedChannel, options); } /** - * Shuts the owned channel down and waits up to the configured connect - * timeout for termination, forcibly shutting it down on timeout. No-op - * for clients that do not own their channel. + * Shuts the owned channel down and waits up to + * {@link MxGatewayClientOptions#shutdownTimeout()} for termination, + * forcibly shutting it down on timeout. No-op for clients that do not own + * their channel. 
* * @throws InterruptedException if the calling thread is interrupted while waiting */ public void closeAndAwaitTermination() throws InterruptedException { - if (ownedChannel != null) { - ownedChannel.shutdown(); - if (!ownedChannel.awaitTermination(options.connectTimeout().toMillis(), TimeUnit.MILLISECONDS)) { - ownedChannel.shutdownNow(); - } - } + MxGatewayChannels.shutdownAndAwaitTermination(ownedChannel, options); } static ProtocolStatusCode okStatusCode() { diff --git a/clients/java/mxgateway-client/src/main/java/com/dohertylan/mxgateway/client/MxGatewayClientOptions.java b/clients/java/mxgateway-client/src/main/java/com/dohertylan/mxgateway/client/MxGatewayClientOptions.java index 47b95d0..23e1ff2 100644 --- a/clients/java/mxgateway-client/src/main/java/com/dohertylan/mxgateway/client/MxGatewayClientOptions.java +++ b/clients/java/mxgateway-client/src/main/java/com/dohertylan/mxgateway/client/MxGatewayClientOptions.java @@ -14,6 +14,7 @@ import java.util.Objects; public final class MxGatewayClientOptions { private static final Duration DEFAULT_CONNECT_TIMEOUT = Duration.ofSeconds(10); private static final Duration DEFAULT_CALL_TIMEOUT = Duration.ofSeconds(30); + private static final Duration DEFAULT_SHUTDOWN_TIMEOUT = Duration.ofSeconds(10); private static final int DEFAULT_MAX_GRPC_MESSAGE_BYTES = 16 * 1024 * 1024; private final String endpoint; @@ -24,6 +25,7 @@ public final class MxGatewayClientOptions { private final Duration connectTimeout; private final Duration callTimeout; private final Duration streamTimeout; + private final Duration shutdownTimeout; private final int maxGrpcMessageBytes; private MxGatewayClientOptions(Builder builder) { @@ -35,6 +37,7 @@ public final class MxGatewayClientOptions { connectTimeout = builder.connectTimeout == null ? DEFAULT_CONNECT_TIMEOUT : builder.connectTimeout; callTimeout = builder.callTimeout == null ? 
DEFAULT_CALL_TIMEOUT : builder.callTimeout; streamTimeout = builder.streamTimeout; + shutdownTimeout = builder.shutdownTimeout == null ? DEFAULT_SHUTDOWN_TIMEOUT : builder.shutdownTimeout; maxGrpcMessageBytes = builder.maxGrpcMessageBytes <= 0 ? DEFAULT_MAX_GRPC_MESSAGE_BYTES : builder.maxGrpcMessageBytes; @@ -131,6 +134,18 @@ public final class MxGatewayClientOptions { return streamTimeout; } + /** + * Returns the upper bound on graceful shutdown waiting, applied by + * {@code close()} and {@code closeAndAwaitTermination()}. Independent of + * {@link #connectTimeout()}; a small connect timeout no longer forces an + * aggressive {@code shutdownNow()} on in-flight calls. + * + * @return the shutdown timeout duration + */ + public Duration shutdownTimeout() { + return shutdownTimeout; + } + public int maxGrpcMessageBytes() { return maxGrpcMessageBytes; } @@ -157,6 +172,8 @@ public final class MxGatewayClientOptions { + callTimeout + ", streamTimeout=" + streamTimeout + + ", shutdownTimeout=" + + shutdownTimeout + ", maxGrpcMessageBytes=" + maxGrpcMessageBytes + '}'; @@ -181,6 +198,7 @@ public final class MxGatewayClientOptions { private Duration connectTimeout; private Duration callTimeout; private Duration streamTimeout; + private Duration shutdownTimeout; private int maxGrpcMessageBytes; private Builder() { @@ -277,6 +295,20 @@ public final class MxGatewayClientOptions { return this; } + /** + * Sets the upper bound on graceful shutdown waiting (applied by + * {@code close()} and {@code closeAndAwaitTermination()}). Defaults to + * 10 s and is independent of the connect timeout. 
+ * + * @param value the shutdown timeout, must be non-{@code null} + * @return this builder + * @throws NullPointerException if {@code value} is {@code null} + */ + public Builder shutdownTimeout(Duration value) { + shutdownTimeout = Objects.requireNonNull(value, "shutdownTimeout"); + return this; + } + public Builder maxGrpcMessageBytes(int value) { maxGrpcMessageBytes = value; return this; diff --git a/clients/java/mxgateway-client/src/main/java/com/dohertylan/mxgateway/client/MxGatewaySecrets.java b/clients/java/mxgateway-client/src/main/java/com/dohertylan/mxgateway/client/MxGatewaySecrets.java index 8060603..5840e8a 100644 --- a/clients/java/mxgateway-client/src/main/java/com/dohertylan/mxgateway/client/MxGatewaySecrets.java +++ b/clients/java/mxgateway-client/src/main/java/com/dohertylan/mxgateway/client/MxGatewaySecrets.java @@ -1,5 +1,7 @@ package com.dohertylan.mxgateway.client; +import java.util.regex.Pattern; + /** * Helpers for redacting secrets such as gateway API keys from log output. * @@ -7,6 +9,16 @@ package com.dohertylan.mxgateway.client; * produce shortened, masked forms safe for diagnostic messages. */ public final class MxGatewaySecrets { + // Match any gateway-shaped credential anywhere in the string, regardless of + // surrounding punctuation: quoted, colon/comma-delimited, embedded in URLs + // or parens. The character class admits underscores and also hyphens, in + // case a future key format introduces a hyphenated segment. + private static final Pattern MXGW_TOKEN = Pattern.compile("mxgw_[A-Za-z0-9_-]+"); + // Mask the token after a Bearer marker as a unit so callers cannot + // accidentally leak the secret when the surrounding text is a header-style + // string (e.g. "Bearer mxgw_id_secret"). 
+ private static final Pattern BEARER_TOKEN = Pattern.compile("(?i)bearer\\s+\\S+"); + private MxGatewaySecrets() { } @@ -43,9 +55,15 @@ public final class MxGatewaySecrets { } /** - * Replaces gateway-style credential tokens (the {@code mxgw_} prefix and - * any {@code Bearer} marker) inside a free-form string with a redaction - * placeholder. + * Replaces gateway-style credential tokens inside a free-form string with a + * redaction placeholder. + * + *

Matches any {@code mxgw_<...>} token anywhere in the string, + * irrespective of surrounding punctuation (whitespace, colons, commas, + * single/double quotes, parentheses, embedded URL paths). Also masks the + * argument of an authorization-header style {@code Bearer } marker + * as a unit so the token cannot leak through when the surrounding string + * is a raw header value. * * @param value the string to scrub, may be {@code null} * @return an empty string for {@code null}, the original value when blank, @@ -56,12 +74,8 @@ public final class MxGatewaySecrets { return value == null ? "" : value; } - String[] parts = value.split("\\s+"); - for (int index = 0; index < parts.length; index++) { - if (parts[index].startsWith("mxgw_") || parts[index].equalsIgnoreCase("bearer")) { - parts[index] = ""; - } - } - return String.join(" ", parts); + String scrubbed = MXGW_TOKEN.matcher(value).replaceAll(""); + scrubbed = BEARER_TOKEN.matcher(scrubbed).replaceAll("Bearer "); + return scrubbed; } } diff --git a/clients/java/mxgateway-client/src/test/java/com/dohertylan/mxgateway/client/MxGatewayFixtureTests.java b/clients/java/mxgateway-client/src/test/java/com/dohertylan/mxgateway/client/MxGatewayFixtureTests.java index 8cf37f5..8484f53 100644 --- a/clients/java/mxgateway-client/src/test/java/com/dohertylan/mxgateway/client/MxGatewayFixtureTests.java +++ b/clients/java/mxgateway-client/src/test/java/com/dohertylan/mxgateway/client/MxGatewayFixtureTests.java @@ -106,6 +106,37 @@ final class MxGatewayFixtureTests { assertFalse(authError.getMessage().contains("visible_secret")); } + @Test + void redactCredentialsHandlesNonWhitespaceDelimitedTokens() { + // Client.Java-018 regression: the previous whitespace-split scrub left + // mxgw_ credentials attached to quotes, commas, colons, parens, and + // URL paths intact. The strengthened pattern matches mxgw_<...> + // anywhere in the string regardless of surrounding punctuation. 
+ String singleQuoted = MxGatewaySecrets.redactCredentials("authentication failed: 'mxgw_keyid_secret'"); + String doubleQuoted = MxGatewaySecrets.redactCredentials("Bearer:\"mxgw_keyid_secret\""); + String commaDelimited = MxGatewaySecrets.redactCredentials("token=mxgw_keyid_secret,scope=admin"); + String colonDelimited = MxGatewaySecrets.redactCredentials("Bearer:mxgw_keyid_secret"); + String parenthesised = MxGatewaySecrets.redactCredentials("auth(mxgw_keyid_secret)"); + String urlEmbedded = MxGatewaySecrets.redactCredentials("https://gw/api?key=mxgw_keyid_secret&x=1"); + String bearerHeader = MxGatewaySecrets.redactCredentials("Bearer mxgw_keyid_secret"); + + for (String redacted : new String[] { + singleQuoted, doubleQuoted, commaDelimited, colonDelimited, parenthesised, urlEmbedded, bearerHeader + }) { + assertFalse(redacted.contains("mxgw_keyid_secret"), "expected redaction, got: " + redacted); + assertFalse(redacted.contains("keyid_secret"), "tail leaked: " + redacted); + assertTrue(redacted.contains(""), "expected , got: " + redacted); + } + } + + @Test + void redactCredentialsLeavesBenignContentAlone() { + assertEquals( + "no credentials here", + MxGatewaySecrets.redactCredentials("no credentials here")); + assertEquals("", MxGatewaySecrets.redactCredentials(null)); + } + private static JsonObject readFixture(String relativePath) throws Exception { return JsonParser.parseString(Files.readString(fixtureRoot().resolve(relativePath))).getAsJsonObject(); } diff --git a/clients/java/mxgateway-client/src/test/java/com/dohertylan/mxgateway/client/MxGatewayLowFindingsIITests.java b/clients/java/mxgateway-client/src/test/java/com/dohertylan/mxgateway/client/MxGatewayLowFindingsIITests.java new file mode 100644 index 0000000..776a0b2 --- /dev/null +++ b/clients/java/mxgateway-client/src/test/java/com/dohertylan/mxgateway/client/MxGatewayLowFindingsIITests.java @@ -0,0 +1,182 @@ +package com.dohertylan.mxgateway.client; + +import static 
org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import io.grpc.CallOptions; +import io.grpc.ClientCall; +import io.grpc.ConnectivityState; +import io.grpc.ManagedChannel; +import io.grpc.MethodDescriptor; +import java.time.Duration; +import java.util.concurrent.TimeUnit; +import org.junit.jupiter.api.Test; + +/** + * Regression tests for the second-pass Low-severity Client.Java findings + * Client.Java-016, Client.Java-019, and the shared shutdown helpers extracted + * to {@link MxGatewayChannels}. + */ +final class MxGatewayLowFindingsIITests { + + // --- Client.Java-019: shutdown timeout is independent of connect timeout --- + + @Test + void shutdownAndAwaitTerminationHonoursShutdownTimeoutNotConnectTimeout() throws Exception { + // The historical bug: close() used connectTimeout as the awaitTermination + // deadline, so a small connectTimeout forced a premature shutdownNow() + // on in-flight calls. The fix uses a dedicated shutdownTimeout. This + // test verifies the helper waits up to shutdownTimeout (1s) even when + // connectTimeout is set to a tiny value (50ms). + RecordingChannel channel = new RecordingChannel(/* terminatesAfterMillis = */ 200); + MxGatewayClientOptions options = MxGatewayClientOptions.builder() + .endpoint("in-process") + .plaintext(true) + .connectTimeout(Duration.ofMillis(50)) + .shutdownTimeout(Duration.ofSeconds(1)) + .build(); + + long start = System.nanoTime(); + MxGatewayChannels.shutdownAndAwaitTermination(channel, options); + long elapsedMillis = (System.nanoTime() - start) / 1_000_000L; + + // The channel finished orderly termination within the shutdown timeout + // window, so shutdownNow() must NOT have been called. With the old + // implementation a 50ms connect-timeout-as-shutdown-deadline would + // have escalated to shutdownNow() before the channel's 200ms graceful + // termination completed. 
+ assertTrue(channel.shutdownCalled, "shutdown() must be called"); + assertFalse( + channel.shutdownNowCalled, + "graceful termination finished within shutdownTimeout; shutdownNow() must not have been called"); + // Allow ample slack for build-machine variance but assert we waited at + // least the channel's graceful-termination window. + assertTrue(elapsedMillis >= 150, "should have waited for graceful termination, elapsed=" + elapsedMillis); + } + + @Test + void shutdownEscalatesToShutdownNowWhenTimeoutExceeded() { + // The other half of the contract: a channel that does not terminate + // within the shutdownTimeout window must be forcibly shut down. + RecordingChannel channel = new RecordingChannel(/* terminatesAfterMillis = */ 5_000); + MxGatewayClientOptions options = MxGatewayClientOptions.builder() + .endpoint("in-process") + .plaintext(true) + .shutdownTimeout(Duration.ofMillis(100)) + .build(); + + MxGatewayChannels.shutdown(channel, options); + + assertTrue(channel.shutdownCalled); + assertTrue(channel.shutdownNowCalled, "stuck channel must be forcibly shut down"); + } + + @Test + void shutdownTimeoutDefaultIsTenSecondsIndependentOfConnectTimeout() { + MxGatewayClientOptions defaults = MxGatewayClientOptions.builder() + .endpoint("in-process") + .build(); + // Default is 10s; an unset connectTimeout-of-10s default coincides but + // the two are now independent options. 
+ assertEquals(Duration.ofSeconds(10), defaults.shutdownTimeout()); + + MxGatewayClientOptions tinyConnect = MxGatewayClientOptions.builder() + .endpoint("in-process") + .connectTimeout(Duration.ofMillis(500)) + .build(); + assertEquals(Duration.ofSeconds(10), tinyConnect.shutdownTimeout(), + "shutdownTimeout default is independent of connectTimeout"); + } + + // --- Client.Java-016: shared shutdown helpers behave identically for both clients --- + + @Test + void sharedShutdownHelperIsNoOpForNullChannel() throws Exception { + MxGatewayClientOptions options = MxGatewayClientOptions.builder() + .endpoint("in-process") + .plaintext(true) + .shutdownTimeout(Duration.ofMillis(50)) + .build(); + // Both helpers must tolerate a null owned-channel (caller-managed channel case). + MxGatewayChannels.shutdown(null, options); + MxGatewayChannels.shutdownAndAwaitTermination(null, options); + } + + /** + * Test double for {@link ManagedChannel} that records {@code shutdown}/ + * {@code shutdownNow} invocations and simulates an orderly termination + * after a configurable delay. Avoids the heavy in-process gRPC machinery — + * the shutdown helpers only touch the three lifecycle methods. 
+ */ + private static final class RecordingChannel extends ManagedChannel { + private final long terminatesAfterMillis; + private final long createdAtNanos; + private volatile boolean shutdownCalled; + private volatile boolean shutdownNowCalled; + + RecordingChannel(long terminatesAfterMillis) { + this.terminatesAfterMillis = terminatesAfterMillis; + this.createdAtNanos = System.nanoTime(); + } + + @Override + public ManagedChannel shutdown() { + shutdownCalled = true; + return this; + } + + @Override + public boolean isShutdown() { + return shutdownCalled || shutdownNowCalled; + } + + @Override + public boolean isTerminated() { + if (shutdownNowCalled) { + return true; + } + if (!shutdownCalled) { + return false; + } + long elapsed = (System.nanoTime() - createdAtNanos) / 1_000_000L; + return elapsed >= terminatesAfterMillis; + } + + @Override + public ManagedChannel shutdownNow() { + shutdownNowCalled = true; + return this; + } + + @Override + public boolean awaitTermination(long timeout, TimeUnit unit) throws InterruptedException { + long deadlineNanos = System.nanoTime() + unit.toNanos(timeout); + while (System.nanoTime() < deadlineNanos) { + if (isTerminated()) { + return true; + } + long remaining = Math.max(1, (deadlineNanos - System.nanoTime()) / 1_000_000L); + Thread.sleep(Math.min(remaining, 10)); + } + return isTerminated(); + } + + @Override + public ClientCall newCall( + MethodDescriptor methodDescriptor, CallOptions callOptions) { + throw new UnsupportedOperationException("no RPCs are issued in shutdown tests"); + } + + @Override + public String authority() { + return "in-process"; + } + + @Override + public ConnectivityState getState(boolean requestConnection) { + return ConnectivityState.IDLE; + } + } + +} diff --git a/clients/java/mxgateway-client/src/test/java/com/dohertylan/mxgateway/client/MxGatewayMediumFindingsTests.java b/clients/java/mxgateway-client/src/test/java/com/dohertylan/mxgateway/client/MxGatewayMediumFindingsTests.java index 
272e58e..2690aa0 100644 --- a/clients/java/mxgateway-client/src/test/java/com/dohertylan/mxgateway/client/MxGatewayMediumFindingsTests.java +++ b/clients/java/mxgateway-client/src/test/java/com/dohertylan/mxgateway/client/MxGatewayMediumFindingsTests.java @@ -13,6 +13,7 @@ import io.grpc.inprocess.InProcessServerBuilder; import io.grpc.stub.StreamObserver; import java.time.Duration; import java.util.UUID; +import java.util.concurrent.CompletableFuture; import mxaccess_gateway.v1.MxAccessGatewayGrpc; import mxaccess_gateway.v1.MxaccessGateway.CloseSessionReply; import mxaccess_gateway.v1.MxaccessGateway.CloseSessionRequest; @@ -27,7 +28,7 @@ import org.junit.jupiter.api.Test; /** * Regression tests for the Medium-severity Client.Java code-review findings - * (Client.Java-001 through Client.Java-005). + * (Client.Java-001 through Client.Java-005, and Client.Java-014/015). */ final class MxGatewayMediumFindingsTests { @@ -323,6 +324,138 @@ final class MxGatewayMediumFindingsTests { } } + // --- Client.Java-014: MxEventStream.close() before beforeStart must cancel the call --- + + @Test + void mxEventStreamCloseBeforeBeforeStartCancelsStream() { + // Mirrors GalaxyRepositoryClientTests.deployEventStreamCloseBeforeBeforeStartCancelsStream: + // if close() runs before the gRPC call has attached its ClientCallStreamObserver, + // beforeStart() must observe the prior close and cancel the underlying call so the + // gRPC subscription does not leak open after the consumer has stopped iterating. 
+ MxEventStream stream = new MxEventStream(4); + io.grpc.stub.ClientResponseObserver< + mxaccess_gateway.v1.MxaccessGateway.StreamEventsRequest, + mxaccess_gateway.v1.MxaccessGateway.MxEvent> + observer = stream.observer(); + RecordingEventsRequestStream requestStream = new RecordingEventsRequestStream(); + + stream.close(); + observer.beforeStart(requestStream); + + assertTrue(requestStream.cancelled, "beforeStart must cancel the underlying call after a prior close()"); + assertEquals("client cancelled event stream", requestStream.cancelMessage); + assertFalse(stream.hasNext()); + } + + // --- Client.Java-015: cancelling the user-visible *Async future cancels the gRPC call --- + + @Test + void invokeAsyncCancellationCancelsUnderlyingGrpcCall() throws Exception { + // Set up a gateway service that never completes the invoke call so cancellation is + // the only way the call terminates. Hook ServerCallStreamObserver.setOnCancelHandler + // to latch when the server observes cancellation. + java.util.concurrent.CountDownLatch serverCancelled = new java.util.concurrent.CountDownLatch(1); + TestService service = new TestService() { + @Override + public void invoke(MxCommandRequest request, StreamObserver responseObserver) { + io.grpc.stub.ServerCallStreamObserver serverObserver = + (io.grpc.stub.ServerCallStreamObserver) responseObserver; + serverObserver.setOnCancelHandler(serverCancelled::countDown); + // Intentionally never complete — the call must be terminated by the client + // cancelling its future, which must propagate to the gRPC cancellation. + } + }; + + try (Harness harness = Harness.start(service)) { + CompletableFuture future = harness.client().invokeAsync(MxCommandRequest.newBuilder() + .setSessionId("s-cancel") + .setCommand(mxaccess_gateway.v1.MxaccessGateway.MxCommand.newBuilder() + .setKind(MxCommandKind.MX_COMMAND_KIND_REGISTER)) + .build()); + + // Cancellation of the user-visible future must propagate to the gRPC call. 
+ assertTrue(future.cancel(true), "cancel(true) should return true on a pending future"); + assertTrue( + serverCancelled.await(5, java.util.concurrent.TimeUnit.SECONDS), + "server must observe RPC cancellation after future.cancel(true)"); + } + } + + @Test + void toCompletableValidatorOverloadForwardsCancellationToSource() { + // Unit-level proof: cancel() on the future returned by the validator-aware + // toCompletable overload must call cancel(true) on the source ListenableFuture. + // This is the core fix for Client.Java-015 — the validator runs inside + // toCompletable instead of via .thenApply, so the user holds the future + // that is bound to the source. + com.google.common.util.concurrent.SettableFuture source = + com.google.common.util.concurrent.SettableFuture.create(); + java.util.concurrent.CompletableFuture target = + MxGatewayChannels.toCompletable(source, "noop", String::length); + + assertFalse(source.isCancelled()); + assertTrue(target.cancel(true)); + assertTrue(source.isCancelled(), "source ListenableFuture must be cancelled"); + } + + @Test + void toCompletableNoValidatorOverloadForwardsCancellationToSource() { + // Regression for the no-validator overload (the historic toCompletable shape). 
+ com.google.common.util.concurrent.SettableFuture source = + com.google.common.util.concurrent.SettableFuture.create(); + java.util.concurrent.CompletableFuture target = MxGatewayChannels.toCompletable(source, "noop"); + + assertFalse(source.isCancelled()); + assertTrue(target.cancel(true)); + assertTrue(source.isCancelled(), "source ListenableFuture must be cancelled"); + } + + private static final class RecordingEventsRequestStream + extends io.grpc.stub.ClientCallStreamObserver< + mxaccess_gateway.v1.MxaccessGateway.StreamEventsRequest> { + private boolean cancelled; + private String cancelMessage; + + @Override + public void cancel(String message, Throwable cause) { + cancelled = true; + cancelMessage = message; + } + + @Override + public boolean isReady() { + return true; + } + + @Override + public void setOnReadyHandler(Runnable onReadyHandler) { + } + + @Override + public void request(int count) { + } + + @Override + public void setMessageCompression(boolean enable) { + } + + @Override + public void disableAutoInboundFlowControl() { + } + + @Override + public void onNext(mxaccess_gateway.v1.MxaccessGateway.StreamEventsRequest value) { + } + + @Override + public void onError(Throwable t) { + } + + @Override + public void onCompleted() { + } + } + private static mxaccess_gateway.v1.MxaccessGateway.MxEvent testEvent(int sequence) { return mxaccess_gateway.v1.MxaccessGateway.MxEvent.newBuilder() .setWorkerSequence(sequence) diff --git a/clients/java/src/main/generated/main/java/galaxy_repository/v1/GalaxyRepositoryOuterClass.java b/clients/java/src/main/generated/main/java/galaxy_repository/v1/GalaxyRepositoryOuterClass.java index 104fa3e..aa148ec 100644 --- a/clients/java/src/main/generated/main/java/galaxy_repository/v1/GalaxyRepositoryOuterClass.java +++ b/clients/java/src/main/generated/main/java/galaxy_repository/v1/GalaxyRepositoryOuterClass.java @@ -8976,17 +8976,36 @@ public final class GalaxyRepositoryOuterClass extends com.google.protobuf.Genera 
getFullTagReferenceBytes(); /** + *

+     * Raw Galaxy SQL `dbo.data_type` identifier, passed through unchanged.
+     * This is NOT a member of `mxaccess_gateway.v1.MxDataType` — Galaxy's
+     * type enumeration is distinct from MXAccess's wire data-type enum and
+     * the two must not be cast or compared. The GalaxyRepository service is
+     * metadata-only and deliberately does not share types with
+     * mxaccess_gateway.proto. See docs/GalaxyRepository.md.
+     * 
+ * * int32 mx_data_type = 3; * @return The mxDataType. */ int getMxDataType(); /** + *
+     * Human-readable name from Galaxy's `dbo.data_type` table (e.g. "Float",
+     * "Integer", "Boolean"). Free-form Galaxy text; not a stable enum.
+     * 
+ * * string data_type_name = 4; * @return The dataTypeName. */ java.lang.String getDataTypeName(); /** + *
+     * Human-readable name from Galaxy's `dbo.data_type` table (e.g. "Float",
+     * "Integer", "Boolean"). Free-form Galaxy text; not a stable enum.
+     * 
+ * * string data_type_name = 4; * @return The bytes for dataTypeName. */ @@ -9012,12 +9031,24 @@ public final class GalaxyRepositoryOuterClass extends com.google.protobuf.Genera boolean getArrayDimensionPresent(); /** + *
+     * Raw Galaxy SQL attribute-category identifier, passed through unchanged.
+     * Galaxy-specific; not mapped to any gateway enum. See
+     * docs/GalaxyRepository.md.
+     * 
+ * * int32 mx_attribute_category = 8; * @return The mxAttributeCategory. */ int getMxAttributeCategory(); /** + *
+     * Raw Galaxy SQL security-classification identifier, passed through
+     * unchanged. Galaxy-specific; not mapped to any gateway enum. See
+     * docs/GalaxyRepository.md.
+     * 
+ * * int32 security_classification = 9; * @return The securityClassification. */ @@ -9156,6 +9187,15 @@ public final class GalaxyRepositoryOuterClass extends com.google.protobuf.Genera public static final int MX_DATA_TYPE_FIELD_NUMBER = 3; private int mxDataType_ = 0; /** + *
+     * Raw Galaxy SQL `dbo.data_type` identifier, passed through unchanged.
+     * This is NOT a member of `mxaccess_gateway.v1.MxDataType` — Galaxy's
+     * type enumeration is distinct from MXAccess's wire data-type enum and
+     * the two must not be cast or compared. The GalaxyRepository service is
+     * metadata-only and deliberately does not share types with
+     * mxaccess_gateway.proto. See docs/GalaxyRepository.md.
+     * 
+ * * int32 mx_data_type = 3; * @return The mxDataType. */ @@ -9168,6 +9208,11 @@ public final class GalaxyRepositoryOuterClass extends com.google.protobuf.Genera @SuppressWarnings("serial") private volatile java.lang.Object dataTypeName_ = ""; /** + *
+     * Human-readable name from Galaxy's `dbo.data_type` table (e.g. "Float",
+     * "Integer", "Boolean"). Free-form Galaxy text; not a stable enum.
+     * 
+ * * string data_type_name = 4; * @return The dataTypeName. */ @@ -9185,6 +9230,11 @@ public final class GalaxyRepositoryOuterClass extends com.google.protobuf.Genera } } /** + *
+     * Human-readable name from Galaxy's `dbo.data_type` table (e.g. "Float",
+     * "Integer", "Boolean"). Free-form Galaxy text; not a stable enum.
+     * 
+ * * string data_type_name = 4; * @return The bytes for dataTypeName. */ @@ -9239,6 +9289,12 @@ public final class GalaxyRepositoryOuterClass extends com.google.protobuf.Genera public static final int MX_ATTRIBUTE_CATEGORY_FIELD_NUMBER = 8; private int mxAttributeCategory_ = 0; /** + *
+     * Raw Galaxy SQL attribute-category identifier, passed through unchanged.
+     * Galaxy-specific; not mapped to any gateway enum. See
+     * docs/GalaxyRepository.md.
+     * 
+ * * int32 mx_attribute_category = 8; * @return The mxAttributeCategory. */ @@ -9250,6 +9306,12 @@ public final class GalaxyRepositoryOuterClass extends com.google.protobuf.Genera public static final int SECURITY_CLASSIFICATION_FIELD_NUMBER = 9; private int securityClassification_ = 0; /** + *
+     * Raw Galaxy SQL security-classification identifier, passed through
+     * unchanged. Galaxy-specific; not mapped to any gateway enum. See
+     * docs/GalaxyRepository.md.
+     * 
+ * * int32 security_classification = 9; * @return The securityClassification. */ @@ -9956,6 +10018,15 @@ public final class GalaxyRepositoryOuterClass extends com.google.protobuf.Genera private int mxDataType_ ; /** + *
+       * Raw Galaxy SQL `dbo.data_type` identifier, passed through unchanged.
+       * This is NOT a member of `mxaccess_gateway.v1.MxDataType` — Galaxy's
+       * type enumeration is distinct from MXAccess's wire data-type enum and
+       * the two must not be cast or compared. The GalaxyRepository service is
+       * metadata-only and deliberately does not share types with
+       * mxaccess_gateway.proto. See docs/GalaxyRepository.md.
+       * 
+ * * int32 mx_data_type = 3; * @return The mxDataType. */ @@ -9964,6 +10035,15 @@ public final class GalaxyRepositoryOuterClass extends com.google.protobuf.Genera return mxDataType_; } /** + *
+       * Raw Galaxy SQL `dbo.data_type` identifier, passed through unchanged.
+       * This is NOT a member of `mxaccess_gateway.v1.MxDataType` — Galaxy's
+       * type enumeration is distinct from MXAccess's wire data-type enum and
+       * the two must not be cast or compared. The GalaxyRepository service is
+       * metadata-only and deliberately does not share types with
+       * mxaccess_gateway.proto. See docs/GalaxyRepository.md.
+       * 
+ * * int32 mx_data_type = 3; * @param value The mxDataType to set. * @return This builder for chaining. @@ -9976,6 +10056,15 @@ public final class GalaxyRepositoryOuterClass extends com.google.protobuf.Genera return this; } /** + *
+       * Raw Galaxy SQL `dbo.data_type` identifier, passed through unchanged.
+       * This is NOT a member of `mxaccess_gateway.v1.MxDataType` — Galaxy's
+       * type enumeration is distinct from MXAccess's wire data-type enum and
+       * the two must not be cast or compared. The GalaxyRepository service is
+       * metadata-only and deliberately does not share types with
+       * mxaccess_gateway.proto. See docs/GalaxyRepository.md.
+       * 
+ * * int32 mx_data_type = 3; * @return This builder for chaining. */ @@ -9988,6 +10077,11 @@ public final class GalaxyRepositoryOuterClass extends com.google.protobuf.Genera private java.lang.Object dataTypeName_ = ""; /** + *
+       * Human-readable name from Galaxy's `dbo.data_type` table (e.g. "Float",
+       * "Integer", "Boolean"). Free-form Galaxy text; not a stable enum.
+       * 
+ * * string data_type_name = 4; * @return The dataTypeName. */ @@ -10004,6 +10098,11 @@ public final class GalaxyRepositoryOuterClass extends com.google.protobuf.Genera } } /** + *
+       * Human-readable name from Galaxy's `dbo.data_type` table (e.g. "Float",
+       * "Integer", "Boolean"). Free-form Galaxy text; not a stable enum.
+       * 
+ * * string data_type_name = 4; * @return The bytes for dataTypeName. */ @@ -10021,6 +10120,11 @@ public final class GalaxyRepositoryOuterClass extends com.google.protobuf.Genera } } /** + *
+       * Human-readable name from Galaxy's `dbo.data_type` table (e.g. "Float",
+       * "Integer", "Boolean"). Free-form Galaxy text; not a stable enum.
+       * 
+ * * string data_type_name = 4; * @param value The dataTypeName to set. * @return This builder for chaining. @@ -10034,6 +10138,11 @@ public final class GalaxyRepositoryOuterClass extends com.google.protobuf.Genera return this; } /** + *
+       * Human-readable name from Galaxy's `dbo.data_type` table (e.g. "Float",
+       * "Integer", "Boolean"). Free-form Galaxy text; not a stable enum.
+       * 
+ * * string data_type_name = 4; * @return This builder for chaining. */ @@ -10044,6 +10153,11 @@ public final class GalaxyRepositoryOuterClass extends com.google.protobuf.Genera return this; } /** + *
+       * Human-readable name from Galaxy's `dbo.data_type` table (e.g. "Float",
+       * "Integer", "Boolean"). Free-form Galaxy text; not a stable enum.
+       * 
+ * * string data_type_name = 4; * @param value The bytes for dataTypeName to set. * @return This builder for chaining. @@ -10156,6 +10270,12 @@ public final class GalaxyRepositoryOuterClass extends com.google.protobuf.Genera private int mxAttributeCategory_ ; /** + *
+       * Raw Galaxy SQL attribute-category identifier, passed through unchanged.
+       * Galaxy-specific; not mapped to any gateway enum. See
+       * docs/GalaxyRepository.md.
+       * 
+ * * int32 mx_attribute_category = 8; * @return The mxAttributeCategory. */ @@ -10164,6 +10284,12 @@ public final class GalaxyRepositoryOuterClass extends com.google.protobuf.Genera return mxAttributeCategory_; } /** + *
+       * Raw Galaxy SQL attribute-category identifier, passed through unchanged.
+       * Galaxy-specific; not mapped to any gateway enum. See
+       * docs/GalaxyRepository.md.
+       * 
+ * * int32 mx_attribute_category = 8; * @param value The mxAttributeCategory to set. * @return This builder for chaining. @@ -10176,6 +10302,12 @@ public final class GalaxyRepositoryOuterClass extends com.google.protobuf.Genera return this; } /** + *
+       * Raw Galaxy SQL attribute-category identifier, passed through unchanged.
+       * Galaxy-specific; not mapped to any gateway enum. See
+       * docs/GalaxyRepository.md.
+       * 
+ * * int32 mx_attribute_category = 8; * @return This builder for chaining. */ @@ -10188,6 +10320,12 @@ public final class GalaxyRepositoryOuterClass extends com.google.protobuf.Genera private int securityClassification_ ; /** + *
+       * Raw Galaxy SQL security-classification identifier, passed through
+       * unchanged. Galaxy-specific; not mapped to any gateway enum. See
+       * docs/GalaxyRepository.md.
+       * 
+ * * int32 security_classification = 9; * @return The securityClassification. */ @@ -10196,6 +10334,12 @@ public final class GalaxyRepositoryOuterClass extends com.google.protobuf.Genera return securityClassification_; } /** + *
+       * Raw Galaxy SQL security-classification identifier, passed through
+       * unchanged. Galaxy-specific; not mapped to any gateway enum. See
+       * docs/GalaxyRepository.md.
+       * 
+ * * int32 security_classification = 9; * @param value The securityClassification to set. * @return This builder for chaining. @@ -10208,6 +10352,12 @@ public final class GalaxyRepositoryOuterClass extends com.google.protobuf.Genera return this; } /** + *
+       * Raw Galaxy SQL security-classification identifier, passed through
+       * unchanged. Galaxy-specific; not mapped to any gateway enum. See
+       * docs/GalaxyRepository.md.
+       * 
+ * * int32 security_classification = 9; * @return This builder for chaining. */ diff --git a/clients/java/src/main/generated/main/java/mxaccess_gateway/v1/MxaccessGateway.java b/clients/java/src/main/generated/main/java/mxaccess_gateway/v1/MxaccessGateway.java index a62f3e8..9560f31 100644 --- a/clients/java/src/main/generated/main/java/mxaccess_gateway/v1/MxaccessGateway.java +++ b/clients/java/src/main/generated/main/java/mxaccess_gateway/v1/MxaccessGateway.java @@ -40706,16 +40706,31 @@ public final class MxaccessGateway extends com.google.protobuf.GeneratedFile { int getVerifierUserId(); /** + *
+     * Credential-sensitive write value. Implementations must not log this field
+     * unless an explicit redacted value-logging path is enabled.
+     * 
+ * * .mxaccess_gateway.v1.MxValue value = 4; * @return Whether the value field is set. */ boolean hasValue(); /** + *
+     * Credential-sensitive write value. Implementations must not log this field
+     * unless an explicit redacted value-logging path is enabled.
+     * 
+ * * .mxaccess_gateway.v1.MxValue value = 4; * @return The value. */ mxaccess_gateway.v1.MxaccessGateway.MxValue getValue(); /** + *
+     * Credential-sensitive write value. Implementations must not log this field
+     * unless an explicit redacted value-logging path is enabled.
+     * 
+ * * .mxaccess_gateway.v1.MxValue value = 4; */ mxaccess_gateway.v1.MxaccessGateway.MxValueOrBuilder getValueOrBuilder(); @@ -40794,6 +40809,11 @@ public final class MxaccessGateway extends com.google.protobuf.GeneratedFile { public static final int VALUE_FIELD_NUMBER = 4; private mxaccess_gateway.v1.MxaccessGateway.MxValue value_; /** + *
+     * Credential-sensitive write value. Implementations must not log this field
+     * unless an explicit redacted value-logging path is enabled.
+     * 
+ * * .mxaccess_gateway.v1.MxValue value = 4; * @return Whether the value field is set. */ @@ -40802,6 +40822,11 @@ public final class MxaccessGateway extends com.google.protobuf.GeneratedFile { return ((bitField0_ & 0x00000001) != 0); } /** + *
+     * Credential-sensitive write value. Implementations must not log this field
+     * unless an explicit redacted value-logging path is enabled.
+     * 
+ * * .mxaccess_gateway.v1.MxValue value = 4; * @return The value. */ @@ -40810,6 +40835,11 @@ public final class MxaccessGateway extends com.google.protobuf.GeneratedFile { return value_ == null ? mxaccess_gateway.v1.MxaccessGateway.MxValue.getDefaultInstance() : value_; } /** + *
+     * Credential-sensitive write value. Implementations must not log this field
+     * unless an explicit redacted value-logging path is enabled.
+     * 
+ * * .mxaccess_gateway.v1.MxValue value = 4; */ @java.lang.Override @@ -41301,6 +41331,11 @@ public final class MxaccessGateway extends com.google.protobuf.GeneratedFile { private com.google.protobuf.SingleFieldBuilder< mxaccess_gateway.v1.MxaccessGateway.MxValue, mxaccess_gateway.v1.MxaccessGateway.MxValue.Builder, mxaccess_gateway.v1.MxaccessGateway.MxValueOrBuilder> valueBuilder_; /** + *
+       * Credential-sensitive write value. Implementations must not log this field
+       * unless an explicit redacted value-logging path is enabled.
+       * 
+ * * .mxaccess_gateway.v1.MxValue value = 4; * @return Whether the value field is set. */ @@ -41308,6 +41343,11 @@ public final class MxaccessGateway extends com.google.protobuf.GeneratedFile { return ((bitField0_ & 0x00000008) != 0); } /** + *
+       * Credential-sensitive write value. Implementations must not log this field
+       * unless an explicit redacted value-logging path is enabled.
+       * 
+ * * .mxaccess_gateway.v1.MxValue value = 4; * @return The value. */ @@ -41319,6 +41359,11 @@ public final class MxaccessGateway extends com.google.protobuf.GeneratedFile { } } /** + *
+       * Credential-sensitive write value. Implementations must not log this field
+       * unless an explicit redacted value-logging path is enabled.
+       * 
+ * * .mxaccess_gateway.v1.MxValue value = 4; */ public Builder setValue(mxaccess_gateway.v1.MxaccessGateway.MxValue value) { @@ -41335,6 +41380,11 @@ public final class MxaccessGateway extends com.google.protobuf.GeneratedFile { return this; } /** + *
+       * Credential-sensitive write value. Implementations must not log this field
+       * unless an explicit redacted value-logging path is enabled.
+       * 
+ * * .mxaccess_gateway.v1.MxValue value = 4; */ public Builder setValue( @@ -41349,6 +41399,11 @@ public final class MxaccessGateway extends com.google.protobuf.GeneratedFile { return this; } /** + *
+       * Credential-sensitive write value. Implementations must not log this field
+       * unless an explicit redacted value-logging path is enabled.
+       * 
+ * * .mxaccess_gateway.v1.MxValue value = 4; */ public Builder mergeValue(mxaccess_gateway.v1.MxaccessGateway.MxValue value) { @@ -41370,6 +41425,11 @@ public final class MxaccessGateway extends com.google.protobuf.GeneratedFile { return this; } /** + *
+       * Credential-sensitive write value. Implementations must not log this field
+       * unless an explicit redacted value-logging path is enabled.
+       * 
+ * * .mxaccess_gateway.v1.MxValue value = 4; */ public Builder clearValue() { @@ -41383,6 +41443,11 @@ public final class MxaccessGateway extends com.google.protobuf.GeneratedFile { return this; } /** + *
+       * Credential-sensitive write value. Implementations must not log this field
+       * unless an explicit redacted value-logging path is enabled.
+       * 
+ * * .mxaccess_gateway.v1.MxValue value = 4; */ public mxaccess_gateway.v1.MxaccessGateway.MxValue.Builder getValueBuilder() { @@ -41391,6 +41456,11 @@ public final class MxaccessGateway extends com.google.protobuf.GeneratedFile { return internalGetValueFieldBuilder().getBuilder(); } /** + *
+       * Credential-sensitive write value. Implementations must not log this field
+       * unless an explicit redacted value-logging path is enabled.
+       * 
+ * * .mxaccess_gateway.v1.MxValue value = 4; */ public mxaccess_gateway.v1.MxaccessGateway.MxValueOrBuilder getValueOrBuilder() { @@ -41402,6 +41472,11 @@ public final class MxaccessGateway extends com.google.protobuf.GeneratedFile { } } /** + *
+       * Credential-sensitive write value. Implementations must not log this field
+       * unless an explicit redacted value-logging path is enabled.
+       * 
+ * * .mxaccess_gateway.v1.MxValue value = 4; */ private com.google.protobuf.SingleFieldBuilder< @@ -42314,16 +42389,31 @@ public final class MxaccessGateway extends com.google.protobuf.GeneratedFile { int getVerifierUserId(); /** + *
+     * Credential-sensitive write value. Implementations must not log this field
+     * unless an explicit redacted value-logging path is enabled.
+     * 
+ * * .mxaccess_gateway.v1.MxValue value = 4; * @return Whether the value field is set. */ boolean hasValue(); /** + *
+     * Credential-sensitive write value. Implementations must not log this field
+     * unless an explicit redacted value-logging path is enabled.
+     * 
+ * * .mxaccess_gateway.v1.MxValue value = 4; * @return The value. */ mxaccess_gateway.v1.MxaccessGateway.MxValue getValue(); /** + *
+     * Credential-sensitive write value. Implementations must not log this field
+     * unless an explicit redacted value-logging path is enabled.
+     * 
+ * * .mxaccess_gateway.v1.MxValue value = 4; */ mxaccess_gateway.v1.MxaccessGateway.MxValueOrBuilder getValueOrBuilder(); @@ -42417,6 +42507,11 @@ public final class MxaccessGateway extends com.google.protobuf.GeneratedFile { public static final int VALUE_FIELD_NUMBER = 4; private mxaccess_gateway.v1.MxaccessGateway.MxValue value_; /** + *
+     * Credential-sensitive write value. Implementations must not log this field
+     * unless an explicit redacted value-logging path is enabled.
+     * 
+ * * .mxaccess_gateway.v1.MxValue value = 4; * @return Whether the value field is set. */ @@ -42425,6 +42520,11 @@ public final class MxaccessGateway extends com.google.protobuf.GeneratedFile { return ((bitField0_ & 0x00000001) != 0); } /** + *
+     * Credential-sensitive write value. Implementations must not log this field
+     * unless an explicit redacted value-logging path is enabled.
+     * 
+ * * .mxaccess_gateway.v1.MxValue value = 4; * @return The value. */ @@ -42433,6 +42533,11 @@ public final class MxaccessGateway extends com.google.protobuf.GeneratedFile { return value_ == null ? mxaccess_gateway.v1.MxaccessGateway.MxValue.getDefaultInstance() : value_; } /** + *
+     * Credential-sensitive write value. Implementations must not log this field
+     * unless an explicit redacted value-logging path is enabled.
+     * 
+ * * .mxaccess_gateway.v1.MxValue value = 4; */ @java.lang.Override @@ -42988,6 +43093,11 @@ public final class MxaccessGateway extends com.google.protobuf.GeneratedFile { private com.google.protobuf.SingleFieldBuilder< mxaccess_gateway.v1.MxaccessGateway.MxValue, mxaccess_gateway.v1.MxaccessGateway.MxValue.Builder, mxaccess_gateway.v1.MxaccessGateway.MxValueOrBuilder> valueBuilder_; /** + *
+       * Credential-sensitive write value. Implementations must not log this field
+       * unless an explicit redacted value-logging path is enabled.
+       * 
+ * * .mxaccess_gateway.v1.MxValue value = 4; * @return Whether the value field is set. */ @@ -42995,6 +43105,11 @@ public final class MxaccessGateway extends com.google.protobuf.GeneratedFile { return ((bitField0_ & 0x00000008) != 0); } /** + *
+       * Credential-sensitive write value. Implementations must not log this field
+       * unless an explicit redacted value-logging path is enabled.
+       * 
+ * * .mxaccess_gateway.v1.MxValue value = 4; * @return The value. */ @@ -43006,6 +43121,11 @@ public final class MxaccessGateway extends com.google.protobuf.GeneratedFile { } } /** + *
+       * Credential-sensitive write value. Implementations must not log this field
+       * unless an explicit redacted value-logging path is enabled.
+       * 
+ * * .mxaccess_gateway.v1.MxValue value = 4; */ public Builder setValue(mxaccess_gateway.v1.MxaccessGateway.MxValue value) { @@ -43022,6 +43142,11 @@ public final class MxaccessGateway extends com.google.protobuf.GeneratedFile { return this; } /** + *
+       * Credential-sensitive write value. Implementations must not log this field
+       * unless an explicit redacted value-logging path is enabled.
+       * 
+ * * .mxaccess_gateway.v1.MxValue value = 4; */ public Builder setValue( @@ -43036,6 +43161,11 @@ public final class MxaccessGateway extends com.google.protobuf.GeneratedFile { return this; } /** + *
+       * Credential-sensitive write value. Implementations must not log this field
+       * unless an explicit redacted value-logging path is enabled.
+       * 
+ * * .mxaccess_gateway.v1.MxValue value = 4; */ public Builder mergeValue(mxaccess_gateway.v1.MxaccessGateway.MxValue value) { @@ -43057,6 +43187,11 @@ public final class MxaccessGateway extends com.google.protobuf.GeneratedFile { return this; } /** + *
+       * Credential-sensitive write value. Implementations must not log this field
+       * unless an explicit redacted value-logging path is enabled.
+       * 
+ * * .mxaccess_gateway.v1.MxValue value = 4; */ public Builder clearValue() { @@ -43070,6 +43205,11 @@ public final class MxaccessGateway extends com.google.protobuf.GeneratedFile { return this; } /** + *
+       * Credential-sensitive write value. Implementations must not log this field
+       * unless an explicit redacted value-logging path is enabled.
+       * 
+ * * .mxaccess_gateway.v1.MxValue value = 4; */ public mxaccess_gateway.v1.MxaccessGateway.MxValue.Builder getValueBuilder() { @@ -43078,6 +43218,11 @@ public final class MxaccessGateway extends com.google.protobuf.GeneratedFile { return internalGetValueFieldBuilder().getBuilder(); } /** + *
+       * Credential-sensitive write value. Implementations must not log this field
+       * unless an explicit redacted value-logging path is enabled.
+       * 
+ * * .mxaccess_gateway.v1.MxValue value = 4; */ public mxaccess_gateway.v1.MxaccessGateway.MxValueOrBuilder getValueOrBuilder() { @@ -43089,6 +43234,11 @@ public final class MxaccessGateway extends com.google.protobuf.GeneratedFile { } } /** + *
+       * Credential-sensitive write value. Implementations must not log this field
+       * unless an explicit redacted value-logging path is enabled.
+       * 
+ * * .mxaccess_gateway.v1.MxValue value = 4; */ private com.google.protobuf.SingleFieldBuilder< @@ -43322,6 +43472,7 @@ public final class MxaccessGateway extends com.google.protobuf.GeneratedFile { *
    * Bulk Read — snapshot the current value for each requested tag. MXAccess COM
    * has no synchronous Read; the worker implements ReadBulk as:
+   *
    * - If the tag is already in the session's item registry AND that item is
    * currently advised AND the worker has a cached OnDataChange for it, the
    * reply returns the cached value WITHOUT modifying the existing
@@ -43330,6 +43481,7 @@ public final class MxaccessGateway extends com.google.protobuf.GeneratedFile {
    * Advise, wait up to `timeout_ms` for the first OnDataChange, then
    * UnAdvise + RemoveItem before returning. The session is left exactly
    * as it was before the call (was_cached = false).
+   *
    * `timeout_ms == 0` uses the gateway-configured default (1000 ms).
    * 
* @@ -43619,6 +43771,7 @@ public final class MxaccessGateway extends com.google.protobuf.GeneratedFile { *
      * Bulk Read — snapshot the current value for each requested tag. MXAccess COM
      * has no synchronous Read; the worker implements ReadBulk as:
+     *
      * - If the tag is already in the session's item registry AND that item is
      * currently advised AND the worker has a cached OnDataChange for it, the
      * reply returns the cached value WITHOUT modifying the existing
@@ -43627,6 +43780,7 @@ public final class MxaccessGateway extends com.google.protobuf.GeneratedFile {
      * Advise, wait up to `timeout_ms` for the first OnDataChange, then
      * UnAdvise + RemoveItem before returning. The session is left exactly
      * as it was before the call (was_cached = false).
+     *
      * `timeout_ms == 0` uses the gateway-configured default (1000 ms).
      * 
* diff --git a/clients/python/README.md b/clients/python/README.md index 1953bc4..f357ab5 100644 --- a/clients/python/README.md +++ b/clients/python/README.md @@ -226,6 +226,12 @@ The client supports plaintext channels for local development, TLS with system roots, TLS with a custom `ca_file`, and an optional test server name override. API keys are redacted from option repr output and CLI error output. +The CLI defaults to TLS. Pass `--plaintext` explicitly to open an unencrypted +channel — there is no implicit localhost downgrade. `--tls` is accepted but +redundant with the default, and cannot be combined with `--plaintext`. Scripts +that previously relied on a `localhost:` / `127.0.0.1:` endpoint silently +selecting plaintext must now pass `--plaintext` explicitly. + ## CLI The CLI emits deterministic JSON for automation: diff --git a/clients/python/pyproject.toml b/clients/python/pyproject.toml index ebb35e1..224a8ac 100644 --- a/clients/python/pyproject.toml +++ b/clients/python/pyproject.toml @@ -8,6 +8,31 @@ version = "0.1.0" description = "Async Python client for MXAccess Gateway." 
readme = "README.md" requires-python = ">=3.12" +license = "Proprietary" +authors = [ + { name = "MXAccess Gateway Authors" }, +] +keywords = [ + "mxaccess", + "archestra", + "gateway", + "grpc", + "industrial", + "scada", +] +classifiers = [ + "Development Status :: 4 - Beta", + "Intended Audience :: Developers", + "Intended Audience :: Information Technology", + "Operating System :: Microsoft :: Windows", + "Operating System :: POSIX", + "Programming Language :: Python", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.12", + "Topic :: Software Development :: Libraries :: Python Modules", + "Topic :: System :: Distributed Computing", + "Typing :: Typed", +] dependencies = [ "click>=8.3,<9", "grpcio>=1.80,<2", @@ -21,12 +46,20 @@ dev = [ "pytest-asyncio>=1.3,<2", ] +[project.urls] +Homepage = "https://gitea.dohertylan.com/dohertj2/mxaccessgw" +Source = "https://gitea.dohertylan.com/dohertj2/mxaccessgw" +Issues = "https://gitea.dohertylan.com/dohertj2/mxaccessgw/issues" + [project.scripts] mxgw-py = "mxgateway_cli.commands:main" [tool.setuptools.packages.find] where = ["src"] +[tool.setuptools.package-data] +mxgateway = ["py.typed"] + [tool.pytest.ini_options] addopts = "-ra" pythonpath = ["src"] diff --git a/clients/python/src/mxgateway/py.typed b/clients/python/src/mxgateway/py.typed new file mode 100644 index 0000000..e69de29 diff --git a/clients/python/src/mxgateway_cli/commands.py b/clients/python/src/mxgateway_cli/commands.py index 124f74a..b673461 100644 --- a/clients/python/src/mxgateway_cli/commands.py +++ b/clients/python/src/mxgateway_cli/commands.py @@ -19,8 +19,7 @@ from mxgateway.errors import MxGatewayError from mxgateway.generated import mxaccess_gateway_pb2 as pb from mxgateway.options import ClientOptions from mxgateway.session import Session -from mxgateway.values import to_mx_value -from mxgateway.values import MxValueInput +from mxgateway.values import MxValueInput, to_mx_value MAX_AGGREGATE_EVENTS = 10_000 @@ 
-52,8 +51,25 @@ def gateway_options(command: Callable[..., Any]) -> Callable[..., Any]: default=None, help="Environment variable containing the gateway API key.", )(command) - command = click.option("--plaintext", is_flag=True, help="Use plaintext gRPC.")(command) - command = click.option("--tls", "use_tls", is_flag=True, help="Use TLS gRPC.")(command) + command = click.option( + "--plaintext", + is_flag=True, + help=( + "Use a plaintext gRPC channel. TLS is the default; pass --plaintext " + "explicitly to opt in to an unencrypted channel (no implicit " + "localhost downgrade)." + ), + )(command) + command = click.option( + "--tls", + "use_tls", + is_flag=True, + help=( + "Use a TLS gRPC channel. Redundant with the default; retained for " + "symmetry with other client CLIs. Cannot be combined with " + "--plaintext." + ), + )(command) command = click.option("--ca-file", default=None, help="Custom root certificate file.")(command) command = click.option( "--server-name-override", @@ -755,11 +771,23 @@ def _session(client: GatewayClient, session_id: str) -> Session: def _use_plaintext(kwargs: dict[str, Any]) -> bool: - if kwargs.get("use_tls"): - return False - if kwargs.get("plaintext"): - return True - return kwargs["endpoint"].startswith("localhost:") or kwargs["endpoint"].startswith("127.0.0.1:") + """Resolve whether to open a plaintext gRPC channel. + + The contract matches the Go and Java CLIs (and is stricter than the + previous behaviour): TLS is the default, and the user must pass + ``--plaintext`` to opt in to an unencrypted channel. There is no implicit + localhost downgrade -- silently transmitting a bearer token in cleartext + just because the endpoint starts with ``localhost:`` or ``127.0.0.1:`` was + the security regression Client.Python-013 closed. ``--tls`` is accepted as + a redundant, explicit affirmation of the default and must not be combined + with ``--plaintext``. 
+ """ + + plaintext = bool(kwargs.get("plaintext")) + use_tls = bool(kwargs.get("use_tls")) + if plaintext and use_tls: + raise click.UsageError("--plaintext and --tls are mutually exclusive.") + return plaintext def _api_key_from_env(name: str | None) -> str | None: diff --git a/clients/python/tests/test_cli.py b/clients/python/tests/test_cli.py index a2ff19e..5f7a934 100644 --- a/clients/python/tests/test_cli.py +++ b/clients/python/tests/test_cli.py @@ -2,10 +2,12 @@ import json +import click +import pytest from click.testing import CliRunner from mxgateway import __version__ -from mxgateway_cli.commands import main +from mxgateway_cli.commands import _use_plaintext, main def test_version_json_is_deterministic() -> None: @@ -66,3 +68,151 @@ def test_cli_error_output_redacts_api_key() -> None: assert result.exit_code != 0 assert "mxgw_test_secret" not in result.output + + +# Regression tests for Client.Python-013: ``_use_plaintext`` must not silently +# downgrade ``localhost:`` / ``127.0.0.1:`` endpoints to plaintext. TLS is the +# default; users must pass ``--plaintext`` to opt in. 
+ + +def test_use_plaintext_requires_explicit_flag_for_localhost_endpoint() -> None: + """A ``localhost:`` endpoint with no flags must resolve to TLS.""" + + assert ( + _use_plaintext( + {"endpoint": "localhost:5000", "plaintext": False, "use_tls": False} + ) + is False + ) + + +def test_use_plaintext_requires_explicit_flag_for_loopback_ip_endpoint() -> None: + """A ``127.0.0.1:`` endpoint with no flags must resolve to TLS.""" + + assert ( + _use_plaintext( + {"endpoint": "127.0.0.1:5000", "plaintext": False, "use_tls": False} + ) + is False + ) + + +def test_use_plaintext_explicit_plaintext_flag_opts_in() -> None: + """``--plaintext`` must select plaintext regardless of endpoint host.""" + + assert ( + _use_plaintext( + {"endpoint": "localhost:5000", "plaintext": True, "use_tls": False} + ) + is True + ) + assert ( + _use_plaintext( + { + "endpoint": "mxgateway.example.local:5001", + "plaintext": True, + "use_tls": False, + } + ) + is True + ) + + +def test_use_plaintext_explicit_tls_flag_is_accepted_and_idempotent() -> None: + """``--tls`` is accepted as a redundant affirmation of the default.""" + + assert ( + _use_plaintext( + { + "endpoint": "mxgateway.example.local:5001", + "plaintext": False, + "use_tls": True, + } + ) + is False + ) + # Even for a localhost endpoint, ``--tls`` (the default) must yield TLS. 
+ assert ( + _use_plaintext( + {"endpoint": "localhost:5000", "plaintext": False, "use_tls": True} + ) + is False + ) + + +def test_use_plaintext_rejects_conflicting_flags() -> None: + """``--plaintext`` combined with ``--tls`` is a usage error.""" + + with pytest.raises(click.UsageError): + _use_plaintext( + {"endpoint": "localhost:5000", "plaintext": True, "use_tls": True} + ) + + +def test_cli_localhost_endpoint_defaults_to_tls_via_open_session( + monkeypatch: pytest.MonkeyPatch, +) -> None: + """End-to-end: ``open-session`` against ``localhost:`` with no flags + must build a TLS ``ClientOptions`` (plaintext=False).""" + + captured: dict[str, object] = {} + + async def _fake_connect(options): # type: ignore[no-untyped-def] + captured["plaintext"] = options.plaintext + raise RuntimeError("stop-before-network") + + monkeypatch.setattr( + "mxgateway_cli.commands.GatewayClient.connect", _fake_connect + ) + + runner = CliRunner() + result = runner.invoke( + main, + [ + "open-session", + "--endpoint", + "localhost:5000", + "--api-key", + "mxgw_test_secret", + "--json", + ], + ) + + assert result.exit_code != 0 # connect was stubbed to raise + assert captured.get("plaintext") is False, ( + "localhost endpoint must default to TLS without an explicit --plaintext " + "flag (Client.Python-013 regression)." 
+ ) + + +def test_cli_localhost_endpoint_with_plaintext_flag_uses_plaintext( + monkeypatch: pytest.MonkeyPatch, +) -> None: + """End-to-end: ``--plaintext`` opts in to plaintext as expected.""" + + captured: dict[str, object] = {} + + async def _fake_connect(options): # type: ignore[no-untyped-def] + captured["plaintext"] = options.plaintext + raise RuntimeError("stop-before-network") + + monkeypatch.setattr( + "mxgateway_cli.commands.GatewayClient.connect", _fake_connect + ) + + runner = CliRunner() + result = runner.invoke( + main, + [ + "open-session", + "--endpoint", + "localhost:5000", + "--api-key", + "mxgw_test_secret", + "--plaintext", + "--json", + ], + ) + + assert result.exit_code != 0 + assert captured.get("plaintext") is True diff --git a/clients/python/tests/test_cli_bench_and_helpers.py b/clients/python/tests/test_cli_bench_and_helpers.py new file mode 100644 index 0000000..e35d2c5 --- /dev/null +++ b/clients/python/tests/test_cli_bench_and_helpers.py @@ -0,0 +1,454 @@ +"""Regression tests for Client.Python-015 and Client.Python-016. + +Client.Python-015 — coverage for the ``bench-read-bulk`` CLI body and the +``_percentile`` / ``_percentile_summary`` helpers. The percentile algorithm +must remain byte-for-byte equivalent across the five client languages +(.NET, Go, Rust, Java, Python) so cross-language stats are directly +comparable; the unit tests here lock that contract down with known inputs. + +Client.Python-016 — coverage for the two remaining untested CLI helpers +after Client.Python-013 removed the localhost auto-plaintext branch from +``_use_plaintext``: the ``MAX_AGGREGATE_EVENTS`` guard inside +``_collect_events`` and the ``_api_key_from_env`` env-var helper. 
+""" + +from __future__ import annotations + +import json +from typing import Any + +import pytest +from click.testing import CliRunner + +from mxgateway import ClientOptions, GatewayClient +from mxgateway.generated import mxaccess_gateway_pb2 as pb +from mxgateway_cli import commands +from mxgateway_cli.commands import ( + MAX_AGGREGATE_EVENTS, + _api_key_from_env, + _percentile, + _percentile_summary, +) + + +# --- Client.Python-015: _percentile / _percentile_summary ------------------ +# +# The algorithm is "linear interpolation between the two closest ranks", with +# the rank computed as ``q * (n - 1)``. This matches the .NET, Go, Rust and +# Java drivers; any divergence corrupts cross-language comparisons. + + +def test_percentile_empty_sample_returns_zero() -> None: + assert _percentile([], 0.50) == 0.0 + assert _percentile([], 0.95) == 0.0 + assert _percentile([], 0.99) == 0.0 + + +def test_percentile_single_element_returns_that_element() -> None: + assert _percentile([42.0], 0.0) == 42.0 + assert _percentile([42.0], 0.50) == 42.0 + assert _percentile([42.0], 0.95) == 42.0 + assert _percentile([42.0], 1.0) == 42.0 + + +def test_percentile_exact_rank_returns_sample_value() -> None: + # n = 5 → rank for p50 = 0.5 * 4 = 2 → exact index 2 (value 30.0). + sample = [10.0, 20.0, 30.0, 40.0, 50.0] + assert _percentile(sample, 0.50) == 30.0 + assert _percentile(sample, 0.0) == 10.0 + assert _percentile(sample, 1.0) == 50.0 + + +def test_percentile_interpolates_between_ranks() -> None: + # n = 5 → rank for p95 = 0.95 * 4 = 3.8 → between index 3 (40.0) and + # index 4 (50.0) with fraction 0.8 → 40.0 + (50.0 - 40.0) * 0.8 = 48.0. + sample = [10.0, 20.0, 30.0, 40.0, 50.0] + assert _percentile(sample, 0.95) == pytest.approx(48.0) + # p99 = 0.99 * 4 = 3.96 → 40.0 + 10.0 * 0.96 = 49.6. 
+ assert _percentile(sample, 0.99) == pytest.approx(49.6) + + +def test_percentile_summary_empty_sample_zeros_all_fields() -> None: + assert _percentile_summary([]) == { + "p50": 0.0, + "p95": 0.0, + "p99": 0.0, + "max": 0.0, + "mean": 0.0, + } + + +def test_percentile_summary_known_sample_matches_cross_language_contract() -> None: + # The same five-element sample as the percentile interpolation test; the + # summary must be byte-for-byte the values the .NET / Go / Rust / Java + # drivers produce for the same input (linear interpolation, q * (n-1)). + sample = [10.0, 20.0, 30.0, 40.0, 50.0] + summary = _percentile_summary(sample) + + assert summary == { + "p50": 30.0, + "p95": 48.0, + "p99": 49.6, + "max": 50.0, + "mean": 30.0, + } + + +def test_percentile_summary_rounds_to_three_decimal_places() -> None: + # 1, 2, 3 → p95 = 0.95 * 2 = 1.9 → 2 + (3 - 2) * 0.9 = 2.9; round(2.9, 3) + # is 2.9. Use a sample that exercises the round() call non-trivially. + sample = [1.0, 2.0, 3.0001, 4.0001] + summary = _percentile_summary(sample) + # mean = (1 + 2 + 3.0001 + 4.0001) / 4 = 2.50005 → rounded to 2.5 + assert summary["mean"] == 2.5 + # max round to 3dp = 4.0 + assert summary["max"] == 4.0 + + +# --- Client.Python-015: bench-read-bulk CLI smoke test --------------------- + + +class _BenchFakeUnary: + """A fake unary RPC that records each request and returns ``replies_factory(request)``.""" + + def __init__(self, replies_factory: Any) -> None: + self._factory = replies_factory + self.requests: list[Any] = [] + + async def __call__( + self, + request: Any, + *, + metadata: tuple[tuple[str, str], ...], + ) -> Any: + self.requests.append(request) + return self._factory(request) + + +def _ok_reply(kind: int, **fields: Any) -> pb.MxCommandReply: + return pb.MxCommandReply( + session_id="session-bench", + kind=kind, + protocol_status=pb.ProtocolStatus(code=pb.PROTOCOL_STATUS_CODE_OK), + **fields, + ) + + +class _BenchStub: + """Fake gateway stub that handles OpenSession + Invoke for 
bench-read-bulk.""" + + def __init__(self, tags: list[str]) -> None: + self._tags = tags + + async def open_session( + request: Any, + *, + metadata: tuple[tuple[str, str], ...], + ) -> Any: + return pb.OpenSessionReply( + session_id="session-bench", + protocol_status=pb.ProtocolStatus(code=pb.PROTOCOL_STATUS_CODE_OK), + ) + + async def close_session( + request: Any, + *, + metadata: tuple[tuple[str, str], ...], + ) -> Any: + return pb.CloseSessionReply( + session_id=request.session_id, + final_state=pb.SESSION_STATE_CLOSED, + protocol_status=pb.ProtocolStatus(code=pb.PROTOCOL_STATUS_CODE_OK), + ) + + def _reply_for(request: Any) -> Any: + kind = request.command.kind + if kind == pb.MX_COMMAND_KIND_REGISTER: + return _ok_reply( + kind, + register=pb.RegisterReply(server_handle=7), + ) + if kind == pb.MX_COMMAND_KIND_SUBSCRIBE_BULK: + results = [ + pb.SubscribeResult( + server_handle=7, + tag_address=tag, + item_handle=100 + i, + was_successful=True, + ) + for i, tag in enumerate(self._tags) + ] + return _ok_reply( + kind, + subscribe_bulk=pb.BulkSubscribeReply(results=results), + ) + if kind == pb.MX_COMMAND_KIND_UNSUBSCRIBE_BULK: + results = [ + pb.SubscribeResult( + server_handle=7, + item_handle=100 + i, + was_successful=True, + ) + for i in range(len(self._tags)) + ] + return _ok_reply( + kind, + unsubscribe_bulk=pb.BulkSubscribeReply(results=results), + ) + if kind == pb.MX_COMMAND_KIND_READ_BULK: + results = [ + pb.BulkReadResult( + server_handle=7, + tag_address=tag, + item_handle=100 + i, + was_successful=True, + was_cached=True, + ) + for i, tag in enumerate(self._tags) + ] + return _ok_reply( + kind, + read_bulk=pb.BulkReadReply(results=results), + ) + raise AssertionError(f"unexpected MxCommand kind in bench test: {kind}") + + self.OpenSession = open_session + self.CloseSession = close_session + self.Invoke = _BenchFakeUnary(_reply_for) + + +def test_bench_read_bulk_emits_cross_language_schema( + monkeypatch: pytest.MonkeyPatch, +) -> None: + """Drive 
bench-read-bulk with duration=0 / warmup=0 and assert the schema. + + A drift in any of these field names (callsPerSecond, cachedReadResults, + latencyMs.p50, …) would break the cross-language + scripts/bench-read-bulk.ps1 aggregation silently. + """ + + bulk_size = 3 + tags = [f"TestMachine_{i:03d}.TestChangingInt" for i in range(1, 1 + bulk_size)] + + async def _fake_connect(kwargs: dict[str, Any]) -> GatewayClient: + return await GatewayClient.connect( + ClientOptions(endpoint=kwargs["endpoint"], plaintext=True), + stub=_BenchStub(tags), + ) + + monkeypatch.setattr(commands, "_connect", _fake_connect) + + runner = CliRunner() + result = runner.invoke( + commands.main, + [ + "bench-read-bulk", + "--endpoint", + "localhost:5000", + "--client-name", + "pytest-bench", + "--duration-seconds", + "0", + "--warmup-seconds", + "0", + "--bulk-size", + str(bulk_size), + "--tag-start", + "1", + "--json", + ], + ) + + assert result.exit_code == 0, result.output + payload = json.loads(result.output) + + # Locked cross-language schema (matches .NET / Go / Rust / Java drivers). + expected_top_level = { + "language", + "command", + "endpoint", + "clientName", + "bulkSize", + "durationSeconds", + "warmupSeconds", + "durationMs", + "tags", + "totalCalls", + "successfulCalls", + "failedCalls", + "totalReadResults", + "cachedReadResults", + "callsPerSecond", + "latencyMs", + } + assert set(payload.keys()) == expected_top_level + assert payload["language"] == "python" + assert payload["command"] == "bench-read-bulk" + assert payload["endpoint"] == "localhost:5000" + assert payload["clientName"] == "pytest-bench" + assert payload["bulkSize"] == bulk_size + assert payload["durationSeconds"] == 0 + assert payload["warmupSeconds"] == 0 + assert payload["tags"] == tags + + # latencyMs sub-shape is the percentile-summary contract. 
+ assert set(payload["latencyMs"].keys()) == {"p50", "p95", "p99", "max", "mean"} + for key in ("p50", "p95", "p99", "max", "mean"): + assert isinstance(payload["latencyMs"][key], (int, float)) + + +# --- Client.Python-016: MAX_AGGREGATE_EVENTS guard ------------------------- + + +def test_collect_events_rejects_max_events_above_aggregate_cap( + monkeypatch: pytest.MonkeyPatch, +) -> None: + """``--max-events`` greater than ``MAX_AGGREGATE_EVENTS`` exits non-zero + with the documented error message. + + The guard lives inside ``_collect_events`` (after a session is opened), + so the test routes the CLI through stubbed ``_connect`` / ``_session`` + fakes and asserts the guard fires before any event is pulled. + """ + + class _EventStreamShouldNotBeUsed: + def __aiter__(self) -> "_EventStreamShouldNotBeUsed": + return self + + async def __anext__(self) -> pb.MxEvent: + raise AssertionError( + "MAX_AGGREGATE_EVENTS guard must trip before any event is pulled", + ) + + class _FakeSession: + def __init__(self) -> None: + self.session_id = "session-1" + + def stream_events( + self, *, after_worker_sequence: int = 0 + ) -> _EventStreamShouldNotBeUsed: + return _EventStreamShouldNotBeUsed() + + class _FakeClient: + async def __aenter__(self) -> "_FakeClient": + return self + + async def __aexit__(self, *exc_info: object) -> None: + return None + + async def _fake_connect(kwargs: dict[str, Any]) -> _FakeClient: + return _FakeClient() + + def _fake_session(client: Any, session_id: str) -> _FakeSession: + return _FakeSession() + + monkeypatch.setattr(commands, "_connect", _fake_connect) + monkeypatch.setattr(commands, "_session", _fake_session) + + runner = CliRunner() + result = runner.invoke( + commands.main, + [ + "stream-events", + "--endpoint", + "localhost:5000", + "--session-id", + "session-1", + "--max-events", + str(MAX_AGGREGATE_EVENTS + 1), + "--plaintext", + "--json", + ], + ) + + assert result.exit_code != 0 + assert f"less than or equal to {MAX_AGGREGATE_EVENTS}" 
in result.output + assert "--max-events" in result.output + + +def test_collect_events_accepts_max_events_at_aggregate_cap_boundary( + monkeypatch: pytest.MonkeyPatch, +) -> None: + """``--max-events`` equal to ``MAX_AGGREGATE_EVENTS`` must not trip the guard.""" + + class _EmptyEventStream: + def __aiter__(self) -> "_EmptyEventStream": + return self + + async def __anext__(self) -> pb.MxEvent: + raise StopAsyncIteration + + class _FakeSession: + def __init__(self) -> None: + self.client = None # type: ignore[assignment] + self.session_id = "session-1" + + def stream_events(self, *, after_worker_sequence: int = 0) -> _EmptyEventStream: + return _EmptyEventStream() + + class _FakeClient: + async def __aenter__(self) -> "_FakeClient": + return self + + async def __aexit__(self, *exc_info: object) -> None: + return None + + async def _fake_connect(kwargs: dict[str, Any]) -> _FakeClient: + return _FakeClient() + + def _fake_session(client: Any, session_id: str) -> _FakeSession: + return _FakeSession() + + monkeypatch.setattr(commands, "_connect", _fake_connect) + monkeypatch.setattr(commands, "_session", _fake_session) + + runner = CliRunner() + result = runner.invoke( + commands.main, + [ + "stream-events", + "--endpoint", + "localhost:5000", + "--session-id", + "session-1", + "--max-events", + str(MAX_AGGREGATE_EVENTS), + "--timeout", + "0.01", + "--plaintext", + "--json", + ], + ) + + assert result.exit_code == 0, result.output + payload = json.loads(result.output) + assert payload == {"events": []} + + +# --- Client.Python-016: _api_key_from_env ---------------------------------- + + +def test_api_key_from_env_resolves_value_when_variable_is_set( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.setenv("MXGATEWAY_TEST_API_KEY", "mxgw_envtest_secret") + + assert _api_key_from_env("MXGATEWAY_TEST_API_KEY") == "mxgw_envtest_secret" + + +def test_api_key_from_env_returns_none_when_variable_is_unset( + monkeypatch: pytest.MonkeyPatch, +) -> None: + 
monkeypatch.delenv("MXGATEWAY_TEST_API_KEY_NOT_SET", raising=False) + + assert _api_key_from_env("MXGATEWAY_TEST_API_KEY_NOT_SET") is None + + +def test_api_key_from_env_returns_none_when_name_is_none() -> None: + assert _api_key_from_env(None) is None + + +def test_api_key_from_env_returns_none_when_name_is_empty_string() -> None: + # The implementation guards on ``if not name`` so empty string is treated + # the same as ``None`` — no env lookup is attempted. + assert _api_key_from_env("") is None diff --git a/clients/rust/RustClientDesign.md b/clients/rust/RustClientDesign.md index 2cd1a58..bdb2fa5 100644 --- a/clients/rust/RustClientDesign.md +++ b/clients/rust/RustClientDesign.md @@ -93,11 +93,24 @@ impl Session { pub async fn subscribe_bulk(&self, server_handle: i32, tag_addresses: Vec) -> Result, Error>; pub async fn unsubscribe_bulk(&self, server_handle: i32, item_handles: Vec) -> Result, Error>; pub async fn write(&self, server_handle: i32, item_handle: i32, value: MxValue, user_id: i32) -> Result<(), Error>; + pub async fn write_bulk(&self, server_handle: i32, entries: Vec, user_id: i32) -> Result, Error>; + pub async fn write2_bulk(&self, server_handle: i32, entries: Vec, timestamp: prost_types::Timestamp, user_id: i32) -> Result, Error>; + pub async fn write_secured_bulk(&self, server_handle: i32, entries: Vec, current_user_id: i32, verifier_user_id: i32) -> Result, Error>; + pub async fn write_secured2_bulk(&self, server_handle: i32, entries: Vec, timestamp: prost_types::Timestamp, current_user_id: i32, verifier_user_id: i32) -> Result, Error>; + pub async fn read_bulk(&self, server_handle: i32, tags: &[String], timeout_ms: u32) -> Result, Error>; pub async fn events(&self) -> Result>, Error>; pub async fn close(&self) -> Result<(), Error>; } ``` +The four bulk-write helpers (`write_bulk`, `write2_bulk`, `write_secured_bulk`, +`write_secured2_bulk`) and `read_bulk` mirror the worker's bulk command shapes +in `mxaccess_gateway.proto` and use the same 
correlation-id discipline as the +unary helpers — `session::next_correlation_id` is `pub` so that consumers +constructing raw `MxCommandRequest`/`CloseSessionRequest` payloads outside +the `Session` helpers (notably the `mxgw` test CLI's `ping` and +`close-session` subcommands) share the same id generation. + ## Authentication Use a `tonic` interceptor or request extension layer to add: @@ -132,19 +145,29 @@ Use `thiserror`: ```rust pub enum Error { + InvalidEndpoint { endpoint: String, detail: String }, + InvalidArgument { name: String, detail: String }, Transport(tonic::transport::Error), - Status(tonic::Status), - Authentication(String), - Authorization(String), - Session(SessionError), - Worker(WorkerError), - Command(CommandError), - MxAccess(MxAccessError), - Timeout, - Cancelled, + Authentication { message: String, status: Box }, + Authorization { message: String, status: Box }, + Timeout { message: String, status: Box }, + Cancelled { message: String, status: Box }, + Unavailable { message: String, status: Box }, + Status(Box), + Command(Box), + ProtocolStatus { operation: &'static str, code: ProtocolStatusCode, message: String }, + MalformedReply { detail: String }, } ``` +`Unavailable` classifies the transient `Code::Unavailable` / +`Code::ResourceExhausted` statuses so callers can decide whether to retry +without unwrapping the raw status. `MalformedReply` surfaces OK replies +whose payload does not carry the data the command contract requires (for +example, an `AddItem` reply missing the item handle, or a `WriteBulk` reply +carrying the wrong payload arm). `InvalidEndpoint` is returned when the +endpoint URL fails to parse or its TLS material cannot be loaded. + Preserve raw command replies in `CommandError` where applicable. ## Test CLI @@ -153,13 +176,32 @@ Binary: `mxgw`. Use `clap` derive. 
-Commands: +Commands (see `clients/rust/README.md` for full argument lists): ```text mxgw version -mxgw smoke --endpoint http://localhost:5000 --api-key-env MXGATEWAY_API_KEY --item TestChildObject.TestInt +mxgw ping +mxgw open-session +mxgw close-session --session-id +mxgw register --session-id --client-name +mxgw add-item --session-id --server-handle --item +mxgw advise --session-id --server-handle --item-handle +mxgw subscribe-bulk --session-id --server-handle --items +mxgw unsubscribe-bulk --session-id --server-handle --item-handles <1,2,3> +mxgw read-bulk --session-id --server-handle --items --timeout-ms 1500 +mxgw write --session-id --server-handle 1 --item-handle 1 --value-type int32 --value 123 +mxgw write2 --session-id --server-handle 1 --item-handle 1 --value-type int32 --value 123 --timestamp +mxgw write-bulk --session-id --server-handle --item-handles <1,2> --value-type int32 --values <1,2> +mxgw write2-bulk --session-id --server-handle --item-handles <1,2> --value-type int32 --values <1,2> --timestamp +mxgw write-secured-bulk --session-id --server-handle --item-handles <1,2> --value-type int32 --values <1,2> +mxgw write-secured2-bulk --session-id --server-handle --item-handles <1,2> --value-type int32 --values <1,2> --timestamp mxgw stream-events --session-id --json -mxgw write --session-id --server-handle 1 --item-handle 1 --type int32 --value 123 +mxgw bench-read-bulk --duration-seconds 30 --bulk-size 6 --json +mxgw smoke --endpoint http://localhost:5000 --api-key-env MXGATEWAY_API_KEY --item TestChildObject.TestInt +mxgw galaxy test-connection +mxgw galaxy last-deploy-time +mxgw galaxy discover-hierarchy +mxgw galaxy watch [--last-seen-deploy-time ] [--max-events N] ``` JSON output should use `serde_json`. 
diff --git a/clients/rust/crates/mxgw-cli/src/main.rs b/clients/rust/crates/mxgw-cli/src/main.rs index edcca89..d6c6d80 100644 --- a/clients/rust/crates/mxgw-cli/src/main.rs +++ b/clients/rust/crates/mxgw-cli/src/main.rs @@ -447,7 +447,9 @@ async fn run(cli: Cli) -> Result<(), Error> { let client = connect(connection).await?; let reply = client .invoke(MxCommandRequest { - client_correlation_id: "rust-cli-ping".to_owned(), + client_correlation_id: mxgateway_client::session::next_correlation_id( + "cli-ping", + ), command: Some(MxCommand { kind: MxCommandKind::Ping as i32, payload: Some(mxgateway_client::generated::mxaccess_gateway::v1::mx_command::Payload::Ping( @@ -494,7 +496,9 @@ async fn run(cli: Cli) -> Result<(), Error> { let reply = client .close_session_raw(CloseSessionRequest { session_id, - client_correlation_id: "rust-cli-close-session".to_owned(), + client_correlation_id: mxgateway_client::session::next_correlation_id( + "cli-close-session", + ), }) .await?; if json { @@ -1034,19 +1038,13 @@ async fn run_bench_read_bulk( .map(|r| r.item_handle) .collect(); - let warmup_deadline = std::time::Instant::now() - + std::time::Duration::from_secs(warmup_seconds); + let warmup_deadline = + std::time::Instant::now() + std::time::Duration::from_secs(warmup_seconds); while std::time::Instant::now() < warmup_deadline { - let _ = session - .read_bulk(server_handle, &tags, timeout_ms) - .await; + let _ = session.read_bulk(server_handle, &tags, timeout_ms).await; } - let mut latencies_ms: Vec = Vec::with_capacity(65_536); - let mut total_read_results: u64 = 0; - let mut cached_read_results: u64 = 0; - let mut successful_calls: u64 = 0; - let mut failed_calls: u64 = 0; + let mut stats = BenchReadBulkStats::default(); let steady_start = std::time::Instant::now(); let steady_deadline = steady_start + std::time::Duration::from_secs(duration_seconds); @@ -1054,18 +1052,9 @@ async fn run_bench_read_bulk( let call_start = std::time::Instant::now(); let outcome = 
session.read_bulk(server_handle, &tags, timeout_ms).await; let elapsed_ms = call_start.elapsed().as_secs_f64() * 1000.0; - latencies_ms.push(elapsed_ms); match outcome { - Ok(results) => { - successful_calls += 1; - for r in &results { - total_read_results += 1; - if r.was_cached { - cached_read_results += 1; - } - } - } - Err(_) => failed_calls += 1, + Ok(results) => stats.record_success(elapsed_ms, &results), + Err(error) => stats.record_failure(elapsed_ms, &error), } } let steady_elapsed = steady_start.elapsed(); @@ -1074,36 +1063,20 @@ async fn run_bench_read_bulk( let _ = session.unsubscribe_bulk(server_handle, item_handles).await; } - let total_calls = successful_calls + failed_calls; - let calls_per_second = if steady_elapsed.as_secs_f64() > 0.0 { - total_calls as f64 / steady_elapsed.as_secs_f64() - } else { - 0.0 + let context = BenchReadBulkContext { + endpoint: &endpoint, + client_name: &client_name, + bulk_size, + duration_seconds, + warmup_seconds, + steady_elapsed, + tags: &tags, }; - - let summary = percentile_summary(&latencies_ms); - let stats = serde_json::json!({ - "language": "rust", - "command": "bench-read-bulk", - "endpoint": endpoint, - "clientName": client_name, - "bulkSize": bulk_size, - "durationSeconds": duration_seconds, - "warmupSeconds": warmup_seconds, - "durationMs": steady_elapsed.as_millis() as u64, - "tags": tags, - "totalCalls": total_calls, - "successfulCalls": successful_calls, - "failedCalls": failed_calls, - "totalReadResults": total_read_results, - "cachedReadResults": cached_read_results, - "callsPerSecond": round_to(calls_per_second, 2), - "latencyMs": summary, - }); + let json_stats = stats.to_json(&context); if use_json { - println!("{}", stats); + println!("{}", json_stats); } else { - println!("{calls_per_second}"); + println!("{}", stats.calls_per_second(steady_elapsed)); } Ok::<(), Error>(()) } @@ -1113,6 +1086,102 @@ async fn run_bench_read_bulk( bench_outcome } +/// Per-iteration accounting for `bench-read-bulk`. 
+/// +/// Only successful `read_bulk` calls contribute to the success-latency +/// histogram (`success_latencies_ms`). Failures are tracked separately in +/// `failure_latencies_ms` and the first failure's redacted error string is +/// stashed in `first_failure` so a partial-failure run is visible in the +/// emitted JSON. This keeps the cross-language `latencyMs.p99`/`max` +/// contract honest: it reports successful-call latency only and never +/// folds in a per-call timeout from a failed RPC. +#[derive(Default)] +struct BenchReadBulkStats { + success_latencies_ms: Vec, + failure_latencies_ms: Vec, + total_read_results: u64, + cached_read_results: u64, + successful_calls: u64, + failed_calls: u64, + first_failure: Option, +} + +impl BenchReadBulkStats { + fn record_success( + &mut self, + elapsed_ms: f64, + results: &[mxgateway_client::generated::mxaccess_gateway::v1::BulkReadResult], + ) { + self.success_latencies_ms.push(elapsed_ms); + self.successful_calls += 1; + for result in results { + self.total_read_results += 1; + if result.was_cached { + self.cached_read_results += 1; + } + } + } + + fn record_failure(&mut self, elapsed_ms: f64, error: &Error) { + self.failure_latencies_ms.push(elapsed_ms); + self.failed_calls += 1; + if self.first_failure.is_none() { + self.first_failure = Some(error.to_string()); + } + } + + fn total_calls(&self) -> u64 { + self.successful_calls + self.failed_calls + } + + fn calls_per_second(&self, elapsed: std::time::Duration) -> f64 { + let seconds = elapsed.as_secs_f64(); + if seconds > 0.0 { + self.total_calls() as f64 / seconds + } else { + 0.0 + } + } + + fn to_json(&self, context: &BenchReadBulkContext<'_>) -> serde_json::Value { + let calls_per_second = self.calls_per_second(context.steady_elapsed); + let success_summary = percentile_summary(&self.success_latencies_ms); + let failure_summary = percentile_summary(&self.failure_latencies_ms); + serde_json::json!({ + "language": "rust", + "command": "bench-read-bulk", + 
"endpoint": context.endpoint, + "clientName": context.client_name, + "bulkSize": context.bulk_size, + "durationSeconds": context.duration_seconds, + "warmupSeconds": context.warmup_seconds, + "durationMs": context.steady_elapsed.as_millis() as u64, + "tags": context.tags, + "totalCalls": self.total_calls(), + "successfulCalls": self.successful_calls, + "failedCalls": self.failed_calls, + "totalReadResults": self.total_read_results, + "cachedReadResults": self.cached_read_results, + "callsPerSecond": round_to(calls_per_second, 2), + "latencyMs": success_summary, + "failureLatencyMs": failure_summary, + "firstFailure": self.first_failure, + }) + } +} + +/// Static configuration for one `bench-read-bulk` run, packaged so the +/// JSON serialiser can quote it back without taking eight positional args. +struct BenchReadBulkContext<'a> { + endpoint: &'a str, + client_name: &'a str, + bulk_size: usize, + duration_seconds: u64, + warmup_seconds: u64, + steady_elapsed: std::time::Duration, + tags: &'a [String], +} + fn percentile_summary(sample: &[f64]) -> serde_json::Value { if sample.is_empty() { return serde_json::json!({ "p50": 0.0, "p95": 0.0, "p99": 0.0, "max": 0.0, "mean": 0.0 }); @@ -1294,7 +1363,13 @@ fn build_write_bulk_entries( item_handles: &[i32], value_type: CliValueType, values: &[String], -) -> Result, Error> { +) -> Result< + Vec<( + i32, + mxgateway_client::generated::mxaccess_gateway::v1::MxValue, + )>, + Error, +> { if item_handles.len() != values.len() { return Err(Error::InvalidArgument { name: "values".to_owned(), @@ -1660,4 +1735,77 @@ mod tests { assert_eq!(frac.seconds, utc.seconds); assert_eq!(frac.nanos, 250_000_000); } + + #[test] + fn bench_read_bulk_stats_keeps_failures_out_of_success_latency_histogram() { + use mxgateway_client::generated::mxaccess_gateway::v1::BulkReadResult; + use mxgateway_client::Error; + + let mut stats = super::BenchReadBulkStats::default(); + let cached = BulkReadResult { + was_cached: true, + was_successful: true, + 
..BulkReadResult::default() + }; + let uncached = BulkReadResult { + was_cached: false, + was_successful: true, + ..BulkReadResult::default() + }; + + // Two fast successes and one slow failure: the slow failure must + // not pollute the success p99/max histogram. + stats.record_success(1.5, std::slice::from_ref(&cached)); + stats.record_success(2.0, std::slice::from_ref(&uncached)); + let failure = Error::MalformedReply { + detail: "synthetic failure for the bench test".to_owned(), + }; + stats.record_failure(1_500.0, &failure); + + assert_eq!(stats.success_latencies_ms, vec![1.5, 2.0]); + assert_eq!(stats.failure_latencies_ms, vec![1_500.0]); + assert_eq!(stats.successful_calls, 2); + assert_eq!(stats.failed_calls, 1); + assert_eq!(stats.total_calls(), 3); + assert_eq!(stats.total_read_results, 2); + assert_eq!(stats.cached_read_results, 1); + assert!(stats + .first_failure + .as_deref() + .unwrap() + .contains("synthetic failure")); + + let elapsed = std::time::Duration::from_secs(1); + let context = super::BenchReadBulkContext { + endpoint: "http://fake", + client_name: "client", + bulk_size: 2, + duration_seconds: 1, + warmup_seconds: 0, + steady_elapsed: elapsed, + tags: &[], + }; + let payload = stats.to_json(&context); + // The success-latency histogram must never see the 1_500 ms failure. + assert_eq!(payload["latencyMs"]["max"].as_f64().unwrap(), 2.0); + assert!(payload["latencyMs"]["p99"].as_f64().unwrap() <= 2.0); + // The failure-latency histogram must own it instead. 
+ assert_eq!( + payload["failureLatencyMs"]["max"].as_f64().unwrap(), + 1_500.0 + ); + assert_eq!(payload["failedCalls"].as_u64().unwrap(), 1); + assert_eq!(payload["successfulCalls"].as_u64().unwrap(), 2); + assert!(payload["firstFailure"] + .as_str() + .unwrap() + .contains("synthetic failure")); + } + + #[test] + fn bench_read_bulk_stats_calls_per_second_handles_zero_duration() { + let stats = super::BenchReadBulkStats::default(); + + assert_eq!(stats.calls_per_second(std::time::Duration::ZERO), 0.0); + } } diff --git a/clients/rust/src/generated.rs b/clients/rust/src/generated.rs index 845a5d3..9958764 100644 --- a/clients/rust/src/generated.rs +++ b/clients/rust/src/generated.rs @@ -14,6 +14,7 @@ pub mod mxaccess_gateway { /// gateway to language clients. pub mod v1 { #![allow(clippy::large_enum_variant)] + #![allow(clippy::doc_lazy_continuation)] tonic::include_proto!("mxaccess_gateway.v1"); } @@ -25,6 +26,7 @@ pub mod mxaccess_worker { /// the named-pipe transport between gateway and worker. pub mod v1 { #![allow(clippy::large_enum_variant)] + #![allow(clippy::doc_lazy_continuation)] tonic::include_proto!("mxaccess_worker.v1"); } @@ -36,6 +38,7 @@ pub mod galaxy_repository { /// discovery and deploy-event watch RPCs. pub mod v1 { #![allow(clippy::large_enum_variant)] + #![allow(clippy::doc_lazy_continuation)] tonic::include_proto!("galaxy_repository.v1"); } diff --git a/clients/rust/src/session.rs b/clients/rust/src/session.rs index e7e8930..d877e2c 100644 --- a/clients/rust/src/session.rs +++ b/clients/rust/src/session.rs @@ -33,7 +33,14 @@ static CORRELATION_SEQUENCE: AtomicU64 = AtomicU64::new(0); /// Build a unique `client_correlation_id` for a request so concurrent or /// repeated calls of the same command kind can be told apart in gateway logs. 
-fn next_correlation_id(label: &str) -> String { +/// +/// Exposed so consumers that construct raw [`MxCommandRequest`] / +/// [`CloseSessionRequest`] payloads outside the `Session` helpers — notably +/// the `mxgw` test CLI — share the same correlation-id discipline as the +/// library. The returned id is `rust-client-{label}-{N}` where `N` comes +/// from a process-wide atomic sequence. +#[must_use] +pub fn next_correlation_id(label: &str) -> String { let sequence = CORRELATION_SEQUENCE.fetch_add(1, Ordering::Relaxed); format!("rust-client-{label}-{sequence}") } @@ -761,8 +768,7 @@ fn bulk_write_results( BulkWriteReplyKind::WriteSecured2, ) => Ok(reply.results), _ => Err(Error::MalformedReply { - detail: "bulk write reply did not carry the expected BulkWriteReply payload" - .to_owned(), + detail: "bulk write reply did not carry the expected BulkWriteReply payload".to_owned(), }), } } diff --git a/clients/rust/tests/client_behavior.rs b/clients/rust/tests/client_behavior.rs index 0a50969..bb281e0 100644 --- a/clients/rust/tests/client_behavior.rs +++ b/clients/rust/tests/client_behavior.rs @@ -20,7 +20,8 @@ use mxgateway_client::generated::mxaccess_gateway::v1::{ CloseSessionReply, CloseSessionRequest, MxCommandKind, MxCommandReply, MxDataType, MxEvent, MxEventFamily, MxStatusCategory, MxStatusProxy, MxStatusSource, MxValue, OpenSessionReply, OpenSessionRequest, ProtocolStatus, ProtocolStatusCode, QueryActiveAlarmsRequest, SessionState, - StreamEventsRequest, SubscribeResult, WriteBulkEntry, + StreamEventsRequest, SubscribeResult, Write2BulkEntry, WriteBulkEntry, WriteSecured2BulkEntry, + WriteSecuredBulkEntry, }; use mxgateway_client::{ ApiKey, ClientOptions, CommandError, Error, GatewayClient, MxStatus, MxValue as ClientMxValue, @@ -160,7 +161,10 @@ async fn read_bulk_forwards_timeout_and_unpacks_cached_flag() { let entry = &results[0]; assert!(entry.was_cached); - assert_eq!(entry.value.as_ref().and_then(|v| v.kind.as_ref()), Some(&Kind::Int32Value(99))); + 
assert_eq!( + entry.value.as_ref().and_then(|v| v.kind.as_ref()), + Some(&Kind::Int32Value(99)) + ); assert_eq!(*state.last_read_bulk_timeout_ms.lock().await, Some(750)); } @@ -393,6 +397,238 @@ async fn connect_with_unreadable_ca_file_reports_invalid_endpoint() { ); } +#[tokio::test] +async fn register_returns_malformed_reply_when_ok_reply_has_no_payload() { + let state = Arc::new(FakeState::default()); + *state.invoke_override.lock().await = Some(InvokeOverride::OkReplyNoPayload); + let endpoint = spawn_fake_gateway(state.clone()).await; + let client = GatewayClient::connect(ClientOptions::new(endpoint)) + .await + .unwrap(); + let session = client.session("session-fixture"); + + let error = session.register("client-name").await.unwrap_err(); + + assert!( + matches!(&error, Error::MalformedReply { detail } if detail.contains("Register")), + "expected MalformedReply for register, got {error:?}" + ); +} + +#[tokio::test] +async fn add_item_returns_malformed_reply_when_ok_reply_has_no_payload() { + let state = Arc::new(FakeState::default()); + *state.invoke_override.lock().await = Some(InvokeOverride::OkReplyNoPayload); + let endpoint = spawn_fake_gateway(state.clone()).await; + let client = GatewayClient::connect(ClientOptions::new(endpoint)) + .await + .unwrap(); + let session = client.session("session-fixture"); + + let error = session.add_item(12, "Plant.Area.Tag").await.unwrap_err(); + + assert!( + matches!(&error, Error::MalformedReply { detail } if detail.contains("AddItem")), + "expected MalformedReply for add_item, got {error:?}" + ); +} + +#[tokio::test] +async fn add_item2_returns_malformed_reply_when_ok_reply_has_no_payload() { + let state = Arc::new(FakeState::default()); + *state.invoke_override.lock().await = Some(InvokeOverride::OkReplyNoPayload); + let endpoint = spawn_fake_gateway(state.clone()).await; + let client = GatewayClient::connect(ClientOptions::new(endpoint)) + .await + .unwrap(); + let session = client.session("session-fixture"); + + let 
error = session + .add_item2(12, "Plant.Area.Tag", "ctx") + .await + .unwrap_err(); + + assert!( + matches!(&error, Error::MalformedReply { detail } if detail.contains("AddItem2")), + "expected MalformedReply for add_item2, got {error:?}" + ); +} + +#[tokio::test] +async fn subscribe_bulk_returns_malformed_reply_on_mismatched_payload_arm() { + let state = Arc::new(FakeState::default()); + *state.invoke_override.lock().await = Some(InvokeOverride::OkReplyWrongPayloadForBulk); + let endpoint = spawn_fake_gateway(state.clone()).await; + let client = GatewayClient::connect(ClientOptions::new(endpoint)) + .await + .unwrap(); + let session = client.session("session-fixture"); + + let error = session + .subscribe_bulk(12, vec!["Tank01.Level".to_owned()]) + .await + .unwrap_err(); + + assert!( + matches!(&error, Error::MalformedReply { detail } if detail.contains("bulk")), + "expected MalformedReply for subscribe_bulk, got {error:?}" + ); +} + +#[tokio::test] +async fn write_bulk_returns_malformed_reply_on_mismatched_payload_arm() { + let state = Arc::new(FakeState::default()); + *state.invoke_override.lock().await = Some(InvokeOverride::OkReplyWrongPayloadForBulkWrite); + let endpoint = spawn_fake_gateway(state.clone()).await; + let client = GatewayClient::connect(ClientOptions::new(endpoint)) + .await + .unwrap(); + let session = client.session("session-fixture"); + + let error = session + .write_bulk( + 12, + vec![WriteBulkEntry { + item_handle: 901, + value: Some(int_value(11)), + user_id: 5, + }], + ) + .await + .unwrap_err(); + + assert!( + matches!(&error, Error::MalformedReply { detail } if detail.contains("bulk write")), + "expected MalformedReply for write_bulk, got {error:?}" + ); +} + +#[tokio::test] +async fn read_bulk_returns_malformed_reply_on_mismatched_payload_arm() { + let state = Arc::new(FakeState::default()); + *state.invoke_override.lock().await = Some(InvokeOverride::OkReplyWrongPayloadForReadBulk); + let endpoint = 
spawn_fake_gateway(state.clone()).await; + let client = GatewayClient::connect(ClientOptions::new(endpoint)) + .await + .unwrap(); + let session = client.session("session-fixture"); + + let error = session + .read_bulk(12, &["Tank01.Level"], 500) + .await + .unwrap_err(); + + assert!( + matches!(&error, Error::MalformedReply { detail } if detail.contains("ReadBulk")), + "expected MalformedReply for read_bulk, got {error:?}" + ); +} + +#[tokio::test] +async fn unary_invoke_maps_status_unavailable_to_error_unavailable() { + let state = Arc::new(FakeState::default()); + *state.invoke_override.lock().await = + Some(InvokeOverride::Unavailable("gateway restarting".to_owned())); + let endpoint = spawn_fake_gateway(state.clone()).await; + let client = GatewayClient::connect(ClientOptions::new(endpoint)) + .await + .unwrap(); + let session = client.session("session-fixture"); + + let error = session.add_item(12, "Plant.Area.Tag").await.unwrap_err(); + + assert!( + matches!(&error, Error::Unavailable { .. 
}), + "expected Error::Unavailable for unary unavailable, got {error:?}" + ); +} + +#[tokio::test] +async fn write2_bulk_round_trips_through_the_fake_gateway() { + let state = Arc::new(FakeState::default()); + let endpoint = spawn_fake_gateway(state.clone()).await; + let client = GatewayClient::connect(ClientOptions::new(endpoint)) + .await + .unwrap(); + let session = client.session("session-fixture"); + + let results = session + .write2_bulk( + 12, + vec![Write2BulkEntry { + item_handle: 901, + value: Some(int_value(11)), + timestamp_value: Some(int_value(0)), + user_id: 5, + }], + ) + .await + .unwrap(); + + assert_eq!(results.len(), 2); + assert!(results[0].was_successful); + assert!(!results[1].was_successful); + let last_command = state.last_command_kind.lock().await; + assert_eq!(*last_command, Some(MxCommandKind::Write2Bulk as i32)); +} + +#[tokio::test] +async fn write_secured_bulk_round_trips_through_the_fake_gateway() { + let state = Arc::new(FakeState::default()); + let endpoint = spawn_fake_gateway(state.clone()).await; + let client = GatewayClient::connect(ClientOptions::new(endpoint)) + .await + .unwrap(); + let session = client.session("session-fixture"); + + let results = session + .write_secured_bulk( + 12, + vec![WriteSecuredBulkEntry { + item_handle: 901, + current_user_id: 7, + verifier_user_id: 9, + value: Some(int_value(11)), + }], + ) + .await + .unwrap(); + + assert_eq!(results.len(), 2); + assert!(results[0].was_successful); + let last_command = state.last_command_kind.lock().await; + assert_eq!(*last_command, Some(MxCommandKind::WriteSecuredBulk as i32)); +} + +#[tokio::test] +async fn write_secured2_bulk_round_trips_through_the_fake_gateway() { + let state = Arc::new(FakeState::default()); + let endpoint = spawn_fake_gateway(state.clone()).await; + let client = GatewayClient::connect(ClientOptions::new(endpoint)) + .await + .unwrap(); + let session = client.session("session-fixture"); + + let results = session + .write_secured2_bulk( + 
12, + vec![WriteSecured2BulkEntry { + item_handle: 901, + current_user_id: 7, + verifier_user_id: 9, + value: Some(int_value(11)), + timestamp_value: Some(int_value(0)), + }], + ) + .await + .unwrap(); + + assert_eq!(results.len(), 2); + assert!(results[0].was_successful); + let last_command = state.last_command_kind.lock().await; + assert_eq!(*last_command, Some(MxCommandKind::WriteSecured2Bulk as i32)); +} + #[derive(Default)] struct FakeState { authorization: Mutex>, @@ -400,6 +636,39 @@ struct FakeState { last_read_bulk_timeout_ms: Mutex>, stream_dropped: Arc, emit_stream_fault: AtomicBool, + /// Test-injected override for the next (and all subsequent) `Invoke` + /// calls. When `Some`, the fake gateway returns the override's response + /// instead of its default per-kind reply. Used by the malformed-reply + /// and unary-Unavailable tests; default `None` preserves existing + /// happy-path test behaviour. + invoke_override: Mutex<Option<InvokeOverride>>, +} + +/// Test-injected override for the fake gateway's `Invoke` handler. +/// +/// Each variant short-circuits the per-kind dispatch in `FakeGateway::invoke` +/// and reproduces one of the wire shapes the Rust client's error paths must +/// handle. The `OkReply*` variants each model an "OK envelope, payload +/// missing/wrong" reply — the exact condition the new `Error::MalformedReply` +/// paths in `session.rs` are designed to catch. +#[derive(Clone)] +enum InvokeOverride { + /// Return `Status::unavailable(message)` from the unary Invoke RPC, so + /// the client maps it to `Error::Unavailable`. + Unavailable(String), + /// Return an OK `MxCommandReply` whose `payload` field is `None`. Used + /// to exercise `register_server_handle` / `add_item_handle` / + /// `add_item2_handle` falling through to the `MalformedReply` arm. + OkReplyNoPayload, + /// Return an OK reply whose payload arm does not match the bulk-read + /// command, so `read_bulk` falls through to its `MalformedReply` arm. 
+ OkReplyWrongPayloadForReadBulk, + /// Return an OK reply whose payload arm does not match the requested + /// bulk command, so `bulk_results` falls through to `MalformedReply`. + OkReplyWrongPayloadForBulk, + /// Return an OK reply whose payload arm does not match the requested + /// bulk-write command, so `bulk_write_results` returns `MalformedReply`. + OkReplyWrongPayloadForBulkWrite, } #[derive(Clone)] @@ -453,6 +722,58 @@ impl MxAccessGateway for FakeGateway { .unwrap_or_default(); *self.state.last_command_kind.lock().await = Some(kind); + if let Some(override_) = self.state.invoke_override.lock().await.clone() { + return match override_ { + InvokeOverride::Unavailable(message) => Err(Status::unavailable(message)), + InvokeOverride::OkReplyNoPayload => Ok(Response::new(MxCommandReply { + session_id: request.session_id, + correlation_id: "fake-correlation".to_owned(), + kind, + protocol_status: Some(ok_status("command ok but payload omitted")), + payload: None, + ..MxCommandReply::default() + })), + InvokeOverride::OkReplyWrongPayloadForReadBulk => { + Ok(Response::new(MxCommandReply { + session_id: request.session_id, + correlation_id: "fake-correlation".to_owned(), + kind, + protocol_status: Some(ok_status("read-bulk wrong payload arm")), + // AddItem payload arm against a ReadBulk request: + // the client's `read_bulk` matcher must reject it. + payload: Some(mx_command_reply::Payload::AddItem(AddItemReply { + item_handle: 0, + })), + ..MxCommandReply::default() + })) + } + InvokeOverride::OkReplyWrongPayloadForBulk => Ok(Response::new(MxCommandReply { + session_id: request.session_id, + correlation_id: "fake-correlation".to_owned(), + kind, + protocol_status: Some(ok_status("bulk wrong payload arm")), + // AddItem payload arm against a SubscribeBulk request. 
+ payload: Some(mx_command_reply::Payload::AddItem(AddItemReply { + item_handle: 0, + })), + ..MxCommandReply::default() + })), + InvokeOverride::OkReplyWrongPayloadForBulkWrite => { + Ok(Response::new(MxCommandReply { + session_id: request.session_id, + correlation_id: "fake-correlation".to_owned(), + kind, + protocol_status: Some(ok_status("bulk-write wrong payload arm")), + // AddItem payload arm against a WriteBulk request. + payload: Some(mx_command_reply::Payload::AddItem(AddItemReply { + item_handle: 0, + })), + ..MxCommandReply::default() + })) + } + }; + } + if kind == MxCommandKind::Write as i32 { return Ok(Response::new(mxaccess_failure_reply())); } @@ -478,36 +799,41 @@ impl MxAccessGateway for FakeGateway { })); } + // All four bulk-write families return `BulkWriteReply` over the + // wire and only differ by which `payload` arm carries it. The + // round-trip tests below want one entry per family, so wire them + // all up to the same canned reply (one success + one failure) and + // pick the matching payload arm by kind. if kind == MxCommandKind::WriteBulk as i32 { - // Echo one success and one failure so the test can assert the per-entry - // shape and verify the call did not throw on per-entry failure. 
- return Ok(Response::new(MxCommandReply { - session_id: request.session_id, - correlation_id: "fake-correlation".to_owned(), + return Ok(Response::new(bulk_write_envelope( + request.session_id, kind, - protocol_status: Some(ok_status("command ok")), - payload: Some(mx_command_reply::Payload::WriteBulk(BulkWriteReply { - results: vec![ - BulkWriteResult { - server_handle: 12, - item_handle: 901, - was_successful: true, - hresult: None, - statuses: vec![], - error_message: String::new(), - }, - BulkWriteResult { - server_handle: 12, - item_handle: 902, - was_successful: false, - hresult: None, - statuses: vec![], - error_message: "invalid handle".to_owned(), - }, - ], - })), - ..MxCommandReply::default() - })); + mx_command_reply::Payload::WriteBulk(canned_bulk_write_reply()), + ))); + } + + if kind == MxCommandKind::Write2Bulk as i32 { + return Ok(Response::new(bulk_write_envelope( + request.session_id, + kind, + mx_command_reply::Payload::Write2Bulk(canned_bulk_write_reply()), + ))); + } + + if kind == MxCommandKind::WriteSecuredBulk as i32 { + return Ok(Response::new(bulk_write_envelope( + request.session_id, + kind, + mx_command_reply::Payload::WriteSecuredBulk(canned_bulk_write_reply()), + ))); + } + + if kind == MxCommandKind::WriteSecured2Bulk as i32 { + return Ok(Response::new(bulk_write_envelope( + request.session_id, + kind, + mx_command_reply::Payload::WriteSecured2Bulk(canned_bulk_write_reply()), + ))); } if kind == MxCommandKind::ReadBulk as i32 { @@ -699,6 +1025,44 @@ fn mxaccess_failure_reply() -> MxCommandReply { } } +fn canned_bulk_write_reply() -> BulkWriteReply { + BulkWriteReply { + results: vec![ + BulkWriteResult { + server_handle: 12, + item_handle: 901, + was_successful: true, + hresult: None, + statuses: vec![], + error_message: String::new(), + }, + BulkWriteResult { + server_handle: 12, + item_handle: 902, + was_successful: false, + hresult: None, + statuses: vec![], + error_message: "invalid handle".to_owned(), + }, + ], + } +} + +fn 
bulk_write_envelope( + session_id: String, + kind: i32, + payload: mx_command_reply::Payload, +) -> MxCommandReply { + MxCommandReply { + session_id, + correlation_id: "fake-correlation".to_owned(), + kind, + protocol_status: Some(ok_status("command ok")), + payload: Some(payload), + ..MxCommandReply::default() + } +} + fn event(sequence: u64) -> MxEvent { MxEvent { family: MxEventFamily::OnDataChange as i32, diff --git a/code-reviews/Client.Dotnet/findings.md b/code-reviews/Client.Dotnet/findings.md index 7cc9ddc..6b3e577 100644 --- a/code-reviews/Client.Dotnet/findings.md +++ b/code-reviews/Client.Dotnet/findings.md @@ -4,8 +4,8 @@ |---|---| | Module | `clients/dotnet` | | Reviewer | Claude Code | -| Review date | 2026-05-18 | -| Commit reviewed | `3cc53a8` | +| Review date | 2026-05-20 | +| Commit reviewed | `1cd51bb` | | Status | Reviewed | | Open findings | 0 | @@ -13,16 +13,16 @@ | # | Category | Result | |---|---|---| -| 1 | Correctness & logic bugs | Minor: handle-selector fallback `?? reply.ReturnValue.Int32Value` can mask a missing typed reply (Client.Dotnet-005); CLI redactor misses env-var keys (Client.Dotnet-008). | +| 1 | Correctness & logic bugs | Issue found (this review): the Client.Dotnet-005 fix did not reach the CLI — `BenchReadBulkAsync`, `BenchStreamEventsAsync`, and `SmokeAsync` still fall through to `reply.ReturnValue.Int32Value` for `Register` / `AddItem` handles (Client.Dotnet-010). | | 2 | mxaccessgw conventions | Good — consumes the shared contracts project, no forked proto, `authorization: Bearer` metadata correct, parity preserved via split `EnsureProtocolSuccess`/`EnsureMxAccessSuccess`. | -| 3 | Concurrency & thread safety | Issue found: `_disposed` flags unsynchronized; `MxGatewaySession.DisposeAsync` can race a concurrent `CloseAsync` (Client.Dotnet-003). 
| -| 4 | Error handling & resilience | Issues found: gRPC-to-native mapping collapses non-auth statuses into one untyped exception (Client.Dotnet-001); shared retry/timeout budget (Client.Dotnet-004). | -| 5 | Security | Good — API key never logged by the library, CLI redacts keys, TLS custom-root validation correct. | +| 3 | Concurrency & thread safety | Issues found (this review): `GalaxyRepositoryClient._disposed` is still a plain unsynchronized `bool` (Client.Dotnet-009) — the symmetric fix from Client.Dotnet-003 was applied only to `MxGatewayClient`; the new `bench-stream-events` CLI command races `firstSteadyEventUtc`/`lastSteadyEventUtc` across parallel sessions (Client.Dotnet-011). | +| 4 | Error handling & resilience | No new issues found this review (Client.Dotnet-001 and Client.Dotnet-004 remain resolved). | +| 5 | Security | Good — API key never logged by the library, CLI redacts keys (incl. env-var-sourced), TLS custom-root validation correct, secured-write payloads never logged. | | 6 | Performance & resource management | No issues found — channels and streaming calls disposed correctly. | -| 7 | Design-document adherence | No issues found — matches `ClientLibrariesDesign.md`. | -| 8 | Code organization & conventions | Issue found: undocumented public members (Client.Dotnet-006). | -| 9 | Testing coverage | Issue found: the production retry path is never exercised (Client.Dotnet-002). | -| 10 | Documentation & comments | Issue found: doc misstates the unary timeout retry budget as per-call (Client.Dotnet-004, Client.Dotnet-007). | +| 7 | Design-document adherence | No issues found — matches `DotnetClientDesign.md` and `ClientLibrariesDesign.md`. 
| +| 8 | Code organization & conventions | Issues found (this review): the .NET client projects do not inherit `src/Directory.Build.props` so `TreatWarningsAsErrors` / `EnforceCodeStyleInBuild` / `AnalysisLevel=latest` are silently absent (Client.Dotnet-012); `DiscoverHierarchyOptions` and the `DiscoverHierarchyAsync(DiscoverHierarchyOptions, …)` overload have no XML docs (Client.Dotnet-013). | +| 9 | Testing coverage | Issue found (this review): the SDK-level alarm tests pin the fake-transport raw-`RpcException` shape but never exercise the production gRPC-to-native mapping (`GrpcMxGatewayClientTransport.AcknowledgeAlarmAsync`) — the same gap Client.Dotnet-002 closed for `Invoke`, still open for alarms (Client.Dotnet-014). | +| 10 | Documentation & comments | No new issues this review. | ## Findings @@ -145,3 +145,109 @@ **Recommendation:** Resolve the effective API key (same logic as `ResolveApiKey`) before redacting, so the env-var-sourced key is also stripped from error output. **Resolution:** (2026-05-18) Confirmed against source: `MxGatewayClientCli.RunCoreAsync`'s catch block redacted only `arguments.GetOptional("api-key")`, so an env-var-sourced key (`--api-key-env`, default `MXGATEWAY_API_KEY`) was never stripped. Note `MxGatewayCliSecretRedactor` itself is correct — the defect was the caller passing the wrong value. Extracted a non-throwing `TryResolveApiKey` helper (used by both the existing `ResolveApiKey` and the catch block) that resolves `--api-key` then the `--api-key-env` environment variable; the catch block now redacts that effective key. Updated `clients/dotnet/README.md` (`smoke` paragraph) to state the CLI redacts the effective key whether from `--api-key` or `--api-key-env`. 
Regression test `MxGatewayClientCliTests.RunAsync_ErrorOutput_RedactsApiKey_WhenSourcedFromEnvironmentVariable` sets a test env var, forces a transport error echoing the key, and asserts the key is absent and `[redacted]` is present; verified red against the original `GetOptional("api-key")`-only redaction (key printed unredacted). + +### Client.Dotnet-009 + +| Field | Value | +|---|---| +| Severity | Low | +| Category | Concurrency & thread safety | +| Location | `clients/dotnet/MxGateway.Client/GalaxyRepositoryClient.cs:26,339-348,445-448` | +| Status | Resolved | + +**Description:** Client.Dotnet-003 upgraded `MxGatewayClient._disposed` to an `int` accessed via `Interlocked.Exchange` / `Volatile.Read` so a concurrent `ThrowIfDisposed` cannot observe a stale value. The symmetric `GalaxyRepositoryClient._disposed` is still a plain unsynchronised `bool`: `DisposeAsync` reads `if (_disposed)` then writes `_disposed = true` without `Interlocked` or `Volatile`, and `ThrowIfDisposed` does an unsynchronised read. The Galaxy client is publicly `IAsyncDisposable` and exposes `TestConnectionAsync` / `GetLastDeployTimeAsync` / `DiscoverHierarchyAsync` / `WatchDeployEventsAsync` as legal-to-call-concurrently public APIs, so a concurrent dispose can produce the same torn-read race the gateway client fix prevented. The two clients also exhibit the same shape (gRPC channel + transport + retry pipeline), so the divergence is an accidental inconsistency. + +**Recommendation:** Mirror Client.Dotnet-003 on `GalaxyRepositoryClient`: change `_disposed` to an `int`, use `Interlocked.Exchange(ref _disposed, 1) != 0` in `DisposeAsync`, and `Volatile.Read(ref _disposed) != 0` in `ThrowIfDisposed`. A duplicated `MxGatewaySession`-style close-lock drain is unnecessary because `GalaxyRepositoryClient` does not own a per-call `SemaphoreSlim`. 
+ +**Resolution:** 2026-05-20 — Changed `GalaxyRepositoryClient._disposed` from `bool` to `int`; `DisposeAsync` now uses `Interlocked.Exchange(ref _disposed, 1) != 0` for the once-only guard and `ThrowIfDisposed` uses `Volatile.Read(ref _disposed) != 0`, mirroring the Client.Dotnet-003 fix on `MxGatewayClient`. + +### Client.Dotnet-010 + +| Field | Value | +|---|---| +| Severity | Low | +| Category | Correctness & logic bugs | +| Location | `clients/dotnet/MxGateway.Client.Cli/MxGatewayClientCli.cs:638,896,1261,1279` | +| Status | Resolved | + +**Description:** Client.Dotnet-005 fixed the silent `Register` / `AddItem` / `AddItem2` handle-fallback to `reply.ReturnValue.Int32Value` inside `MxGatewaySession`, but the same fallback pattern was left in the CLI and is now also present in two new bench commands shipped after that fix. `BenchReadBulkAsync` (line 638) and `BenchStreamEventsAsync` (line 896) both do `int serverHandle = registerReply.Register?.ServerHandle ?? registerReply.ReturnValue.Int32Value;` after a register call, and `SmokeAsync` (lines 1261 and 1279) passes `reply => reply.Register?.ServerHandle ?? reply.ReturnValue.Int32Value` and the equivalent `AddItem?.ItemHandle` selector to `InvokeForHandleAsync`. After `EnsureProtocolSuccess` + `EnsureMxAccessSuccess` pass but the worker did not set the typed `register` / `add_item` oneof case, all four call sites silently produce a zero handle and proceed to drive the rest of the smoke / bench against an invalid handle — exactly the failure mode the SDK-level fix prevents. + +**Recommendation:** Either delegate to the SDK helpers (`MxGatewaySession.RegisterAsync` / `AddItemAsync`) which already throw the descriptive `MxGatewayException` via `CreateMissingPayloadException`, or replicate the same null-check explicitly in `InvokeForHandleAsync` and the two bench commands. 
A unit test that enqueues an `Ok` reply with no typed payload through `FakeCliClient` and asserts the smoke / bench commands fail loudly would prevent regression. + +**Resolution:** 2026-05-20 — Added private CLI helpers `RequireRegisterServerHandle` and `RequireAddItemItemHandle` (with a shared `CreateMissingPayloadException` mirroring the SDK-level `MxGatewaySession` helper) that throw a descriptive `MxGatewayException` when the typed `register` / `add_item` payload is absent on an otherwise-successful reply. Replaced all four `?? reply.ReturnValue.Int32Value` fallback sites — `BenchReadBulkAsync` (line 638), `BenchStreamEventsAsync` (line 896), and both `SmokeAsync` selectors (lines 1261, 1279) — with these helpers, so the CLI now fails loudly with the same shape as the SDK helpers rather than silently driving the rest of the command against a zero handle. + +### Client.Dotnet-011 + +| Field | Value | +|---|---| +| Severity | Low | +| Category | Concurrency & thread safety | +| Location | `clients/dotnet/MxGateway.Client.Cli/MxGatewayClientCli.cs:857-858,922-963,1014-1015` | +| Status | Resolved | + +**Description:** The new `bench-stream-events` command (added in commit `1cd51bb`) supports `--session-count > 1` and runs each session's `StreamEvents` reader in parallel via `openedSessions.Select(RunStreamAsync).ToArray()` then `Task.WhenAll`. Inside the per-session lambda the inner `Task.Run`-spawned event loop updates two shared `DateTime?` fields without synchronisation: + +```csharp +if (firstSteadyEventUtc is null) +{ + firstSteadyEventUtc = nowUtc; +} +lastSteadyEventUtc = nowUtc; +``` + +The integer counters next to them (`steadyEvents`, `steadyDataChangeEvents`, `warmupEvents`) use `Interlocked.Increment`, and the latency list uses an explicit `lock (latencyLock)`, so the rest of the loop is data-race-free — but these two `DateTime?` updates are not. 
With N parallel sessions a torn read on `firstSteadyEventUtc` produces a non-deterministic "first event time" and the final `steadyElapsedSeconds = (lastSteadyEventUtc.Value - firstSteadyEventUtc.Value).TotalSeconds` can compute a slightly wrong window. The user-visible impact is bench-only (skewed `eventsPerSecond` / `dataChangeEventsPerSecond` numbers), and on x64 the 64-bit `DateTime` field read/write happens to be atomic, so this is Low — but the pattern is inconsistent with the rest of the same loop. + +**Recommendation:** Either guard the two `DateTime?` updates with the existing `latencyLock` (cheapest), use `Interlocked.CompareExchange` for `firstSteadyEventUtc` and `Volatile.Write` for `lastSteadyEventUtc`, or aggregate per-session in local variables and reduce after `Task.WhenAll`. The reduce-after approach also fixes a related issue: today a faster session can stomp `firstSteadyEventUtc` after a slower one already set it. + +**Resolution:** 2026-05-20 — Guarded the `firstSteadyEventUtc` / `lastSteadyEventUtc` reads and writes inside the per-session event loop with the existing `latencyLock`. `firstSteadyEventUtc` now uses the null-coalescing assignment `firstSteadyEventUtc ??= nowUtc;` under the lock so a slower session can't stomp an earlier already-set value. The lock is already held by the latency-list append a few lines below, so the extra cost is one uncontended acquisition per event. The final read in the stats block runs after `Task.WhenAll` (happens-before applies) and stays lock-free. 
+ +### Client.Dotnet-012 + +| Field | Value | +|---|---| +| Severity | Low | +| Category | Code organization & conventions | +| Location | `clients/dotnet/MxGateway.Client/MxGateway.Client.csproj`, `clients/dotnet/MxGateway.Client.Cli/MxGateway.Client.Cli.csproj`, `clients/dotnet/MxGateway.Client.Tests/MxGateway.Client.Tests.csproj` | +| Status | Resolved | + +**Description:** `src/Directory.Build.props` enforces `TreatWarningsAsErrors=true`, `EnforceCodeStyleInBuild=true`, `AnalysisLevel=latest`, and `Deterministic=true` for every gateway / worker / contracts project, and `CLAUDE.md` calls this out as a baseline build property. The .NET client projects live under `clients/dotnet/` and there is no `Directory.Build.props` at `clients/` or `clients/dotnet/` — so none of those properties apply to `MxGateway.Client`, `MxGateway.Client.Cli`, or `MxGateway.Client.Tests`. New warnings in the client do not break the build, and code-style violations are not blocked at build time. The `CSharpStyleGuide.md` baseline ("Treat compiler warnings as actionable") and the `CLAUDE.md` table under "Source Update Workflow" both apply equally to `.NET client` ("`dotnet build clients/dotnet/MxGateway.Client.sln`"), but the enforcement floor is missing. + +**Recommendation:** Add `clients/dotnet/Directory.Build.props` carrying the same property set: `TreatWarningsAsErrors=true`, `EnforceCodeStyleInBuild=true`, `AnalysisLevel=latest`, `Deterministic=true`. A broader `clients/Directory.Build.props` is not needed: the sibling clients (e.g. the Cargo-built Rust client) are not MSBuild projects, so only `clients/dotnet/` is affected. Excluding generated code (which already lives under `src/MxGateway.Contracts/Generated`) is automatic because the client only references the contracts project. Build the client locally after adding it to confirm no warnings already snuck in. 
+ +**Resolution:** 2026-05-20 — Added `clients/dotnet/Directory.Build.props` mirroring `src/Directory.Build.props`: `LangVersion=latest`, `Nullable=enable`, `ImplicitUsings=enable`, `TreatWarningsAsErrors=true`, `AnalysisLevel=latest`, `EnforceCodeStyleInBuild=true`, `Deterministic=true`. The three client `.csproj` files inherit from it automatically. Re-ran `dotnet build clients/dotnet/MxGateway.Client.sln` and confirmed 0 warnings / 0 errors — no pre-existing warnings were silently being tolerated. + +### Client.Dotnet-013 + +| Field | Value | +|---|---| +| Severity | Low | +| Category | Code organization & conventions | +| Location | `clients/dotnet/MxGateway.Client/DiscoverHierarchyOptions.cs:3-24`, `clients/dotnet/MxGateway.Client/GalaxyRepositoryClient.cs:185-187`, `clients/dotnet/MxGateway.Client.Cli/IMxGatewayCliClient.cs:6` | +| Status | Resolved | + +**Description:** Client.Dotnet-006 fixed three undocumented public members. Three more remain undocumented in code paths the prior review didn't visit: + +- `DiscoverHierarchyOptions` (the public record) has no `<summary>` on the type and no XML doc on any of its ten public properties (`RootGobjectId`, `RootTagName`, `RootContainedPath`, `MaxDepth`, `CategoryIds`, `TemplateChainContains`, `TagNameGlob`, `IncludeAttributes`, `AlarmBearingOnly`, `HistorizedOnly`). +- The second `DiscoverHierarchyAsync(DiscoverHierarchyOptions, CancellationToken)` overload on `GalaxyRepositoryClient` is `public` with no XML doc, while the parameterless overload one method above it carries a full `<summary>` / `<param>` block. +- `IMxGatewayCliClient` is a public interface in the CLI project with no `<summary>` on the type (the member docs are present). + +This is the same convention-violation shape Client.Dotnet-006 closed; CLAUDE.md style guidance describes XML docs on the public surface as the baseline expectation. + +**Recommendation:** Add `<summary>` docs to each undocumented member. 
For `DiscoverHierarchyOptions`, the property names map cleanly to the underlying `DiscoverHierarchyRequest` proto fields — a one-line summary per property and a type-level summary tying the record to the Galaxy hierarchy browse are enough. The CLI interface only needs a type-level summary; the members already document themselves. + +**Resolution:** 2026-05-20 — Added XML docs to all three sites: a type-level summary plus a one-line summary per property on `DiscoverHierarchyOptions` (ten properties, mapped to the underlying `DiscoverHierarchyRequest` proto fields and noting the root-precedence rule); a `<summary>`/`<param>`/`<returns>` block on the second `DiscoverHierarchyAsync(DiscoverHierarchyOptions, CancellationToken)` overload describing its filter semantics and transparent pagination; and a type-level `<summary>` on the public `IMxGatewayCliClient` interface explaining its CLI-only transport role and the production binding. + +### Client.Dotnet-014 + +| Field | Value | +|---|---| +| Severity | Low | +| Category | Testing coverage | +| Location | `clients/dotnet/MxGateway.Client.Tests/MxGatewayClientAlarmsTests.cs:76-98`, `clients/dotnet/MxGateway.Client.Tests/FakeGatewayTransport.cs:212-231` | +| Status | Resolved | + +**Description:** Client.Dotnet-002 closed a coverage gap where the production retry path (`RpcException` → `MxGatewayException` mapping by `RpcExceptionMapper.Map`) was never exercised, by adding a `MapTransportExceptions` flag to `FakeGatewayTransport` and a regression test that runs through the wrapped-exception branch. That flag is wired through `Translate(...)` in `OpenSessionAsync` / `CloseSessionAsync` / `InvokeAsync`, but the new alarm test path is not: `FakeGatewayTransport.AcknowledgeAlarmAsync` throws the queued exception verbatim (line 219), bypassing `Translate`. 
The accompanying `MxGatewayClientAlarmsTests.AcknowledgeAlarmAsync_MapsUnauthenticated_RpcException_ToTypedException` test acknowledges this in a comment ("Note: the FakeGatewayTransport surfaces RpcException directly … the SDK-level test pins the pass-through shape so a future migration to direct mapping won't silently change observable behaviour") and asserts `Assert.ThrowsAsync<RpcException>` — but the production path through `GrpcMxGatewayClientTransport.AcknowledgeAlarmAsync` (lines 120-134) already calls `RpcExceptionMapper.Map`, so production callers see `MxGatewayAuthenticationException` and not `RpcException`. The test name advertises mapping that the SDK-level harness doesn't exercise, so the alarm-ack mapping reached through `MxGatewayClient.AcknowledgeAlarmAsync` can regress without anyone noticing. + +**Recommendation:** Either route `FakeGatewayTransport.AcknowledgeAlarmAsync` through the same `Translate` helper the other RPCs use and add a regression test that enables `MapTransportExceptions = true` and asserts `MxGatewayAuthenticationException`; or rename the existing test to make the pass-through shape explicit (e.g. `…_SurfacesRpcExceptionFromFakeTransportVerbatim`) and add a second test exercising the production mapping. Either fix closes the alarm-side equivalent of the gap Client.Dotnet-002 closed for `Invoke`. + +**Resolution:** 2026-05-20 — Applied both halves of the recommendation. Routed `FakeGatewayTransport.AcknowledgeAlarmAsync` through the same `Translate` helper the other RPCs use, so when `MapTransportExceptions = true` thrown `RpcException`s now run through the production `RpcExceptionMapper.Map`. 
Renamed the existing pass-through test to `AcknowledgeAlarmAsync_SurfacesRpcExceptionFromFakeTransportVerbatim_WhenMappingDisabled` (with an updated comment pinning that this shape only applies when mapping is off), and added a new test `AcknowledgeAlarmAsync_MapsUnauthenticated_RpcException_ToTypedException` that enables mapping and asserts the production-parity `MxGatewayAuthenticationException` with `StatusCode.Unauthenticated`. Closes the alarm-side equivalent of the gap Client.Dotnet-002 closed for `Invoke`. diff --git a/code-reviews/Client.Go/findings.md b/code-reviews/Client.Go/findings.md index 3f36292..af91487 100644 --- a/code-reviews/Client.Go/findings.md +++ b/code-reviews/Client.Go/findings.md @@ -4,8 +4,8 @@ |---|---| | Module | `clients/go` | | Reviewer | Claude Code | -| Review date | 2026-05-18 | -| Commit reviewed | `3cc53a8` | +| Review date | 2026-05-20 | +| Commit reviewed | `1cd51bb` | | Status | Reviewed | | Open findings | 0 | @@ -13,16 +13,16 @@ | # | Category | Result | |---|---|---| -| 1 | Correctness & logic bugs | Issues found: a typed-nil `Unwrap`/`errors.As` trap (Client.Go-001), a CLI `panic` on malformed input (Client.Go-003), empty-string correlation id on rand failure (Client.Go-007). | -| 2 | mxaccessgw conventions | Generally good; two test files fail `gofmt`, breaking the documented workflow (Client.Go-004). | -| 3 | Concurrency & thread safety | No issues found — stream goroutines and cancellation are sound. | -| 4 | Error handling & resilience | Issues found: the compatibility event path silently drops events (Client.Go-002); no transient/permanent classification (Client.Go-006). | -| 5 | Security | No issues found — TLS by default with a TLS 1.2 floor, API key redaction, no secret logging. | -| 6 | Performance & resource management | No issues found — connections/streams closed via deferred `Close`/`cancel`. 
| -| 7 | Design-document adherence | Issues found: deprecated `grpc.DialContext`+`WithBlock` usage and a missing error taxonomy (Client.Go-005, Client.Go-006). | -| 8 | Code organization & conventions | Issue found: duplication between `Client` and `GalaxyClient` (Client.Go-009). | -| 9 | Testing coverage | Issue found: TLS path, `callContext` deadline logic, and `NativeValue`/`NativeArray` edges untested (Client.Go-008). | -| 10 | Documentation & comments | Issue found: a stale `WithBlock` dial-cancellation claim (Client.Go-010). | +| 1 | Correctness & logic bugs | Re-review: previous Client.Go-001/003/007 remain resolved. New issue: a dead/no-op test condition in `alarms_test.go` (Client.Go-011). | +| 2 | mxaccessgw conventions | `gofmt -l ./...` and `go vet ./...` are clean. No new issues. | +| 3 | Concurrency & thread safety | New issue: `runGalaxyWatch` limit-reached path returns without waiting for the WatchDeployEvents goroutine to drain (Client.Go-013). | +| 4 | Error handling & resilience | New issue: direct `err == io.EOF` comparisons should use `errors.Is` for chain robustness (Client.Go-014). | +| 5 | Security | No issues found — TLS-by-default with TLS 1.2 floor, API key redaction in CLI JSON, no secret logging. | +| 6 | Performance & resource management | No issues found — `defer client.Close()` / `defer subscription.Close()` consistently applied across CLI and library; bench-read-bulk preallocates latency slice. | +| 7 | Design-document adherence | No new issues. The lazy `grpc.NewClient` + readiness probe migration (Client.Go-005) was applied uniformly to `Dial` and `DialGalaxy`. | +| 8 | Code organization & conventions | New issue: `runWriteBulkVariant`'s `secured` parameter is computed but unused (Client.Go-015). | +| 9 | Testing coverage | Coverage holes from prior review now filled (Client.Go-008). `fakeGalaxyServer.watchSendInterval` is declared but never set — minor test cruft (Client.Go-016). 
| +| 10 | Documentation & comments | New issue: the CLI `writeUsage` line is missing the six bulk and bench subcommands now wired into `run` (Client.Go-012). | ## Findings @@ -175,3 +175,106 @@ **Recommendation:** Reword to describe the actual connect/timeout semantics after resolving Client.Go-005, and clarify that `DialTimeout` bounds the initial connect attempt. **Resolution:** Resolved 2026-05-18: alongside the Client.Go-005 migration, the `Dial` doc comment was rewritten to describe the lazy `grpc.NewClient` connection, the `DialTimeout`-bounded (default 10s, or ctx deadline when sooner) readiness probe, that a briefly-unavailable gateway recovers instead of producing a hard error, and that cancelling `ctx` aborts the probe. `DialGalaxy` and the new `dial`/`waitForReady`/`callContext` helpers carry matching doc comments. + +### Client.Go-011 + +| Field | Value | +|---|---| +| Severity | Low | +| Category | Correctness & logic bugs | +| Location | `clients/go/mxgateway/alarms_test.go:66-73` | +| Status | Resolved | + +**Description:** `TestAcknowledgeAlarmRejectsNilRequest` contains a no-op `if` with an empty body whose intent is documented in a comment ("Accept either: the helper returned the literal sentinel, or the generic transport error — both prove nil was rejected"). The condition + +```go +if err == nil || !errors.Is(err, errors.Unwrap(err)) && err.Error() != "mxgateway: acknowledge alarm request is required" { + // ... +} +``` + +evaluates expressions for side effects only and asserts nothing — Go's `&&` binds tighter than `||`, the body is empty, and the actual nil check happens on the very next `if err == nil`. The block is effectively dead code masquerading as a check. It also evaluates `errors.Unwrap(err)` regardless of `err`'s shape, and would call `err.Error()` even when err might be a wrapped status error whose message wording the gateway is free to change — making the apparent assertion brittle on top of being dead. 
+ +**Recommendation:** Drop the empty-body `if` entirely (the subsequent `if err == nil { t.Fatalf(...) }` already enforces the contract), or, if the intent is to additionally pin the literal error message for the sentinel path, replace it with a real assertion (`if err.Error() != "mxgateway: acknowledge alarm request is required" { t.Fatalf(...) }`) and remove the spurious `errors.Is(err, errors.Unwrap(err))` clause. + +**Resolution:** 2026-05-20 — Removed the empty-body `if` in `TestAcknowledgeAlarmRejectsNilRequest`; the subsequent `if err == nil { t.Fatalf(...) }` already enforces the nil-rejection contract without the dead, brittle compound predicate. + +### Client.Go-012 + +| Field | Value | +|---|---| +| Severity | Low | +| Category | Documentation & comments | +| Location | `clients/go/cmd/mxgw-go/main.go:1063-1065`, `clients/go/cmd/mxgw-go/main.go:88-104` | +| Status | Resolved | + +**Description:** `writeUsage` lists the available subcommands as `version|open-session|close-session|register|add-item|advise|subscribe-bulk|unsubscribe-bulk|write|stream-events|smoke|galaxy-test-connection|galaxy-last-deploy|galaxy-discover|galaxy-watch`. Six subcommands wired into `run` are missing from this list: `read-bulk`, `write-bulk`, `write2-bulk`, `write-secured-bulk`, `write-secured2-bulk`, and `bench-read-bulk`. A user invoking `mxgw-go` with no args or an unknown command (the two paths that print this banner) sees an incomplete CLI surface and may believe the bulk-write / read-bulk families are not implemented. The README does document them, but the inline usage banner is the first source of truth a CLI user consults. + +**Recommendation:** Extend the usage string to include every command registered in the `switch args[0]` in `run`, or generate it from a single source-of-truth slice keyed on command name → handler so the two cannot drift again. 
+ +**Resolution:** 2026-05-20 — `writeUsage` now lists the previously missing `read-bulk`, `write-bulk`, `write2-bulk`, `write-secured-bulk`, `write-secured2-bulk`, and `bench-read-bulk` subcommands alongside the original surface, so the no-args / unknown-command banner reflects every command wired into `run`. + +### Client.Go-013 + +| Field | Value | +|---|---| +| Severity | Low | +| Category | Concurrency & thread safety | +| Location | `clients/go/cmd/mxgw-go/main.go:1246-1249`, `clients/go/cmd/mxgw-go/main.go:1257-1262` | +| Status | Resolved | + +**Description:** In `runGalaxyWatch`, the signal-cancellation branch carefully drains the buffered `events` channel after `cancelStream()` so the `WatchDeployEvents` goroutine can exit (`for range events { }`). The limit-reached branch (`if *limit > 0 && count >= *limit { cancelStream(); return nil }`) skips that drain and returns immediately. After the function returns, `defer client.Close()` runs and tears down the gRPC connection; in the gap before the connection close propagates, the WatchDeployEvents goroutine may still be blocked on `case events <- event:` (the channel is buffered to 16 but a slow producer can refill it) — the goroutine then exits via `<-ctx.Done()` because `streamCtx` was cancelled, so it isn't a permanent leak, but the two cancellation paths behave inconsistently and the limit-reached path can briefly hold a goroutine plus the gRPC stream while the client tears down underneath it. + +**Recommendation:** Factor the drain into a helper and use it from both branches, e.g. after `cancelStream()` always `for range events { }` (and let the surrounding `select`/`for` re-evaluate `<-errs` if a terminal error was already buffered). Alternatively, drop the explicit drain in both branches and rely on `defer cancelStream()` plus `defer client.Close()` — but pick one model and apply it consistently. 
+ +**Resolution:** 2026-05-20 — The limit-reached branch in `runGalaxyWatch` now drains the buffered `events` channel (`for range events { }`) after `cancelStream()`, matching the signal-cancel branch. Both cancellation paths now wait for the `WatchDeployEvents` goroutine to exit before `defer client.Close()` tears the gRPC connection down. + +### Client.Go-014 + +| Field | Value | +|---|---| +| Severity | Low | +| Category | Error handling & resilience | +| Location | `clients/go/mxgateway/session.go:602`, `clients/go/mxgateway/galaxy.go:189` | +| Status | Resolved | + +**Description:** Two stream Recv loops compare end-of-stream with `err == io.EOF` directly: + +- `session.go:602` — `if err == io.EOF || status.Code(err) == codes.Canceled || streamCtx.Err() != nil { return }` +- `galaxy.go:189` — `if recvErr == io.EOF { return }` + +gRPC's generated `Recv()` does return the `io.EOF` sentinel directly today, so the comparisons work in practice. However, the Go idiom (and the project's `docs/style-guides/GoStyleGuide.md`) is to use `errors.Is(err, io.EOF)` so future wrapping (e.g. an interceptor decorating Recv errors) does not silently flip the loop from "stream finished normally" to "stream produced an error". The mxgateway client itself wraps non-EOF Recv errors in `*GatewayError`, which `errors.Is` already supports — using `errors.Is` keeps both paths consistent. + +**Recommendation:** Replace `recvErr == io.EOF` / `err == io.EOF` with `errors.Is(err, io.EOF)` (the `errors` package is already imported in both files). + +**Resolution:** 2026-05-20 — Both stream Recv loops now use `errors.Is(err, io.EOF)`: `session.go` already imported `errors`, and `galaxy.go` gained the missing `errors` import alongside the `recvErr == io.EOF` → `errors.Is(recvErr, io.EOF)` change, keeping EOF detection robust against any future Recv-error wrapping. 
+ +### Client.Go-015 + +| Field | Value | +|---|---| +| Severity | Low | +| Category | Code organization & conventions | +| Location | `clients/go/cmd/mxgw-go/main.go:410-512` | +| Status | Resolved | + +**Description:** `runWriteBulkVariant(ctx, args, stdout, stderr, command, withTimestamp, secured bool)` accepts `secured` but never uses it — the routing is keyed on `command` (the string `"write-bulk"` / `"write2-bulk"` / `"write-secured-bulk"` / `"write-secured2-bulk"`). The function ends with `_ = secured // currently only used for routing above; reserved for future per-variant validation`, which is misleading because `secured` is not in fact used for routing. The four wrapper functions (`runWriteBulk`, `runWrite2Bulk`, `runWriteSecuredBulk`, `runWriteSecured2Bulk`) all pass a `secured` argument that has no effect. The two secured-only CLI options, `-current-user-id` and `-verifier-user-id`, are unconditionally registered on every variant, including the non-secured ones, so a `write-bulk` invocation that passes `-current-user-id 42` silently does nothing. Either remove `secured` and the dead `_ = secured` comment, or use it to gate the registration of secured-only flags so wrong combinations are rejected with a clean error. + +**Recommendation:** Drop the `secured` parameter (the `command` switch already distinguishes the four variants) and the misleading `_ = secured` line; or, if validation is the goal, branch flag registration on `secured` so secured-only flags are unavailable for the non-secured variants and emit a clean usage error if they appear. + +**Resolution:** 2026-05-20 — Dropped the unused `secured` parameter from `runWriteBulkVariant` (the `command` switch already distinguishes the four variants) and removed the misleading `_ = secured` line. 
The variant is now derived locally from `command` and used to gate flag registration: `-current-user-id` / `-verifier-user-id` are only registered for the secured variants and `-user-id` only for Write/Write2, so a wrong-variant flag now fails with a clean `flag provided but not defined` usage error instead of silently no-op'ing. The four `runWrite*Bulk` wrappers were updated to match the new signature. + +### Client.Go-016 + +| Field | Value | +|---|---| +| Severity | Low | +| Category | Testing coverage | +| Location | `clients/go/mxgateway/galaxy_test.go:382-429` | +| Status | Resolved | + +**Description:** `fakeGalaxyServer.watchSendInterval` is declared on the test fake and consulted inside `WatchDeployEvents` (`if s.watchSendInterval > 0 { ... }`) but no test in the package sets a non-zero value. The dead field plus its branch were presumably added to support a backpressure / pacing test that was never landed, and now the only effect is reader confusion ("which test uses this?") and a pointlessly larger fake. Backpressure on the bootstrap-plus-events sequence is also genuinely worth testing, given that `WatchDeployEvents` writes to a 16-deep buffered channel. + +**Recommendation:** Either delete the unused `watchSendInterval` field and its branch in `WatchDeployEvents`, or add the test it was added for — e.g. one that pumps more than 16 events with a small interval and asserts the consumer keeps up without losing or reordering events. Linking the field to a `// for TestX` comment if it stays would also help. + +**Resolution:** 2026-05-20 — Removed the unused `watchSendInterval` field from `fakeGalaxyServer` and the corresponding `if s.watchSendInterval > 0 { ... }` branch in `WatchDeployEvents`; no test set the field, so the dead code path is gone and the fake is leaner. `gofmt -w` reflowed the struct to drop the no-longer-needed field-name padding. 
diff --git a/code-reviews/Client.Java/findings.md b/code-reviews/Client.Java/findings.md index 7333e40..03a3ab9 100644 --- a/code-reviews/Client.Java/findings.md +++ b/code-reviews/Client.Java/findings.md @@ -4,25 +4,29 @@ |---|---| | Module | `clients/java` | | Reviewer | Claude Code | -| Review date | 2026-05-18 | -| Commit reviewed | `3cc53a8` | +| Review date | 2026-05-20 | +| Commit reviewed | `1cd51bb` | | Status | Reviewed | | Open findings | 0 | ## Checklist coverage +A second-pass review against commit `1cd51bb`. Client.Java-001 through +Client.Java-012 are unchanged from the prior pass; the table below records the +new findings raised in this pass against the same checklist categories. + | # | Category | Result | |---|---|---| -| 1 | Correctness & logic bugs | Issues found: `register`/`addItem` silently fall back to `getReturnValue()` masking missing payloads (Client.Java-004); fragile `resolved()` mutation pattern (Client.Java-012). | -| 2 | mxaccessgw conventions | Largely adheres; the gateway protocol-version handshake is never verified despite the contract field existing (Client.Java-003). | -| 3 | Concurrency & thread safety | Issue found: `MxEventStream.next` is a plain field and terminal-state transitions race (Client.Java-002). | -| 4 | Error handling & resilience | Issues found: `close()` can mask the primary exception (Client.Java-005); async/sync error surfaces inconsistent (Client.Java-008). | -| 5 | Security | Issue found: API-key redaction leaks the trailing 4 secret characters (Client.Java-001). | -| 6 | Performance & resource management | Issues found: `close()` does not await termination (Client.Java-006); no stream flow control (Client.Java-011). | -| 7 | Design-document adherence | Matches `JavaClientDesign.md` closely; the protocol-version check is undocumented-missing (Client.Java-003). | -| 8 | Code organization & conventions | Issue found: ~80 duplicated lines across the two clients (Client.Java-009). 
| -| 9 | Testing coverage | Issue found: alarm RPCs, TLS setup, async streams, and queue overflow untested (Client.Java-007). | -| 10 | Documentation & comments | Issue found: README/Javadoc assert undocumented scope names (Client.Java-010). | +| 1 | Correctness & logic bugs | Issues found: CLI `MxEventStream(1024)` capacity contradicts Javadoc/README "16-element buffer" claim (Client.Java-017); CLI `DeployEvent.sequence` printed with `%d` as signed `long` (Client.Java-020). | +| 2 | mxaccessgw conventions | No new issues found in this pass. | +| 3 | Concurrency & thread safety | Issues found: `MxEventStream.beforeStart` does not honour pre-start `close()` and leaks the gRPC call (Client.Java-014); `MxGatewayChannels.toCompletable` cancellation propagation is broken once the future is wrapped in `thenApply` (Client.Java-015). | +| 4 | Error handling & resilience | Issue found: `MxGatewaySecrets.redactCredentials` only inspects whitespace-delimited tokens, so colon/comma/quote-embedded `mxgw_` credentials leak through (Client.Java-018). | +| 5 | Security | Issue found: same `redactCredentials` leak — see Client.Java-018. | +| 6 | Performance & resource management | Issue found: client `close()` uses the *connect* timeout as its shutdown deadline (Client.Java-019). | +| 7 | Design-document adherence | No new issues found in this pass. | +| 8 | Code organization & conventions | Issue found: channel `close()` / `closeAndAwaitTermination()` are still duplicated verbatim across `MxGatewayClient` and `GalaxyRepositoryClient` despite Client.Java-009's stated resolution (Client.Java-016). | +| 9 | Testing coverage | Issue found: CLI `FakeSession` does not implement the five bulk methods added to `MxGatewayCliSession`, so the CLI test module fails to compile against the current source (Client.Java-013). | +| 10 | Documentation & comments | Issue found: docs claim a 16-element event-stream buffer that is actually 1024 in production (Client.Java-017). 
| ## Findings @@ -205,3 +209,123 @@ **Recommendation:** Make `resolved()` return an immutable resolved value object, or compute `resolvedApiKey`/`resolvedTimeout` lazily in their getters so call ordering cannot produce stale output. **Resolution:** (2026-05-18) Confirmed against source: `resolved()` populated the `resolvedApiKey`/`resolvedTimeout` mutable fields and `toClientOptions()`/`redactedJsonMap()` read them, so calling either before `resolved()` emitted stale empty/30s defaults. The two mutable fields were removed and replaced with side-effect-free accessor methods `resolvedApiKey()` and `resolvedTimeout()` that compute their value on each call (API key from `--api-key` or the `--api-key-env` variable; timeout via `parseDuration`). `toClientOptions()` and `redactedJsonMap()` now call those accessors directly, so call ordering can no longer produce stale output. `resolved()` is retained as a no-op returning `this` purely for call-site readability (`common.resolved()`), with its Javadoc updated to state resolution is now lazy. Pure-refactor with no runtime-behavior change for the existing call order, so no new test was added; covered by the existing `MxGatewayCliTests` JSON-redaction and option-parsing tests. + +### Client.Java-013 + +| Field | Value | +|---|---| +| Severity | High | +| Category | Testing coverage | +| Location | `clients/java/mxgateway-cli/src/test/java/com/dohertylan/mxgateway/cli/MxGatewayCliTests.java:212-304`, `clients/java/mxgateway-cli/src/main/java/com/dohertylan/mxgateway/cli/MxGatewayCli.java:1214-1244` | +| Status | Resolved | + +**Description:** `MxGatewayCliSession` in `MxGatewayCli.java:1214` was extended in commit `f220908` (the "bulk read/write CLI subcommands" change) with five new abstract methods — `readBulk`, `writeBulk`, `write2Bulk`, `writeSecuredBulk`, `writeSecured2Bulk`. 
The test-only `FakeSession` in `MxGatewayCliTests.java:212` still only implements the original set (register/addItem/advise/writeRaw/subscribeBulk/unsubscribeBulk/streamEventsAfter) and is declared a concrete (non-abstract) class. A clean compile of `mxgateway-cli`'s test source set therefore fails: a concrete implementer that omits abstract interface methods is a compile error. The stale `.class` files under `build/classes/java/test/` predate the interface change (dated 2026-05-20 03:38 vs CLI source dated 2026-05-20 05:06), which is why the issue is not visible until the next clean build. `gradle test` (or any CI pipeline that does not retain incremental state) will fail to build the CLI test module. The `CLAUDE.md` source-update workflow row "When source code changes, build and test the affected component" was not honoured for this CLI contract change. + +**Recommendation:** Add the five missing `@Override` implementations to `FakeSession` (stubs returning empty lists are fine — only `subscribeBulk`/`unsubscribeBulk` are exercised by the existing tests, and the new bulk subcommands have no dedicated CLI tests yet). Optionally also add at least one CLI-level test for `read-bulk`, `write-bulk`, and the `bench-read-bulk` subcommands to keep parity with the .NET / Go / Rust CLI smoke matrix. + +**Resolution:** 2026-05-20 — Added the five missing `@Override` stubs (`readBulk`, `writeBulk`, `write2Bulk`, `writeSecuredBulk`, `writeSecured2Bulk`) to `FakeSession` in `clients/java/mxgateway-cli/src/test/java/com/dohertylan/mxgateway/cli/MxGatewayCliTests.java`, each returning an empty `ArrayList<>` to match the interface return types (`List<BulkReadResult>` / `List<BulkWriteResult>`) without throwing. Imported `BulkReadResult`, `BulkWriteResult`, `WriteBulkEntry`, `Write2BulkEntry`, `WriteSecuredBulkEntry`, `WriteSecured2BulkEntry` from `mxaccess_gateway.v1.MxaccessGateway`. 
`GrpcMxGatewayCliSession` in `MxGatewayCli.java` is the only other implementer and already provides the methods (the source change that introduced the contract added them there). Verified with `gradle clean` followed by `gradle :mxgateway-cli:compileTestJava` and `gradle :mxgateway-cli:test` from `clients/java`, both BUILD SUCCESSFUL. No new CLI-level tests for the bulk subcommands were added — that follow-up is tracked separately and out of scope for this unblock-compilation fix. + +### Client.Java-014 + +| Field | Value | +|---|---| +| Severity | Medium | +| Category | Concurrency & thread safety | +| Location | `clients/java/mxgateway-client/src/main/java/com/dohertylan/mxgateway/client/MxEventStream.java:59-65,117-124` | +| Status | Resolved | + +**Description:** `MxEventStream.observer().beforeStart` simply assigns `requestStream` without checking the `closed` flag, while `close()` reads `requestStream` after setting `closed = true`. If `close()` runs *before* the gRPC call has attached its `ClientCallStreamObserver` (a real race when callers cancel immediately after subscribing — e.g. construct, then close in a `finally` block when an unrelated setup step throws), then at close time `requestStream` is `null`, so `stream.cancel(...)` is skipped. `beforeStart` then fires later, stores the live `requestStream`, and never observes `closed` — the underlying gRPC call leaks open and continues delivering events to a `MxEventStream` whose consumer has stopped iterating. The sibling `DeployEventStream.beforeStart` already does the correct thing (`if (closed.get()) { requestStream.cancel(...); }`); the two adaptors should behave identically. + +**Recommendation:** Mirror `DeployEventStream`'s pattern in `MxEventStream.beforeStart`: after storing `requestStream`, check the `closed` flag and cancel the stream eagerly if a prior `close()` has already fired. 
Add a regression test analogous to `GalaxyRepositoryClientTests.deployEventStreamCloseBeforeBeforeStartCancelsStream` to lock in the behavior. + +**Resolution:** 2026-05-20 — Mirrored `DeployEventStream.beforeStart` in `MxEventStream.beforeStart`: after storing the `ClientCallStreamObserver`, the observer now reads the `closed` flag and calls `requestStream.cancel("client cancelled event stream", null)` when a prior `close()` already fired, closing the close/beforeStart race that previously leaked the underlying gRPC call. The fix uses the existing `volatile boolean closed` field (already established as a happens-before publisher by `close()` setting it before reading `requestStream`); no field shape changes were needed. `clients/java/README.md` documents the new safe-close-before-beforeStart contract. Regression test: `MxGatewayMediumFindingsTests.mxEventStreamCloseBeforeBeforeStartCancelsStream` (mirrors `GalaxyRepositoryClientTests.deployEventStreamCloseBeforeBeforeStartCancelsStream`). + +### Client.Java-015 + +| Field | Value | +|---|---| +| Severity | Medium | +| Category | Concurrency & thread safety | +| Location | `clients/java/mxgateway-client/src/main/java/com/dohertylan/mxgateway/client/MxGatewayChannels.java:112-138`, `MxGatewayClient.java:183-191,224-232,322-329`, `GalaxyRepositoryClient.java:164-170,212-214` | +| Status | Resolved | + +**Description:** `MxGatewayChannels.toCompletable` registers a `whenComplete` on the local `target` future to forward cancellation to the source gRPC `ListenableFuture`. Every caller — `openSessionAsync`, `invokeAsync`, `acknowledgeAlarmAsync`, `discoverHierarchyPageAsync`, `getLastDeployTimeAsync` — then chains `.thenApply(normalisingValidator(...))` or `.thenApply(::getOk)` and returns the *chained* future to the user. `CompletableFuture.thenApply` returns a new future whose cancellation does **not** propagate back to the source `target`. 
Cancelling the user-facing future therefore never sets `target.isCancelled() == true`, so `source.cancel(true)` is never invoked and the underlying gRPC call continues until its deadline expires. The `JavaClientDesign.md` "Streaming" section explicitly says "Stream cancellation should call `ClientCall.cancel`" — the same expectation reasonably applies to the unary `*Async` surface. + +**Recommendation:** Either return `target` directly from each `*Async` method (and inline the validator into the `FutureCallback.onSuccess` path so no `thenApply` is needed), or attach the cancellation listener to the *final* returned future. The cleanest fix is to have `MxGatewayChannels.toCompletable` return a future that wraps the validator internally and registers `whenComplete` on the final future. Add a regression test that cancels the user-facing future and verifies the gRPC call was cancelled (e.g. via a `ServerCallStreamObserver.setOnCancelHandler` latch). + +**Resolution:** 2026-05-20 — Fixed by inlining the reply validator into `MxGatewayChannels.toCompletable` so the user-visible future is the same future cancellation is bound to: added a new `toCompletable(source, operation, validator)` overload that runs the validator inside the `FutureCallback.onSuccess` path (normalising non-`MxGatewayException` `RuntimeException`s through `MxGatewayErrors.fromGrpc`, matching the existing synchronous `try/catch`). Replaced the previous `whenComplete`-based cancellation listener with a small `CancellingCompletableFuture` subclass whose `cancel(boolean)` forwards to the source `ListenableFuture.cancel(...)` unconditionally, so even the no-validator overload propagates cancellation deterministically (the `whenComplete` listener only fired when `target.isCancelled()` was already true, which is exactly the case `thenApply` broke). 
Updated `MxGatewayClient.openSessionAsync`, `MxGatewayClient.invokeAsync`, `MxGatewayClient.acknowledgeAlarmAsync`, `GalaxyRepositoryClient.testConnectionAsync`, and `GalaxyRepositoryClient.getLastDeployTimeAsync` to use the new validator overload directly (no `.thenApply` chain). `GalaxyRepositoryClient.discoverHierarchyAsync` is paged via `thenCompose`, so it now publishes the current in-flight page future via an `AtomicReference` and returns a top-level `CompletableFuture` whose overridden `cancel(boolean)` cancels whichever page is currently outstanding. `clients/java/README.md` documents the new cancellation contract: cancelling any `*Async` future aborts the underlying gRPC call. Regression tests: `MxGatewayMediumFindingsTests.invokeAsyncCancellationCancelsUnderlyingGrpcCall` (full in-process gRPC test using `ServerCallStreamObserver.setOnCancelHandler` to latch when the server observes RPC cancellation), `toCompletableValidatorOverloadForwardsCancellationToSource`, and `toCompletableNoValidatorOverloadForwardsCancellationToSource` (unit-level proofs that both `MxGatewayChannels.toCompletable` overloads forward `cancel(true)` to the source `ListenableFuture`). + +### Client.Java-016 + +| Field | Value | +|---|---| +| Severity | Low | +| Category | Code organization & conventions | +| Location | `clients/java/mxgateway-client/src/main/java/com/dohertylan/mxgateway/client/MxGatewayClient.java:361-391`, `GalaxyRepositoryClient.java:285-315` | +| Status | Resolved | + +**Description:** Client.Java-009 introduced `MxGatewayChannels` to deduplicate `createChannel`, `withDeadline`, `withStreamDeadline`, and `toCompletable`. The two `close()` / `closeAndAwaitTermination()` methods — added shortly after to fix Client.Java-006 — were not extracted along with them. 
The 30-line bodies of `MxGatewayClient.close()` + `closeAndAwaitTermination()` and `GalaxyRepositoryClient.close()` + `closeAndAwaitTermination()` are now duplicated verbatim, including the `awaitTermination(connectTimeout)` semantic (see Client.Java-019), the `InterruptedException` handling, and the `ownedChannel == null` guard. A fix to one path (e.g. introducing a dedicated `shutdownTimeout` option) will silently miss the other. + +**Recommendation:** Move the shutdown logic into `MxGatewayChannels.shutdown(ManagedChannel channel, MxGatewayClientOptions options)` and `MxGatewayChannels.shutdownAndAwaitTermination(...)`. Have both clients delegate to it. Same recommendation applies to the duplicated `MxGatewayAuthInterceptor` construction in the two constructors (`MxGatewayClient(Channel, ...)` and `GalaxyRepositoryClient(Channel, ...)`). + +**Resolution:** 2026-05-20 — Extracted the duplicated shutdown logic into `MxGatewayChannels.shutdown(ManagedChannel, MxGatewayClientOptions)` and `MxGatewayChannels.shutdownAndAwaitTermination(ManagedChannel, MxGatewayClientOptions)`. Both helpers handle the `ownedChannel == null` no-op, the orderly-shutdown / `awaitTermination` / `shutdownNow`-on-timeout escalation, and the `InterruptedException`-restoring-the-interrupt-flag path. `MxGatewayClient.close()`/`closeAndAwaitTermination()` and `GalaxyRepositoryClient.close()`/`closeAndAwaitTermination()` are now one-liners that delegate to the shared helpers, so a future change (such as Client.Java-019's `shutdownTimeout`) lives in one place. Unused `java.util.concurrent.TimeUnit` imports were removed from both clients. The constructor-level `MxGatewayAuthInterceptor` duplication noted in the recommendation was left in place — it is a single intercept call per constructor (2 lines) versus the 30-line shutdown duplication that was the actual maintenance hazard. 
Regression tests: `MxGatewayLowFindingsIITests.sharedShutdownHelperIsNoOpForNullChannel` (covers the null-channel guard), `shutdownAndAwaitTerminationHonoursShutdownTimeoutNotConnectTimeout`, and `shutdownEscalatesToShutdownNowWhenTimeoutExceeded` (cover the shared shutdown semantics; the second is also the Client.Java-019 regression). + +### Client.Java-017 + +| Field | Value | +|---|---| +| Severity | Low | +| Category | Documentation & comments | +| Location | `clients/java/mxgateway-client/src/main/java/com/dohertylan/mxgateway/client/MxEventStream.java:25-36`, `clients/java/README.md:99-107` | +| Status | Resolved | + +**Description:** `MxEventStream.streamEvents` was recently widened from a 16-element buffer to a 1024-element buffer (`MxGatewayClient.streamEvents` at line 268: `new MxEventStream(1024)`). The class-level Javadoc on `MxEventStream` still says "the gateway can push events faster than the consumer drains the bounded 16-element buffer", and `clients/java/README.md` line 103 says "uses gRPC's default auto-inbound flow control with a fixed 16-element buffer". The fail-fast event-backpressure contract (Client.Java-011 resolution) was written against the older capacity. The `MxGatewayClient.streamEvents` inline comment even acknowledges the change ("A small queue overflows on any moderately active session; 1024 covers a realistic backlog"). Users of this surface will reason about realistic backpressure budgets using the wrong number. + +**Recommendation:** Update the `MxEventStream` Javadoc and the README to say "1024-element buffer" (or, since the capacity is a passed parameter, document it as a parameter rather than a constant). Consider exposing the capacity through `MxGatewayClientOptions` so callers can tune it per session. + +**Resolution:** 2026-05-20 — Updated the `MxEventStream` class Javadoc and `clients/java/README.md` so both say "1024-element buffer" instead of the obsolete "16-element buffer". 
The Javadoc also notes that capacity is a constructor parameter and that the production caller (`MxGatewayClient.streamEvents`) passes `1024` to absorb the session-backlog replay burst, so readers understand the value is a deliberate choice rather than a constant. Exposing the capacity through `MxGatewayClientOptions` was intentionally left out of scope — the v1 design keeps the event-stream surface minimal and `MxGatewayClient.streamEvents` is the only caller; if a tuning need arises in v2 the existing constructor already accepts the capacity. + +### Client.Java-018 + +| Field | Value | +|---|---| +| Severity | Low | +| Category | Security | +| Location | `clients/java/mxgateway-client/src/main/java/com/dohertylan/mxgateway/client/MxGatewaySecrets.java:54-66` | +| Status | Resolved | + +**Description:** `redactCredentials(value)` splits its input on `\\s+` (whitespace) and only redacts whitespace-delimited tokens that start with `mxgw_` or equal `bearer` (case-insensitive). gRPC `Status.getDescription()` strings, log lines, and proto error messages can carry credentials separated by colons (`Bearer:mxgw_id_secret`), commas (`token=mxgw_id_secret,scope=...`), single quotes (`'mxgw_id_secret'`), parentheses (`(mxgw_id_secret)`), or embedded in URLs/paths — all of which leave the `mxgw_` token attached to a non-whitespace neighbour and survive redaction. `MxGatewayErrors.fromGrpc` is the primary consumer; a gateway error description like `authentication failed: 'mxgw_id_secret'` would round-trip the secret into the resulting `MxGatewayAuthenticationException` message. + +**Recommendation:** Replace the whitespace-split scrub with a regex-based pass that matches `mxgw_[A-Za-z0-9_-]+` anywhere in the string and substitutes a fixed redaction placeholder; also redact `Bearer\s+\S+` as a unit so the token after `Bearer` is masked regardless of the surrounding punctuation.
Cover with a fixture-style test alongside `MxGatewayFixtureTests.grpcAuthErrorsAreClassifiedAndRedacted` that asserts a quoted or comma-delimited credential is fully masked. + +**Resolution:** 2026-05-20 — Replaced the whitespace-split scrub with two compiled `Pattern` regexes: `mxgw_[A-Za-z0-9_-]+` matches any gateway-shaped credential anywhere in the string regardless of surrounding punctuation, and `(?i)bearer\s+\S+` masks an authorization-header style `Bearer <token>` as a unit so a non-mxgw bearer token cannot leak either. The mxgw pass runs first, so in the common combined case the bearer pass sees `Bearer` followed by an already-masked token and masks it again idempotently. Regression tests in `MxGatewayFixtureTests`: `redactCredentialsHandlesNonWhitespaceDelimitedTokens` exercises single-quoted, double-quoted, comma-delimited, colon-delimited, parenthesised, URL-embedded, and bearer-header credentials; `redactCredentialsLeavesBenignContentAlone` confirms strings without credentials and a `null` input are unchanged. + +### Client.Java-019 + +| Field | Value | +|---|---| +| Severity | Low | +| Category | Performance & resource management | +| Location | `clients/java/mxgateway-client/src/main/java/com/dohertylan/mxgateway/client/MxGatewayClient.java:362-391`, `GalaxyRepositoryClient.java:286-315` | +| Status | Resolved | + +**Description:** Both clients' `close()` / `closeAndAwaitTermination()` use `options.connectTimeout()` as the upper bound on `awaitTermination`. The `connectTimeout` semantically describes how long the client will wait to *establish* the channel, not how long it should wait for in-flight calls and the Netty event loop to drain after `shutdown()`. With the default 10s connect timeout, shutting down a client with a long-running unary call already in flight will silently escalate to `shutdownNow()` and forcibly cancel it before the call's own deadline expires, defeating the deadline contract on `withDeadline`. Conversely, a caller who sets a small `connectTimeout` (e.g.
500 ms for a health probe) inherits an aggressively short shutdown deadline they probably did not intend. + +**Recommendation:** Introduce a dedicated `shutdownTimeout` on `MxGatewayClientOptions` (defaulting to e.g. 5–10 s independent of `connectTimeout`) and use it in `close()` and `closeAndAwaitTermination()`. Document the precedence in the Javadoc. This pairs naturally with the Client.Java-016 deduplication fix. + +**Resolution:** 2026-05-20 — Added a dedicated `shutdownTimeout` `Duration` on `MxGatewayClientOptions` (builder method `shutdownTimeout(Duration)`, accessor `shutdownTimeout()`, default 10 s), independent of `connectTimeout`. Both shared shutdown helpers introduced for Client.Java-016 (`MxGatewayChannels.shutdown` and `shutdownAndAwaitTermination`) call `options.shutdownTimeout()` as the `awaitTermination` upper bound, so a small `connectTimeout` (e.g. a 500 ms health-probe timeout) no longer forces a premature `shutdownNow()` on in-flight calls. The new option is reflected in `toString()` and documented on both helpers and the `close()`/`closeAndAwaitTermination()` Javadoc on both clients; `clients/java/README.md` notes the default and the independence from `connectTimeout`. Regression tests in `MxGatewayLowFindingsIITests`: `shutdownAndAwaitTerminationHonoursShutdownTimeoutNotConnectTimeout` (a 50 ms connect timeout + 1 s shutdown timeout + 200 ms graceful-termination channel never escalates to `shutdownNow()`), `shutdownEscalatesToShutdownNowWhenTimeoutExceeded` (a stuck channel beyond the shutdown timeout is forcibly shut down), and `shutdownTimeoutDefaultIsTenSecondsIndependentOfConnectTimeout` (the default holds even when `connectTimeout` is small). 
+ +### Client.Java-020 + +| Field | Value | +|---|---| +| Severity | Low | +| Category | Correctness & logic bugs | +| Location | `clients/java/mxgateway-cli/src/main/java/com/dohertylan/mxgateway/cli/MxGatewayCli.java:244-254`, `galaxy_repository.proto:94` | +| Status | Resolved | + +**Description:** `galaxy_repository.proto` defines `DeployEvent.sequence` as `uint64`; the protobuf Java mapping projects that to a signed `long`. The CLI's text-mode `galaxy-watch` output prints it as `"seq=%d ..."`, which interprets the value as signed. For genuine wraparound this is implausible (deploy sequences will not reach `2^63`), but the broader pattern is brittle: any unsigned proto field printed via `%d` will display incorrectly past the signed boundary. The JSON path uses `protoJson(event)` which formats unsigned longs as numeric strings via `JsonFormat`, so JSON output is correct; only the text mode is at risk. + +**Recommendation:** Print the sequence with `Long.toUnsignedString(event.getSequence())` (or switch the text format to `%s` and pass the unsigned-string conversion). The same rule should apply to any other `uint64` proto fields that surface in CLI text output. + +**Resolution:** 2026-05-20 — Updated the `galaxy-watch` text-mode `out.printf` in `MxGatewayCli.GalaxyWatchCommand.call()` to use `%s` for the sequence field and pass `Long.toUnsignedString(event.getSequence())`, so deploy sequences past `2^63` render as their correct unsigned decimal string instead of a negative signed long. The JSON path through `protoJson(event)` was already correct (proto `JsonFormat` emits unsigned longs as decimal strings) and was left unchanged. An inline comment near the printf documents the unsigned-uint64 contract so the next person editing the format string knows not to switch back to `%d`. 
Regression test: `MxGatewayCliTests.deployEventSequenceRendersAsUnsignedForHighUint64` exercises the format string with the max-uint64 bit pattern (`-1L`) and asserts the output contains `seq=18446744073709551615` and does not contain `seq=-1`. diff --git a/code-reviews/Client.Python/findings.md b/code-reviews/Client.Python/findings.md index ae27468..7220266 100644 --- a/code-reviews/Client.Python/findings.md +++ b/code-reviews/Client.Python/findings.md @@ -4,25 +4,29 @@ |---|---| | Module | `clients/python` | | Reviewer | Claude Code | -| Review date | 2026-05-18 | -| Commit reviewed | `3cc53a8` | +| Review date | 2026-05-20 | +| Commit reviewed | `1cd51bb` | | Status | Reviewed | | Open findings | 0 | ## Checklist coverage +A re-review at commit `1cd51bb` over the same module. Prior findings +(Client.Python-001 — Client.Python-012) remain closed and are kept as +history. This section reflects categories evaluated in this pass. + | # | Category | Result | |---|---|---| -| 1 | Correctness & logic bugs | Issues found: dead `closed` variable (Client.Python-004); float/bytes value-mapping assumptions (Client.Python-008). | -| 2 | mxaccessgw conventions | Largely adheres; one missing export and a `*_raw` MXAccess-failure documentation gap (Client.Python-002, Client.Python-012). | -| 3 | Concurrency & thread safety | Issue found: `close()` idempotency claim does not hold under concurrent close (Client.Python-006). | -| 4 | Error handling & resilience | Issues found: inconsistent timeout-kwarg fallback (Client.Python-003); `success == 0` default-value hazard (Client.Python-011); inconsistent cancel helpers (Client.Python-007). | -| 5 | Security | No issues found — API keys redacted in repr and CLI output, TLS supported, no secret logging. | -| 6 | Performance & resource management | Issue found: `discover_hierarchy` buffers the whole hierarchy in memory (Client.Python-005). 
| -| 7 | Design-document adherence | Matches the design docs closely; minor CLI doc drift (Client.Python-001). | -| 8 | Code organization & conventions | Issues found: `MxGatewayCommandError` omitted from `__all__` (Client.Python-002); fragile circular-import workaround (Client.Python-010). | -| 9 | Testing coverage | Issue found: `write2`, `add_item2`, bulk-size limits, TLS `ca_file`, and CLI command bodies untested (Client.Python-009). | -| 10 | Documentation & comments | Issue found: stale "scaffold" package description (Client.Python-001). | +| 1 | Correctness & logic bugs | Issue found: `_use_plaintext` silently downgrades any `localhost:` / `127.0.0.1:` endpoint to plaintext (Client.Python-013). | +| 2 | mxaccessgw conventions | No new issues found — secrets redacted, MXAccess parity preserved, generated code untouched, no Blazor/COM violations apply (Python client). | +| 3 | Concurrency & thread safety | No new issues found — close-idempotency hazard fixed in Client.Python-006, shared `_canceling_iterator` cancels on `CancelledError`. | +| 4 | Error handling & resilience | No new issues found at this commit (prior 003, 007, 011 remain closed). | +| 5 | Security | Issue found: implicit plaintext-on-localhost (Client.Python-013) means a user explicitly listing a TLS-fronted loopback endpoint with `--api-key` but without `--tls`/`--plaintext` silently transmits the bearer token in cleartext. | +| 6 | Performance & resource management | No new issues found — `iter_hierarchy` streams pages lazily (Client.Python-005 resolution). | +| 7 | Design-document adherence | No new issues found — `PythonClientDesign.md` matches the implemented surface. | +| 8 | Code organization & conventions | Issue found: duplicate `from mxgateway.values import` lines in `commands.py:22-23` (Client.Python-014). 
| +| 9 | Testing coverage | Issues found: `bench_read_bulk` CLI body, `MAX_AGGREGATE_EVENTS` event-cap, and `_use_plaintext` localhost-auto-plaintext path are untested (Client.Python-015, Client.Python-016). | +| 10 | Documentation & comments | Issues found: `pyproject.toml` lacks PyPI metadata (`authors`, `license`, `classifiers`, `urls`) and no PEP 561 `py.typed` marker (Client.Python-017); auto-plaintext behaviour is undocumented (Client.Python-013). | ## Findings @@ -205,3 +209,258 @@ **Recommendation:** Document explicitly (README + docstring) that `*_raw` methods surface MXAccess HRESULT/status failures only inside the reply and do not raise `MxAccessError`, so parity-test callers know to inspect `protocol_status`/`hresult`/`statuses` themselves. **Resolution:** 2026-05-18 — Won't Fix (no behaviour change). Confirmed this is intentional, correct parity behaviour: the `*_raw` methods exist precisely so parity-test callers can inspect an unmodified gateway reply, including embedded MXAccess HRESULT/status failures, without an exception masking them. Changing `invoke_raw` to raise `MxAccessError` would defeat its purpose and duplicate `Session.invoke`. The finding's only actionable point is the documentation gap, which has been addressed: `clients/python/README.md` now states explicitly that `*_raw` methods enforce gateway protocol success only and do **not** run MXAccess-failure detection, and the docstrings of `GatewayClient.invoke_raw` and `Session.invoke_raw` say the same and point callers to inspect `protocol_status`/`hresult`/`statuses` (and to `Session.invoke` for the checked variant). No code/test change — the runtime contract is unchanged and correct. 
+ +### Client.Python-013 + +| Field | Value | +|---|---| +| Severity | Medium | +| Category | Security | +| Location | `clients/python/src/mxgateway_cli/commands.py:757-762` | +| Status | Resolved | + +**Description:** `_use_plaintext` silently returns `True` whenever the endpoint +string starts with `localhost:` or `127.0.0.1:`, even if neither `--plaintext` +nor `--tls` is supplied on the command line. Any CLI subcommand (e.g. +`mxgw-py open-session --endpoint localhost:5001 --api-key mxgw_...`) then +attaches the API key to a plaintext gRPC channel without warning. This is a +silent security downgrade: a user who deliberately ran the gateway behind TLS +on loopback (e.g. for testing a production-shaped TLS config locally) and who +passes `--api-key` expecting the secret to be transport-protected gets a +plaintext bearer token instead. The auto-downgrade is also undocumented — +`README.md` and the CLI `--help` text both describe `--plaintext` and `--tls` +as the controls, with no mention that endpoint-prefix matching can override +either. The other client CLIs do not auto-downgrade: the .NET CLI uses +`https://`-prefix detection on a URI scheme (an explicit signal), Go and Java +require an explicit `--plaintext`/`--tls` choice, and Rust uses +plaintext only when `plaintext = true` is set on the options struct. + +**Recommendation:** Drop the localhost-prefix auto-plaintext branch and +require the user to pass `--plaintext` or `--tls` (or default to TLS to match +the rest of the matrix). If the implicit-localhost behaviour is kept for +ergonomics, document it prominently in both `README.md` and `--help`, emit a +stderr warning when `--api-key` is combined with the auto-downgrade path, and +add a CLI test asserting the auto-downgrade is in fact active so it is not +silently lost in a future refactor. + +**Resolution:** 2026-05-20 — Removed the silent `localhost:` / `127.0.0.1:` +auto-plaintext branch from `_use_plaintext`.
The new contract matches the Go +and Java CLIs: **TLS is the default**, `--plaintext` is the only way to opt +in to an unencrypted channel, and `--tls` is accepted as a redundant, explicit +affirmation of the default (mutually exclusive with `--plaintext`, which now +raises `click.UsageError`). The `--plaintext` / `--tls` `--help` text and +`clients/python/README.md` both call out the new behaviour. Added six +regression tests in `clients/python/tests/test_cli.py` covering: (a) a +`localhost:` endpoint with no flags resolves to TLS, (b) a `127.0.0.1:` +endpoint with no flags resolves to TLS, (c) `--plaintext` opts in to plaintext, +(d) `--tls` is accepted and idempotent with the default, (e) `--plaintext` +combined with `--tls` is rejected, and (f) an end-to-end CliRunner test +asserting `ClientOptions.plaintext == False` flows through to +`GatewayClient.connect` when no flag is supplied against a `localhost:` +endpoint. **Behaviour change for callers:** scripts that previously relied on +`mxgw-py … --endpoint localhost:5000 …` selecting plaintext silently must now +add an explicit `--plaintext` flag (or set up TLS on the gateway). Calling +`mxgw-py` with an `--api-key` against a plaintext-only gateway without +`--plaintext` will now fail to connect rather than silently leaking the bearer +token. + +### Client.Python-014 + +| Field | Value | +|---|---| +| Severity | Low | +| Category | Code organization & conventions | +| Location | `clients/python/src/mxgateway_cli/commands.py:22-23` | +| Status | Resolved | + +**Description:** `commands.py` has two consecutive `from mxgateway.values +import` lines: + +```python +from mxgateway.values import to_mx_value +from mxgateway.values import MxValueInput +``` + +These import from the same module and should be combined into a single +`from mxgateway.values import MxValueInput, to_mx_value`. 
The split form is +inconsistent with the rest of the file (every other module is imported in a +single statement) and would be flagged by `ruff`/`isort` if any linter were +configured. Pure style, no behavioural impact. + +**Recommendation:** Collapse the two imports into one statement, ordered to +match the conventional alphabetical-within-module pattern: +`from mxgateway.values import MxValueInput, to_mx_value`. + +**Resolution:** 2026-05-20 — Collapsed the two consecutive +`from mxgateway.values import to_mx_value` / `from mxgateway.values import MxValueInput` +lines in `clients/python/src/mxgateway_cli/commands.py` into a single +`from mxgateway.values import MxValueInput, to_mx_value` statement, matching +the alphabetical-within-module pattern used elsewhere in the file. Pure style +fix — no behavioural impact, covered by the existing CLI tests. + +### Client.Python-015 + +| Field | Value | +|---|---| +| Severity | Low | +| Category | Testing coverage | +| Location | `clients/python/src/mxgateway_cli/commands.py:273-294,564-647`, `clients/python/tests/` | +| Status | Resolved | + +**Description:** `_bench_read_bulk` is a ~80-line CLI body that opens its own +session, registers, subscribe_bulks, runs a warm-up loop, a measurement loop, +collects per-call latencies, computes a percentile summary, and emits the +shared cross-language JSON schema. It is the largest untested CLI command in +the module — `tests/` has no `bench_read_bulk` test, fake-stub-driven or +otherwise. A drift in the schema field names (`callsPerSecond`, +`cachedReadResults`, `latencyMs.p50`, …) would break the cross-language +`scripts/bench-read-bulk.ps1` aggregation silently. `_percentile_summary` and +`_percentile` are also untested — the boundary cases (`n == 0`, `n == 1`, +quantile interpolation) would benefit from a small unit test since the +identical algorithm is duplicated in the .NET / Go / Rust / Java drivers and +a divergence would corrupt cross-language comparisons. 
+ +**Recommendation:** Add a fake-stub-driven `bench_read_bulk` test that drives +a short `--duration-seconds 0 --warmup-seconds 0` run through `CliRunner` and +asserts the JSON schema (`language == "python"`, the full key set, +`latencyMs.p50/p95/p99/max/mean` present). Add unit tests for `_percentile` +covering `n == 0`, `n == 1`, and a known-good interpolated value at p95 so +the implementation cannot silently drift from the other clients. + +**Resolution:** 2026-05-20 — Added `clients/python/tests/test_cli_bench_and_helpers.py` +with three layers of coverage. (1) `_percentile` unit tests pin the +cross-language algorithm (`rank = q * (n - 1)`, linear interpolation between +adjacent ranks): empty sample returns `0.0`, single element returns that +element, exact-rank queries return the sample value (p50 of `[10,20,30,40,50]` +is `30.0`), and the interpolated p95/p99 values (`48.0` / `49.6` for that same +five-element sample) are locked down so any drift from the .NET / Go / Rust / +Java drivers fails fast. (2) `_percentile_summary` tests assert the full +`{p50, p95, p99, max, mean}` dict shape, the zero-sample placeholder, and the +3-decimal rounding contract. (3) A `bench-read-bulk` smoke test +(`test_bench_read_bulk_emits_cross_language_schema`) drives the CLI through +`CliRunner` with `--duration-seconds 0 --warmup-seconds 0` against a fake stub +that handles `OpenSession`, `Register`, `SubscribeBulk`, `ReadBulk`, and +`UnsubscribeBulk`, then asserts the emitted JSON has exactly the 16 +cross-language schema keys (`language`, `command`, `endpoint`, `clientName`, +`bulkSize`, `durationSeconds`, `warmupSeconds`, `durationMs`, `tags`, +`totalCalls`, `successfulCalls`, `failedCalls`, `totalReadResults`, +`cachedReadResults`, `callsPerSecond`, `latencyMs`) and that `latencyMs` is a +`{p50, p95, p99, max, mean}` sub-object — guarding against silent breakage of +`scripts/bench-read-bulk.ps1`'s cross-language aggregation. 
No source change — +this is a pure coverage finding. + +### Client.Python-016 + +| Field | Value | +|---|---| +| Severity | Low | +| Category | Testing coverage | +| Location | `clients/python/src/mxgateway_cli/commands.py:25,757-775,805-830` | +| Status | Resolved | + +**Description:** Three CLI helper paths are not covered by `tests/`: + +1. `_use_plaintext` localhost auto-downgrade (line 762) — the + `endpoint.startswith("localhost:") or endpoint.startswith("127.0.0.1:")` + branch (see also Client.Python-013) is untested; no test asserts that an + endpoint without `--plaintext` and without `--tls` resolves to plaintext. +2. `_collect_events` `MAX_AGGREGATE_EVENTS` guard (lines 811-815) — passing + `--max-events` greater than `MAX_AGGREGATE_EVENTS` raises + `click.BadParameter`, but no test exercises the guard. A silent removal of + the constant or the comparison would not be caught. +3. `_api_key_from_env` (lines 765-768) — only the implicit path through + `_secrets` is exercised; there is no test that verifies an env-var name + resolves to a value and that an unset env var produces `None`. + +These are all small, fake-stub-driven CLI behaviours rather than end-to-end +paths. The previous coverage finding (Client.Python-009) was closed without +adding tests for these specific paths. + +**Recommendation:** Add three small `CliRunner` / unit tests: one asserting +the localhost auto-plaintext (or its replacement, if Client.Python-013 is +fixed), one asserting `--max-events 10001` exits non-zero with the +`MAX_AGGREGATE_EVENTS` error message, and one asserting +`_api_key_from_env("MXGATEWAY_API_KEY")` returns the env value and `None` for +an unset variable.
+ +**Resolution:** 2026-05-20 — Scope adjusted: Client.Python-013 has since +removed the `_use_plaintext` localhost auto-plaintext branch, so item (1) is +no longer a real code path — the +`test_use_plaintext_requires_explicit_flag_for_localhost_endpoint` and +`test_cli_localhost_endpoint_defaults_to_tls_via_open_session` regressions +added under Client.Python-013 already pin the new TLS-by-default contract. +The remaining two helpers are now covered in +`clients/python/tests/test_cli_bench_and_helpers.py`. (2) +`MAX_AGGREGATE_EVENTS` cap: +`test_collect_events_rejects_max_events_above_aggregate_cap` drives +`stream-events` with `--max-events 10001` through `CliRunner` against +stubbed `_connect` / `_session` fakes and asserts the CLI exits non-zero with +the documented `less than or equal to 10000` message; +`test_collect_events_accepts_max_events_at_aggregate_cap_boundary` confirms +`--max-events 10000` is accepted at the boundary and returns an empty event +list. (3) `_api_key_from_env`: +`test_api_key_from_env_resolves_value_when_variable_is_set` (env-var +populated → returned), +`test_api_key_from_env_returns_none_when_variable_is_unset` (env-var unset +→ `None`), `test_api_key_from_env_returns_none_when_name_is_none` (the +`name is None` early-return), and +`test_api_key_from_env_returns_none_when_name_is_empty_string` (the +`if not name` truthiness guard). No source change — pure coverage finding. + +### Client.Python-017 + +| Field | Value | +|---|---| +| Severity | Low | +| Category | Documentation & comments | +| Location | `clients/python/pyproject.toml:5-25`, `clients/python/src/mxgateway/` | +| Status | Resolved | + +**Description:** The package metadata in `pyproject.toml` is minimal for a +published wheel: + +* No `authors` field. PyPI / `pip show` will display no author. +* No `license` field, no `license-files` field, and no `LICENSE` file is + referenced from the project. 
The repo as a whole has no top-level + `LICENSE` either, but other client packages (Java has a license entry, the + .NET package has a license expression in the `csproj`) tend to set this. +* No `classifiers` (no `Programming Language :: Python :: 3.12`, + `Operating System :: Microsoft :: Windows`, `Topic :: …`, no + development-status classifier). Without these the PyPI search facets are + empty and tooling like `pip` cannot tell whether the package is + alpha/beta/stable. +* No `keywords`, no `[project.urls]` (no homepage / source / issue link + pointing back to the repo). +* The package ships no PEP 561 `py.typed` marker file in + `src/mxgateway/`. Type hints are written throughout the module + (`from __future__ import annotations`, full annotations on every public + function), but downstream consumers running `mypy` on `mxaccess-gateway-client` + will not see those hints — PEP 561 requires the marker file before type + checkers will use a package's inline annotations. + +**Recommendation:** Add `authors`, `license` (an SPDX expression), `keywords`, and +`[project.urls]` to `pyproject.toml`; add at least the standard `classifiers` +trio (`Development Status`, `Programming Language :: Python :: 3.12`, +`Intended Audience`); create an empty `src/mxgateway/py.typed` file and +include it in the wheel via `[tool.setuptools.package-data]` so consumers +running `mypy` against an installed wheel pick up the type information.
+ +**Resolution:** 2026-05-20 — Filled out `clients/python/pyproject.toml` +with the missing PyPI metadata: `authors = [{ name = "MXAccess Gateway +Authors" }]`, `license = "Proprietary"` (the repo has no top-level +`LICENSE` file and no other client publishes under an OSS licence, so the +SPDX `Proprietary` expression matches the de-facto status), the standard +classifier set (`Development Status :: 4 - Beta`, `Intended Audience :: +Developers` / `Information Technology`, `Operating System :: Microsoft :: +Windows` and `:: POSIX`, `Programming Language :: Python` / +`Python :: 3` / `Python :: 3.12`, `Topic :: Software Development :: +Libraries :: Python Modules`, `Topic :: System :: Distributed Computing`, +and `Typing :: Typed`), a `keywords` list +(`mxaccess`, `archestra`, `gateway`, `grpc`, `industrial`, `scada`), and +`[project.urls]` with `Homepage` / `Source` / `Issues` pointing at the +Gitea repo. Added the PEP 561 marker file +`clients/python/src/mxgateway/py.typed` (empty, as the spec requires) and +declared it in `[tool.setuptools.package-data] mxgateway = ["py.typed"]` +so the wheel ships the marker and downstream `mypy` users see the +inline type hints. Pure metadata / packaging change — `python -m pytest -q` +still passes (91 tests). 
diff --git a/code-reviews/Client.Rust/findings.md b/code-reviews/Client.Rust/findings.md index 3592d07..ccb90cc 100644 --- a/code-reviews/Client.Rust/findings.md +++ b/code-reviews/Client.Rust/findings.md @@ -4,25 +4,27 @@ |---|---| | Module | `clients/rust` | | Reviewer | Claude Code | -| Review date | 2026-05-18 | -| Commit reviewed | `3cc53a8` | +| Review date | 2026-05-20 | +| Commit reviewed | `1cd51bb` | | Status | Reviewed | | Open findings | 0 | ## Checklist coverage +This re-review (`1cd51bb`) covers the changes added since `3cc53a8`: the new bulk-write/read methods on `Session`, the `read_bulk` borrowed-slice signature, `MalformedReply` / `Unavailable` error variants, the projection-on-demand `MxValue`/`MxArrayValue`, the `next_correlation_id` rework, the new ReadBulk and bulk-write CLI subcommands, and the cross-language `bench-read-bulk` driver. + | # | Category | Result | |---|---|---| -| 1 | Correctness & logic bugs | Issues found: a stale unit test fails the suite (Client.Rust-003); handle extractors silently return 0 on a shapeless OK reply (Client.Rust-005). | -| 2 | mxaccessgw conventions | `cargo clippy --workspace --all-targets -- -D warnings` fails (Client.Rust-001, Client.Rust-002, Client.Rust-012), violating a CLAUDE.md hard requirement; hard-coded correlation ids (Client.Rust-011). | -| 3 | Concurrency & thread safety | No issues found — clients are cheaply cloneable, streams are `Send`, drop-cancels-call is verified. | -| 4 | Error handling & resilience | Issues found: empty-vec on shapeless bulk reply (Client.Rust-006); no transient/permanent classification (Client.Rust-010). | -| 5 | Security | No issues found — API keys redacted in `Debug`/`Display`, status messages scrubbed, TLS handled correctly. | -| 6 | Performance & resource management | Issue found: value/array projections clone every element, doubling array memory (Client.Rust-008). 
| -| 7 | Design-document adherence | Issue found: `RustClientDesign.md` documents a stale crate layout and an unused `tracing` dependency (Client.Rust-007). | -| 8 | Code organization & conventions | Issue found: `BulkReplyKind` trips a clippy lint; undocumented public methods (Client.Rust-001, Client.Rust-002). | -| 9 | Testing coverage | Issue found: TLS setup, mid-stream fault propagation, and the bulk-size cap untested (Client.Rust-009). | -| 10 | Documentation & comments | Issue found: the version-constant doc comment is wrong (Client.Rust-004). | +| 1 | Correctness & logic bugs | Issue found: `read_bulk` is missing the OK-but-shapeless `MalformedReply` symmetry of the other bulk helpers, but the bigger issue is no test exercises any of the new `MalformedReply` paths (Client.Rust-016). | +| 2 | mxaccessgw conventions | Issue found: `cargo clippy --workspace --all-targets -- -D warnings` still fails — a fresh `clippy::doc_lazy_continuation` violation in `ReadBulkCommand`'s generated doc comment trips the lint that the prior fixes did not anticipate (Client.Rust-013). CLI subcommands still emit hard-coded `client_correlation_id` strings on the `raw` paths (Client.Rust-014). | +| 3 | Concurrency & thread safety | No issues found — `CORRELATION_SEQUENCE` is `AtomicU64` with `Relaxed`, which is correct for monotonic id generation; clients remain cheaply cloneable; streams are `Send`. | +| 4 | Error handling & resilience | Issue found: `bench-read-bulk` records every `read_bulk` failure into the latency histogram as if it succeeded, skewing p99/max upward (Client.Rust-015). The new `Error::Unavailable` mapping looks correct. | +| 5 | Security | No issues found — API keys still redacted in `Debug`/`Display`, status messages scrubbed, secret arguments unchanged. | +| 6 | Performance & resource management | No issues found in the changed code — `read_bulk` is honest about the unavoidable owned-Vec materialisation; projection-on-demand is now lazy. 
| +| 7 | Design-document adherence | Issue found: `RustClientDesign.md` was refreshed but never grew the new bulk-write/read methods, the `Unavailable`/`MalformedReply` error variants, or the `bench-read-bulk` CLI command on its current surface (Client.Rust-017). | +| 8 | Code organization & conventions | No new issues — `BulkWriteReplyKind` follows the renamed `BulkReplyKind` shape. | +| 9 | Testing coverage | Issue found: none of the new code paths (bulk-write helpers, `read_bulk`, `MalformedReply`, `Error::Unavailable`, the `bench-read-bulk` flow) are covered by client-side tests (Client.Rust-016). | +| 10 | Documentation & comments | No new issues beyond Client.Rust-017. | ## Findings @@ -205,3 +207,106 @@ **Recommendation:** Dereference instead of cloning: `*self.state.last_deploy.lock().unwrap()`. **Resolution:** Resolved in `0d8a28d` (2026-05-18): replaced `.clone()` with a deref. `cargo clippy --workspace --all-targets -- -D warnings` now passes cleanly. + +### Client.Rust-013 + +| Field | Value | +|---|---| +| Severity | High | +| Category | mxaccessgw conventions | +| Location | `src/MxGateway.Contracts/Protos/mxaccess_gateway.proto:414-424` (origin); `clients/rust/src/generated.rs:11-31` (suppression site) | +| Status | Resolved | + +**Description:** `cargo clippy --workspace --all-targets -- -D warnings` fails again on this commit, this time on a `clippy::doc_lazy_continuation` violation in generated code: + +``` +error: doc list item without indentation + --> .../mxaccess_gateway.v1.rs:526:5 + | +526 | /// `timeout_ms == 0` uses the gateway-configured default (1000 ms). + | ^ +``` + +The lint fires because the `ReadBulkCommand` proto comment (added with the bulk Read feature in commit `5e375f6`) writes a bulleted list and then a trailing paragraph without the required blank line. prost-build forwards the proto comment verbatim into Rust doc comments, and the Rust client compiles those generated modules with crate-default lints. 
The crate already opts out of `clippy::large_enum_variant` in `src/generated.rs` for exactly this kind of generator-style problem, but `doc_lazy_continuation` is not on the allow-list, so the lint reaches `-D warnings` and breaks the documented `cargo clippy --workspace --all-targets -- -D warnings` invocation that CLAUDE.md mandates pass. The Rust client review was previously closed as clippy-clean (Client.Rust-001/002/012); this is the third clippy-clean regression caused by generated code in this module and warrants a more durable fix. + +**Recommendation:** Add `#![allow(clippy::doc_lazy_continuation)]` to each generated submodule in `clients/rust/src/generated.rs` alongside `clippy::large_enum_variant`, so generated doc comments — which the client cannot edit — cannot break the `-D warnings` build. Independently, fix the upstream proto comment to insert a blank line before the trailing paragraph so the C# / Go / Python / Java generators do not carry the same flaky text. + +**Resolution:** 2026-05-20 — Added `#![allow(clippy::doc_lazy_continuation)]` to each generated submodule in `clients/rust/src/generated.rs` next to the existing `clippy::large_enum_variant` allow, and reformatted the `ReadBulkCommand` proto comment in `src/MxGateway.Contracts/Protos/mxaccess_gateway.proto` to surround the bulleted list with blank lines so doc-comment generators in every language see a properly-terminated list. `cargo clippy --workspace --all-targets -- -D warnings` and `cargo test --workspace` now pass, and `dotnet build src/MxGateway.Contracts/MxGateway.Contracts.csproj` reports 0 warnings. 
+ +### Client.Rust-014 + +| Field | Value | +|---|---| +| Severity | Low | +| Category | mxaccessgw conventions | +| Location | `clients/rust/crates/mxgw-cli/src/main.rs:450,497` | +| Status | Resolved | + +**Description:** Client.Rust-011 made `Session` build unique correlation ids per call, but the `mxgw` CLI's `Ping` and `CloseSession` subcommands still hard-code `client_correlation_id: "rust-cli-ping".to_owned()` and `"rust-cli-close-session".to_owned()`. Both go through `client.invoke(…)` / `client.close_session_raw(…)` rather than the `Session` helpers, so the library's id generator does not run. The CLI is the cross-language e2e driver — when the same machine runs concurrent CLI smokes, every `ping`/`close-session` request collides on the same correlation id in gateway logs, defeating the diagnostic value the library fix unlocked. + +**Recommendation:** Either (a) expose `session::next_correlation_id` as a `pub(crate)` or library-level helper and have the CLI call it from `Ping`/`CloseSession`, or (b) replace these RPCs with the higher-level `Session` helpers (`Session::close`, and a thin `Session::ping` wrapper) so the CLI shares the library's correlation-id discipline by construction. + +**Resolution:** 2026-05-20 — Promoted `session::next_correlation_id` from a module-private helper to a `pub` library-level function (it already lived in the `pub mod session`) and updated the `mxgw` CLI's `Ping` and `CloseSession` subcommands to call `mxgateway_client::session::next_correlation_id("cli-ping")` / `next_correlation_id("cli-close-session")` instead of the hard-coded `"rust-cli-ping"` / `"rust-cli-close-session"` strings. Concurrent CLI smokes now produce unique correlation ids per call — driven by the same process-wide `CORRELATION_SEQUENCE` `AtomicU64` the library uses — so gateway logs can tell concurrent requests apart again. `cargo fmt`, `cargo build --workspace`, `cargo clippy --workspace --all-targets -- -D warnings`, and `cargo test --workspace` all pass.
+ +### Client.Rust-015 + +| Field | Value | +|---|---| +| Severity | Medium | +| Category | Error handling & resilience | +| Location | `clients/rust/crates/mxgw-cli/src/main.rs:1053-1070` | +| Status | Resolved | + +**Description:** The new cross-language benchmark `bench-read-bulk` pushes the elapsed time of every `read_bulk` call into `latencies_ms` regardless of whether the call returned `Ok` or `Err`: + +```rust +let outcome = session.read_bulk(server_handle, &tags, timeout_ms).await; +let elapsed_ms = call_start.elapsed().as_secs_f64() * 1000.0; +latencies_ms.push(elapsed_ms); +match outcome { + Ok(results) => { successful_calls += 1; … } + Err(_) => failed_calls += 1, +} +``` + +A failed `read_bulk` (transient `Unavailable`, deadline-exceeded mid-call, etc.) typically returns *later* than a successful one — it includes the full per-call timeout that the success path never waits for. The histogram therefore conflates "p99 cached-read latency" with "p99 of (cached-read + timed-out call)", and the JSON document the PowerShell driver collates publishes `latencyMs.p99` / `latencyMs.max` that no longer represent successful-call latency. Worse, the failure category is silently dropped (`Err(_) => failed_calls += 1`) so a benchmark run that fails on every call still emits a coherent-looking JSON without ever surfacing why. This is misleading for a benchmark whose JSON shape is the cross-language comparison contract. + +**Recommendation:** Only push elapsed time into `latencies_ms` on `Ok`, or split into two histograms (`successLatencyMs` and `failureLatencyMs`) and log the first failure's error string into the stats record so a partial-failure run is visible at the report layer. + +**Resolution:** 2026-05-20 — Extracted the per-iteration accounting in `bench-read-bulk` into a `BenchReadBulkStats` helper with explicit `record_success`/`record_failure` methods. 
Successful `read_bulk` calls now flow into `success_latencies_ms` (driving the cross-language `latencyMs.p99`/`max` JSON contract), failures flow into a separate `failure_latencies_ms` histogram surfaced as `failureLatencyMs`, and the first failure's redacted error string is stashed as `firstFailure` so a partial-failure run is visible at the report layer instead of producing a coherent-looking JSON that hides every error. Added a unit test (`bench_read_bulk_stats_keeps_failures_out_of_success_latency_histogram`) that records two fast successes plus a deliberately slow failure and asserts the success histogram never sees the failure latency, plus a smaller smoke test for the zero-duration calls-per-second path. + +### Client.Rust-016 + +| Field | Value | +|---|---| +| Severity | Medium | +| Category | Testing coverage | +| Location | `clients/rust/tests/client_behavior.rs`, `clients/rust/src/session.rs:489-519,654-768` | +| Status | Resolved | + +**Description:** The fixes for Client.Rust-005 / 006 added five new `Error::MalformedReply` paths to `session.rs` (`register_server_handle`, `add_item_handle`, `add_item2_handle`, `bulk_results`, `bulk_write_results`) plus the inline branch in `read_bulk`. None of them are exercised by tests — every test in `client_behavior.rs` feeds the matching payload back to the client, so the malformed-reply branches are dead code from the test suite's perspective. The new bulk-write helpers (`write_bulk`, `write2_bulk`, `write_secured_bulk`, `write_secured2_bulk`) only have a single happy-path assertion via `write_bulk`, leaving the three other variants and every per-entry-failure shape untested. The bench-read-bulk flow has no test (the driver script is the only consumer). The `Error::Unavailable` variant from Client.Rust-010 is covered by `event_stream_surfaces_a_mid_stream_status_fault`, but the same variant on a unary `Code::Unavailable` is not. 
+ +**Recommendation:** Add three light tests against the existing `FakeGateway`: + +1. Have the fake reply to `AddItem` (or `Register` / `AddItem2`) with `protocol_status = Ok` and no payload, and assert the client surfaces `Error::MalformedReply`. +2. Have the fake reply to `WriteBulk` with `protocol_status = Ok` and the wrong payload arm (e.g. an `AddItemReply` body), and assert `Error::MalformedReply`. +3. Have the fake fail the unary `Invoke` with `Status::unavailable(...)` and assert `Error::Unavailable`. + +Optionally add Write2Bulk / WriteSecuredBulk / WriteSecured2Bulk smoke assertions so all four bulk-write families have at least one round-trip test. + +**Resolution:** 2026-05-20 — Added ten new integration tests in `clients/rust/tests/client_behavior.rs`. Each new `Error::MalformedReply` site is exercised via a test-only `InvokeOverride` injected into `FakeState` that lets a single test pin the fake gateway's `Invoke` handler to one of three malformed shapes (OK reply with no payload, OK reply with the wrong payload arm for `read_bulk`, OK reply with the wrong payload arm for the other bulk / bulk-write families): `register_returns_malformed_reply_when_ok_reply_has_no_payload`, `add_item_returns_malformed_reply_when_ok_reply_has_no_payload`, `add_item2_returns_malformed_reply_when_ok_reply_has_no_payload`, `subscribe_bulk_returns_malformed_reply_on_mismatched_payload_arm`, `write_bulk_returns_malformed_reply_on_mismatched_payload_arm`, and `read_bulk_returns_malformed_reply_on_mismatched_payload_arm`. The unary `Error::Unavailable` path is covered by `unary_invoke_maps_status_unavailable_to_error_unavailable` (the override returns `Status::unavailable(...)`).
The remaining three bulk-write families gained round-trip smoke tests — `write2_bulk_round_trips_through_the_fake_gateway`, `write_secured_bulk_round_trips_through_the_fake_gateway`, `write_secured2_bulk_round_trips_through_the_fake_gateway` — extending the fake gateway's dispatcher with happy-path replies for `Write2Bulk` / `WriteSecuredBulk` / `WriteSecured2Bulk`. The `bench-read-bulk` flow gets a `BenchReadBulkStats` unit test in `crates/mxgw-cli/src/main.rs` (see Client.Rust-015) that asserts the latency-tracking change keeps failed-call durations out of `latencyMs`. + +### Client.Rust-017 + +| Field | Value | +|---|---| +| Severity | Low | +| Category | Design-document adherence | +| Location | `clients/rust/RustClientDesign.md:79-99,156-163` | +| Status | Resolved | + +**Description:** CLAUDE.md requires docs to change with the source. `RustClientDesign.md` was refreshed to fix the layout/`tracing` drift (Client.Rust-007), but the Session API surface in the design (`Library API` block, lines 79-99) still lists only the original six bulk helpers — `add_item_bulk`, `advise_item_bulk`, `remove_item_bulk`, `un_advise_item_bulk`, `subscribe_bulk`, `unsubscribe_bulk` — and is missing the four new bulk-write helpers and `read_bulk` (`write_bulk`, `write2_bulk`, `write_secured_bulk`, `write_secured2_bulk`, `read_bulk`) that landed in commits `5e375f6` / `f220908` / `61644e6`. The `Error Handling` block (lines 130-146) still enumerates `Transport`, `Status`, `Authentication`, `Authorization`, `Session`, `Worker`, `Command`, `MxAccess`, `Timeout`, `Cancelled` — but not `MalformedReply`, `Unavailable`, or `InvalidEndpoint`, all of which are now public variants of the crate's `Error` enum. The `Test CLI` block (lines 158-163) lists `version` / `smoke` / `stream-events` / `write` but is missing every new subcommand (`read-bulk`, `write-bulk`, `write2-bulk`, `write-secured-bulk`, `write-secured2-bulk`, `bench-read-bulk`, `galaxy watch`).
+ +**Recommendation:** Bring the design doc back in sync: extend the `Session` API code block to enumerate the bulk-write/read methods, expand the `Error` enum to match `clients/rust/src/error.rs`, and add the missing CLI subcommands. The README is already up to date, so this is design-doc-only churn. + +**Resolution:** 2026-05-20 — Brought `clients/rust/RustClientDesign.md` back in sync with the implementation. The `Session` block now lists the five new bulk helpers (`write_bulk`, `write2_bulk`, `write_secured_bulk`, `write_secured2_bulk`, `read_bulk`) alongside the original six and notes that `session::next_correlation_id` is `pub` for raw-RPC consumers (the CLI). The `Error` enum block now matches `clients/rust/src/error.rs` — `InvalidEndpoint`, `InvalidArgument`, `Transport`, `Authentication`, `Authorization`, `Timeout`, `Cancelled`, `Unavailable`, `Status`, `Command`, `ProtocolStatus`, `MalformedReply` — with a short paragraph explaining what `Unavailable`, `MalformedReply`, and `InvalidEndpoint` classify. The `Test CLI` block enumerates every subcommand the binary exposes today: `version`, `ping`, `open-session`, `close-session`, `register`, `add-item`, `advise`, `subscribe-bulk`, `unsubscribe-bulk`, `read-bulk`, `write`, `write2`, `write-bulk`, `write2-bulk`, `write-secured-bulk`, `write-secured2-bulk`, `stream-events`, `bench-read-bulk`, `smoke`, and the `galaxy {test-connection,last-deploy-time,discover-hierarchy,watch}` subtree. 
diff --git a/code-reviews/Contracts/findings.md b/code-reviews/Contracts/findings.md index c62223d..52fa8f2 100644 --- a/code-reviews/Contracts/findings.md +++ b/code-reviews/Contracts/findings.md @@ -4,25 +4,27 @@ |---|---| | Module | `src/MxGateway.Contracts` | | Reviewer | Claude Code | -| Review date | 2026-05-18 | -| Commit reviewed | `6c64030` | +| Review date | 2026-05-20 | +| Commit reviewed | `1cd51bb` | | Status | Reviewed | | Open findings | 0 | ## Checklist coverage +This re-review focuses on the contract delta introduced since the prior review at `6c64030` — primarily the new bulk write/read command family added in `5e375f6` (`WriteBulk`, `Write2Bulk`, `WriteSecuredBulk`, `WriteSecured2Bulk`, `ReadBulk`) plus the resolution changes for Contracts-001/002/004/005/006/007/008. + | # | Category | Result | |---|---|---| -| 1 | Correctness & logic bugs | No functional bugs; one missing reply-payload case for the by-name ack command and an `int32`-typed `success` flag that reads like a bool (Contracts-002, Contracts-006). | -| 2 | mxaccessgw conventions | Additive-only evolution honored (no renumbered/removed tags), MXAccess-aligned naming consistent, generated code untouched; no `reserved` statements declared as a guardrail (Contracts-005). | +| 1 | Correctness & logic bugs | New bulk command kinds, `BulkWriteResult`, and `BulkReadResult` align with the worker executor, validator (`MxAccessGrpcRequestValidator.ExpectedPayload`), and `MxAccessSession.ReadBulk`. Field numbering is contiguous and additive (10-43 on `MxCommand.payload`, 20-40 on `MxCommandReply.payload`); no collisions. No new functional bugs. | +| 2 | mxaccessgw conventions | Additive-only evolution preserved across all three protos; new wire-compatibility policy comment block (added under Contracts-005) is honored by the bulk additions; generated code untouched; naming and oneof usage are consistent with the style guide. No new violations. 
| | 3 | Concurrency & thread safety | N/A — pure contract definitions plus a static const class with no shared mutable state. | -| 4 | Error handling & resilience | HRESULT / `MxStatusProxy` / `ProtocolStatus` carriers are complete; the worker-side by-name alarm ack has no dedicated reply payload (Contracts-002). | -| 5 | Security | Credential-sensitive fields are clearly commented; no secrets forced into loggable shapes. No issues found. | -| 6 | Performance & resource management | `DiscoverHierarchy` is paged; alarm-snapshot streams are server-streamed; no bloat issues. No issues found. | -| 7 | Design-document adherence | `.proto` files match design intent but `docs/Grpc.md` is stale (Contracts-001); worker vs public alarm-status shapes unreconciled in docs (Contracts-008). | -| 8 | Code organization & conventions | Package/file layout correct; stale class summary (Contracts-004). Contracts-003 (`mxaccess_worker.proto` Protobuf item missing `ProtoRoot`) was re-triaged as not-a-defect — the attribute is already present. | -| 9 | Testing coverage | Gateway/worker/alarm round-trips covered; Galaxy Repository protos and raw `MxArray` paths untested (Contracts-007). | -| 10 | Documentation & comments | Proto comments accurate and domain-rich; one stale class summary (Contracts-004). | +| 4 | Error handling & resilience | `BulkWriteResult` carries the full `was_successful` + `hresult` + `statuses` + `error_message` carriers per entry; `BulkReadResult` carries `was_successful` + `was_cached` + per-entry value and statuses. The asymmetry (no `hresult` on `BulkReadResult`) is intentional given ReadBulk's lifecycle. No issues. | +| 5 | Security | The new `WriteSecuredBulkCommand` / `WriteSecured2BulkCommand` carry the redaction note on the outer command only, not on the inner entry's `value` field (Contracts-011); otherwise no secrets forced into loggable shapes. 
| +| 6 | Performance & resource management | `ReadBulk` is the only command without a 1:1 MXAccess analogue; the per-entry timeout shape (`uint32 timeout_ms`) and `was_cached` semantics avoid disturbing existing subscriptions. No bloat issues. | +| 7 | Design-document adherence | `gateway.md` documents the bulk write/read families, but `docs/Contracts.md` was not updated for them (Contracts-009). This violates the CLAUDE.md "update docs in the same commit as the source" rule for the bulk-read/write addition. | +| 8 | Code organization & conventions | Package / namespace / file layout correct; additive-only contract evolution observed; field numbers continuous and isolated by 100+ from diagnostic/control commands. No new issues. | +| 9 | Testing coverage | The bulk write/read families have no `ProtobufContractRoundTripTests` coverage (Contracts-010); Galaxy Repository protos and `MxArray` raw paths are now covered (per Contracts-007 resolution). | +| 10 | Documentation & comments | `GalaxyAttribute.mx_data_type` lacks an in-proto comment explaining it is a raw Galaxy integer (Contracts-012); the `GatewayContractInfoTests` summary is now stale (Contracts-013); credential-sensitive bulk entry `value` fields lack per-field redaction comments (Contracts-011). | ## Findings @@ -145,3 +147,78 @@ **Recommendation:** Document in `docs/Contracts.md` (or `AlarmClientDiscovery.md`) how the worker `native_status` maps onto the public reply's `status`/`hresult` pair so client authors know which field is authoritative. **Resolution:** _(2026-05-18)_ Verified against `WorkerAlarmRpcDispatcher.AcknowledgeAsync`. The asymmetry is larger than the finding implies: the dispatcher copies the worker `MxCommandReply.hresult` into `AcknowledgeAlarmReply.hresult` but **never** assigns `AcknowledgeAlarmReply.status` — the `MxStatusProxy status` field is left UNSET on every reply. 
The proto comment on `status` ("Native MxAccess status describing the outcome of the ack") was therefore actively misleading. Fixed: (1) reworded the `mxaccess_gateway.proto` comments on `AcknowledgeAlarmReply.hresult` (now identifies it as the authoritative native-return-code field) and `AcknowledgeAlarmReply.status` (now states it is reserved/unset and clients must not depend on it); (2) extended `docs/AlarmClientDiscovery.md` section 4 with a "Worker `native_status` → public `AcknowledgeAlarmReply` mapping" subsection spelling out that `hresult` is authoritative (`0` = success) and `status` is always unset, and that clients should branch on `protocol_status` then `hresult`, never `status`. + +### Contracts-009 + +| Field | Value | +|---|---| +| Severity | Medium | +| Category | Design-document adherence | +| Location | `docs/Contracts.md:13-24` | +| Status | Resolved | + +**Description:** Commit `5e375f6` ("Add bulk read/write command family across worker, gateway, and clients") added five new command kinds — `WriteBulk`, `Write2Bulk`, `WriteSecuredBulk`, `WriteSecured2Bulk`, `ReadBulk` — plus the `BulkWriteReply` / `BulkWriteResult` and `BulkReadReply` / `BulkReadResult` shapes to `mxaccess_gateway.proto`. `gateway.md` (lines 299-322) was updated in that commit, but `docs/Contracts.md` was not. It still describes only the older bulk subscription family (`AddItemBulk`, `AdviseItemBulk`, `RemoveItemBulk`, `UnAdviseItemBulk`, `SubscribeBulk`, `UnsubscribeBulk`) returning `BulkSubscribeReply` with no mention of the bulk write/read commands or their per-entry result types. The CLAUDE.md rule "Update docs in the same change as the source. When public APIs, contracts, configuration, build steps, security behavior, event shapes, value conversion, status mapping, or lifecycle rules change, the affected docs … must change in the same commit" was violated for this addition. 
The result is that the canonical contracts document undercounts the public bulk surface by five commands. + +**Recommendation:** Extend the bulk-commands paragraph in `docs/Contracts.md` to list the new `WriteBulk` / `Write2Bulk` / `WriteSecuredBulk` / `WriteSecured2Bulk` / `ReadBulk` command kinds, the per-entry request shape (`WriteBulkEntry` etc.), and the new reply types (`BulkWriteReply` carrying `BulkWriteResult`; `BulkReadReply` carrying `BulkReadResult`). Cross-reference `gateway.md` for the cached-vs-snapshot `ReadBulk` lifecycle and `docs/DesignDecisions.md` "Bulk Command Family" for the per-entry-result rationale rather than re-stating those details. + +**Resolution:** _(2026-05-20)_ Confirmed `docs/Contracts.md` documented only the older bulk subscription family and never mentioned the bulk write/read additions from commit `5e375f6`. Cross-checked against `mxaccess_gateway.proto` (`MxCommand.payload` cases 39-43, `MxCommandKind` 30-34, the `Write*BulkCommand` / `Write*BulkEntry` shapes, `ReadBulkCommand` with `tag_addresses` + `timeout_ms`, `MxCommandReply.payload` cases 36-40, and the `BulkWriteReply`/`BulkWriteResult` + `BulkReadReply`/`BulkReadResult` messages). Extended the "Files" section of `docs/Contracts.md` with a new paragraph listing the five command kinds, the per-entry request shape for each `Write*Bulk` family (with the credential-sensitive redaction rule carried through to `WriteSecuredBulkEntry`/`WriteSecured2BulkEntry`), the `BulkWriteReply` + `BulkWriteResult` reply (including the `optional int32 hresult` field and the no-raise per-entry failure contract), and the `ReadBulkCommand` → `BulkReadReply` + `BulkReadResult` reply with the cached-vs-snapshot dual-mode semantics and the deliberate absence of `hresult` on `BulkReadResult`. Cross-references to `gateway.md` (lifecycle + scopes) and `docs/DesignDecisions.md` "Bulk Command Family" (rationale) added rather than re-stating those details. 
+ +### Contracts-010 + +| Field | Value | +|---|---| +| Severity | Low | +| Category | Testing coverage | +| Location | `src/MxGateway.Tests/Contracts/ProtobufContractRoundTripTests.cs` | +| Status | Resolved | + +**Description:** Contracts-007 (closed 2026-05-18) added Galaxy Repository, bulk-subscribe, `MxValue.raw_value` / `MxArray.raw_values`, and `WorkerFault`/`WorkerHeartbeat` round-trip coverage. The bulk write/read messages added in commit `5e375f6` were never given equivalent coverage. `ProtobufContractRoundTripTests` has no test that exercises any of: `WriteBulkCommand` / `Write2BulkCommand` / `WriteSecuredBulkCommand` / `WriteSecured2BulkCommand` / `ReadBulkCommand`; `BulkWriteReply` / `BulkWriteResult`; `BulkReadReply` / `BulkReadResult`; the new `MxCommandReply.payload` oneof cases (`write_bulk`, `write2_bulk`, `write_secured_bulk`, `write_secured2_bulk`, `read_bulk`). The asymmetry that `BulkWriteResult` carries `hresult` and `BulkReadResult` does not, and the `optional int32 hresult` semantics on `BulkWriteResult`, are exactly the kind of wire-shape details prior contract tests have been written to pin. + +**Recommendation:** Add `ProtobufContractRoundTripTests` cases mirroring the existing `BulkSubscribeReply_RoundTripsSubscribeResults` / `MxCommandReply_RoundTripsBulkSubscribePayload` pattern: at minimum one round-trip per new request-side message (`WriteBulkCommand` covers the entry-list case; one secured variant proves the credential-sensitive shape; `ReadBulkCommand` covers `timeout_ms`), one round-trip for each new reply payload (`BulkWriteReply` carrying `BulkWriteResult` with `hresult` set + unset to exercise the proto3 `optional` presence; `BulkReadReply` carrying a `was_cached = true` and a `was_cached = false` entry), and at least one `MxCommandReply` test pinning a new payload-oneof case (e.g. `MxCommandReply.PayloadCase == PayloadOneofCase.ReadBulk` for `MxCommandKind.ReadBulk`). 
+ +**Resolution:** _(2026-05-20)_ Added round-trip tests in `ProtobufContractRoundTripTests` covering every gap listed: per-request `WriteBulkCommand_RoundTripsEntries`, `Write2BulkCommand_RoundTripsEntriesWithTimestampValue`, `WriteSecuredBulkCommand_RoundTripsCredentialBearingEntries`, `WriteSecured2BulkCommand_RoundTripsCredentialBearingEntriesWithTimestamp`, `ReadBulkCommand_RoundTripsTagAddressesAndTimeout`; per-reply `BulkWriteReply_RoundTripsResultsWithOptionalHresultPresence` (asserts both `HasHresult == true` and `HasHresult == false` arms of the proto3 `optional int32 hresult`) and `BulkReadReply_RoundTripsCachedAndSnapshotResults` (covers `was_cached = true`, `was_cached = false`, and a per-entry failure with `error_message`; additionally pins the deliberate absence of an `hresult` field on `BulkReadResult` via the descriptor); and `MxCommandReply` oneof-case pinning via `MxCommandReply_RoundTripsBulkWritePayloadCases` (a `[Theory]` exercising the four bulk-write payload-oneof cases) plus `MxCommandReply_RoundTripsReadBulkPayload`. All new tests pass; the full `ProtobufContractRoundTripTests` + `GatewayContractInfoTests` filter is 42 tests green. + +### Contracts-011 + +| Field | Value | +|---|---| +| Severity | Low | +| Category | Security | +| Location | `src/MxGateway.Contracts/Protos/mxaccess_gateway.proto:392-397`, `:406-412` | +| Status | Resolved | + +**Description:** The single-item `WriteSecuredCommand` (line 234-242) and `WriteSecured2Command` (line 244-253) put the credential-sensitivity redaction note on the `value` field directly ("Credential-sensitive write value. Implementations must not log this field unless an explicit redacted value-logging path is enabled."). 
The bulk equivalents move the note to the outer message instead — `WriteSecuredBulkCommand` (line 383-386) and `WriteSecured2BulkCommand` (line 399-400) carry it as a header comment — and the inner `WriteSecuredBulkEntry.value` (line 396) and `WriteSecured2BulkEntry.value` (line 410) are left without per-field comments. A future editor reading just `WriteSecuredBulkEntry` to add a new field or change the entry shape will not see the redaction rule. The ProtobufStyleGuide explicitly requires "Mark credential-bearing request fields clearly in comments"; the single-item path follows that rule, the bulk path does not. + +**Recommendation:** Add per-field credential-sensitivity comments to `WriteSecuredBulkEntry.value` and `WriteSecured2BulkEntry.value` matching the wording on `WriteSecuredCommand.value` / `WriteSecured2Command.value`. Comment-only change with no wire-format or generated-type impact. + +**Resolution:** _(2026-05-20)_ Added per-field credential-sensitivity comments to `WriteSecuredBulkEntry.value` and `WriteSecured2BulkEntry.value` in `mxaccess_gateway.proto`, mirroring verbatim the wording carried on `WriteSecuredCommand.value` / `WriteSecured2Command.value` ("Credential-sensitive write value. Implementations must not log this field unless an explicit redacted value-logging path is enabled."). The outer-message header redaction comment on `WriteSecuredBulkCommand` / `WriteSecured2BulkCommand` is retained so the rule is visible at both scopes. Comment-only change; no wire-format or generated-type impact (the `MxGateway.Contracts` build is clean against the regenerated code). + +### Contracts-012 + +| Field | Value | +|---|---| +| Severity | Low | +| Category | Documentation & comments | +| Location | `src/MxGateway.Contracts/Protos/galaxy_repository.proto:120` | +| Status | Resolved | + +**Description:** `GalaxyAttribute.mx_data_type` is declared as `int32` with no in-proto comment. 
The field carries the raw Galaxy SQL DB type identifier (from `dbo.data_type`), which deliberately does NOT correspond to the public `MxDataType` enum in `mxaccess_gateway.proto`; `docs/Contracts.md` calls this out ("The service is metadata-only and does not share types with mxaccess_gateway.proto") and `docs/GalaxyRepository.md:190` documents the choice ("`mx_data_type` is returned as the raw Galaxy integer rather than mapped to a language-neutral enum"), but the proto file itself gives the reader no signal. A client author looking at the .proto without those docs is likely to assume the field is an `MxDataType` value and write a `(MxDataType)` cast that silently misclassifies most attributes. The ProtobufStyleGuide rule "Comment fields that carry MXAccess parity details, raw HRESULT/status information, or compatibility constraints" applies — this is exactly a parity-detail / compatibility-constraint field where the int32 has non-obvious semantics. The accompanying `data_type_name`, `mx_attribute_category`, and `security_classification` fields share the same gap. + +**Recommendation:** Add a short comment on `GalaxyAttribute.mx_data_type` (and ideally on `mx_attribute_category` and `security_classification`) clarifying that the value is a raw Galaxy SQL identifier passed through unchanged, NOT a member of the `mxaccess_gateway.v1.MxDataType` enum, with a pointer to `docs/GalaxyRepository.md`. Comment-only change; no wire-format impact. + +**Resolution:** _(2026-05-20)_ Added in-proto comments to `GalaxyAttribute.mx_data_type`, `data_type_name`, `mx_attribute_category`, and `security_classification` in `galaxy_repository.proto`. The `mx_data_type` comment explicitly calls out that the value is a raw Galaxy SQL `dbo.data_type` identifier passed through unchanged, that it is NOT a member of `mxaccess_gateway.v1.MxDataType`, and that the two enumerations must not be cast to or compared with each other (closing the silent-misclassification trap the finding describes).
The `data_type_name` comment clarifies it is free-form Galaxy text from the same table, not a stable enum. `mx_attribute_category` and `security_classification` comments mark them as raw Galaxy-specific identifiers not mapped to any gateway enum. All four comments cross-reference `docs/GalaxyRepository.md` for the rationale rather than restating it. Comment-only change; no wire-format impact. + +### Contracts-013 + +| Field | Value | +|---|---| +| Severity | Low | +| Category | Documentation & comments | +| Location | `src/MxGateway.Tests/Contracts/GatewayContractInfoTests.cs:14` | +| Status | Resolved | + +**Description:** The XML summary on `GatewayContractInfoTests.GatewayProtocolVersion_IsVersionThree` reads "Verifies that the gateway protocol version is bumped to three after the alarm proto extension." That description is now incomplete: since the comment was written, the contract has been extended again (the bulk write/read command family in commit `5e375f6`) without a corresponding `GatewayProtocolVersion` bump. The test name says "IsVersionThree" but the summary attributes the value-of-3 to a single historical event (the alarm extension) — readers checking whether subsequent contract additions should have bumped the version will get a misleading rationale. This is the same class of stale-summary issue as Contracts-004 (`GatewayContractInfo` class summary), just relocated to the test that pins the constant. + +**Recommendation:** Reword the summary to describe what the test pins (the current `GatewayProtocolVersion` constant equals 3) rather than narrating a specific historical bump, OR explicitly enumerate the alarm- and bulk-write/read additions covered under version 3 so readers know both extensions were additive and intentionally did not require a bump. 
+ +**Resolution:** _(2026-05-20)_ Reworded the XML summary on `GatewayContractInfoTests.GatewayProtocolVersion_IsVersionThree` to describe what the test actually pins: the current `GatewayProtocolVersion` constant equals 3, with both the alarm proto extension (`AcknowledgeAlarm` / `QueryActiveAlarms` RPCs, `OnAlarmTransitionEvent`, the alarm command/reply payload cases) AND the bulk write/read command family extension (`WriteBulk` / `Write2Bulk` / `WriteSecuredBulk` / `WriteSecured2Bulk` / `ReadBulk` with their `BulkWriteReply` / `BulkReadReply` payloads) shipping under version 3 as strictly additive changes that did not require a further bump. The new summary also instructs that a future breaking contract change should bump the constant and update the test in lock-step. Test logic is unchanged; the test still passes. diff --git a/code-reviews/IntegrationTests/findings.md b/code-reviews/IntegrationTests/findings.md index edb19b2..25d7029 100644 --- a/code-reviews/IntegrationTests/findings.md +++ b/code-reviews/IntegrationTests/findings.md @@ -4,13 +4,33 @@ |---|---| | Module | `src/MxGateway.IntegrationTests` | | Reviewer | Claude Code | -| Review date | 2026-05-18 | -| Commit reviewed | `6c64030` | +| Review date | 2026-05-20 | +| Commit reviewed | `1cd51bb` | | Status | Reviewed | | Open findings | 0 | ## Checklist coverage +A comprehensive review completes every category, recording "No issues found" where +a category produced nothing rather than leaving it blank. + +### 2026-05-20 review (commit `1cd51bb`) + +| # | Category | Result | +|---|---|---| +| 1 | Correctness & logic bugs | Issue found: IntegrationTests-012 (Write test starts a `StreamEvents` task and never observes it — silent event-stream coverage gap and an unobserved fault path). | +| 2 | mxaccessgw conventions | Live opt-ins, `[Collection]` serialization, and the "don't synthesize events" rule are honored. No issues found. 
| +| 3 | Concurrency & thread safety | `LiveResourcesCollection` serializes all three live classes; `RecordingServerStreamWriter` locks correctly and the semaphore wait is linked to both timeout and external cancellation. No issues found. | +| 4 | Error handling & resilience | `ShutDownAsync` already isolates cleanup exceptions per category. No issues found. | +| 5 | Security | The only embedded strings are documented dev GLAuth creds and a localhost ZB connection string, all env-overridable. The wrong-password and unreachable-server tests assert no password leakage. No issues found. | +| 6 | Performance & resource management | Issue found: IntegrationTests-013 (`RecordingServerStreamWriter.messageArrived` `SemaphoreSlim` is never disposed; the type owns an `IDisposable` field but is not itself disposable). | +| 7 | Design-document adherence | No issues found. `docs/GatewayTesting.md` now documents the Live LDAP, Live Galaxy, and Write/invalid-handle MXAccess opt-ins added by the prior round of resolutions. | +| 8 | Code organization & conventions | Issues found: IntegrationTests-015 (`[Trait("Category", ...)]` repeated on every test method instead of declared once at class level); IntegrationTests-016 (the Galaxy default connection string is duplicated between `LiveGalaxyRepositoryFactAttribute` and `GalaxyRepositoryOptions`). | +| 9 | Testing coverage | Issue found: IntegrationTests-014 (`Unadvise`, `RemoveItem`, `Unregister`, `WriteSecured` ordering, and worker-fault parity still uncovered — IntegrationTests-005's resolution scoped these out). | +| 10 | Documentation & comments | Issue found: IntegrationTests-011 (the invalid-handle and write test comments describe a non-`Ok` MXAccess failure as `ProtocolStatusCode.Ok`, contradicting both the assertion and `HResultConverter`). 
| + +### 2026-05-18 review (commit `6c64030`) + | # | Category | Result | |---|---|---| | 1 | Correctness & logic bugs | Issues found: IntegrationTests-003 (asserts only on first event), IntegrationTests-010 (`WaitForMessageAsync` ignores cancellation). | @@ -177,3 +197,93 @@ **Re-triage:** The named method `WaitForFirstMessageAsync` no longer exists — IntegrationTests-003's resolution renamed/replaced it with `RecordingServerStreamWriter.WaitForMessageAsync(predicate, timeout)`, which scans recorded messages and blocks on a `SemaphoreSlim`. The underlying defect still held: that replacement method also took only a `timeout` and never observed a `CancellationToken`. The finding remains valid (Low, Correctness) against the renamed method; the recommendation's `firstMessage.Task.WaitAsync` detail is stale but the intent (thread a token, surface a count on timeout) is unchanged. **Resolution:** Resolved 2026-05-18: Added an optional `CancellationToken` parameter to `WaitForMessageAsync`, linked with the existing timeout source via `CancellationTokenSource.CreateLinkedTokenSource`, so a per-test cancellation aborts the wait promptly. `GatewaySession_WithLiveWorker_RegistersAdvisesStreamsDataAndCloses` now creates a `CancellationTokenSource`, passes its token into the `StreamEvents` `TestServerCallContext` and into `WaitForMessageAsync`, so the stream call and the wait share one cancellation source. On timeout the method already throws a `TimeoutException` whose message includes the scanned message count, satisfying the "emit recorded count" intent (the count surfaces in the test failure rather than via a separate `output.WriteLine`). Verified by build; live tests not executed. 
+ +### IntegrationTests-011 + +| Field | Value | +|---|---| +| Severity | Low | +| Category | Documentation & comments | +| Location | `src/MxGateway.IntegrationTests/WorkerLiveMxAccessSmokeTests.cs:236-240`, `src/MxGateway.IntegrationTests/WorkerLiveMxAccessSmokeTests.cs:183-187` | +| Status | Resolved | + +**Description:** The XML/inline comments on the two new MXAccess parity tests misdescribe how the gateway surfaces an MXAccess failure. The invalid-handle test reads "the gateway protocol status is Ok and the failure shows up in hresult / the status proxies — it must not be reported as a transport fault", then asserts `Assert.NotEqual(ProtocolStatusCode.Ok, addItemReply.ProtocolStatus.Code)`. `HResultConverter.CreateProtocolStatus` (`src/MxGateway.Worker/Conversion/HResultConverter.cs:39`) actually sets `Code = ProtocolStatusCode.MxaccessFailure` whenever the COM call throws (HRESULT ≠ 0), so the assertion is correct but the comment is wrong — the protocol status is *not* `Ok` on an MXAccess failure. The write-round-trip test carries the same misleading framing on lines 183-187 ("MXAccess parity details … belong in hresult / statuses, not in a transport failure") immediately before asserting `Ok`. A reader can reasonably conclude the gateway always reports `Ok` for round-tripped commands and tweak code accordingly. The intended distinction is "this is not a gRPC transport fault" (the RPC reply still arrives) — the protocol status code carries the MXAccess outcome. + +**Recommendation:** Reword the invalid-handle comment to "the gateway must reply with `ProtocolStatusCode.MxaccessFailure` and a non-zero `Hresult` carrying the COM failure, not a gRPC transport fault." Reword the write-round-trip comment to clarify it is asserting the happy-path Ok and that an MXAccess rejection would surface as `MxaccessFailure` (per `HResultConverter`), not as a `RpcException`. 
+ +**Resolution:** 2026-05-20 — Reworded the invalid-handle test comment to say the gateway must reply with `ProtocolStatusCode.MxaccessFailure` and a non-zero hresult carrying the COM failure (per `HResultConverter`), and reworded the write-round-trip comment to make explicit it is asserting the happy-path Ok while an MXAccess rejection would surface as `MxaccessFailure`, never as an `RpcException`. + +### IntegrationTests-012 + +| Field | Value | +|---|---| +| Severity | Medium | +| Category | Correctness & logic bugs | +| Location | `src/MxGateway.IntegrationTests/WorkerLiveMxAccessSmokeTests.cs:147-151` | +| Status | Resolved | + +**Description:** `GatewaySession_WithLiveWorker_WritesValueToAdvisedItem` constructs a `RecordingServerStreamWriter` and starts a `StreamEvents` task, then never reads from it and never asserts anything about the recorded messages. The test verifies only that the `Write` command round-trips at the protocol level — it does not verify that the worker actually emits any event after the write (for example an `OnWriteComplete`, which is the proof of round-trip used by the cross-language client e2e runner). Because the stream task is started with `new TestServerCallContext()` (no cancellation source), any fault raised by the stream task (an exception from `EventStreamService`, a session-not-found, a backpressure overflow) is swallowed — `streamTask` is later awaited in `ShutDownAsync` only inside a broad `catch (Exception ex)`, which logs and continues. The Write test therefore cannot fail on stream-task faults. Two consequences: (a) the live Write parity coverage promised in IntegrationTests-005 is weaker than it appears, and (b) the fixture (`eventWriter`) is dead code in this test that suggests an assertion was intended. 
+ +**Recommendation:** Either remove the unused `eventWriter`/`StreamEvents` plumbing from the Write test so the test scope matches its assertions, or — preferred — extend the test to wait for an `OnWriteComplete` event for the written item via `eventWriter.WaitForMessageAsync(candidate => candidate.Family == MxEventFamily.OnWriteComplete && candidate.ItemHandle == itemHandle, ...)`, matching the round-trip proof used by `scripts/run-client-e2e-tests.ps1 -VerifyWrite`. + +**Resolution:** Resolved 2026-05-20: Rewrote `GatewaySession_WithLiveWorker_WritesValueToAdvisedItem` so the previously-dead `eventWriter`/`StreamEvents` plumbing actually drives an assertion. The test now waits for an `OnWriteComplete` event matching the Write's (server, item) handle pair via `eventWriter.WaitForMessageAsync` (using `IntegrationTestEnvironment.LiveMxAccessEventTimeout`), and asserts the recorded event's family, session id, and handles — the same round-trip proof the cross-language client e2e runner uses. The stream call is now bound to a `CancellationTokenSource` and the test asserts `streamTask.IsFaulted == false` before cleanup. `ShutDownAsync` gained an opt-in `propagateStreamFaults` flag so a faulted `StreamEvents` task is rethrown into the test rather than silently swallowed by the broad cleanup catch; the cancellation token is also signalled before the drain so `StreamEvents` observes a clean shutdown instead of a forced timeout. Verified by build and by confirming the test skips cleanly when `MXGATEWAY_RUN_LIVE_MXACCESS_TESTS` is unset. 
+ +### IntegrationTests-013 + +| Field | Value | +|---|---| +| Severity | Low | +| Category | Performance & resource management | +| Location | `src/MxGateway.IntegrationTests/WorkerLiveMxAccessSmokeTests.cs:519-609` | +| Status | Resolved | + +**Description:** `RecordingServerStreamWriter` owns a `SemaphoreSlim messageArrived` (`IDisposable`) but does not itself implement `IDisposable`, so the semaphore's wait handle is never released back to the OS. Each live test allocates one such writer and discards it at scope exit. Live tests run on opt-in only, so the cumulative leak is bounded, but the type holds an `IDisposable` field — the standard hygiene under `Directory.Build.props`'s `TreatWarningsAsErrors=true` is to either dispose the field or document why not. CA2213 does not fire because the owner is not itself `IDisposable`; the absence of an analyzer warning is the only reason this is not a build break — it is not an indication that the leak is acceptable. + +**Recommendation:** Make `RecordingServerStreamWriter` implement `IDisposable`, dispose `messageArrived` in `Dispose`, and wrap each instantiation in a `using` block (`using RecordingServerStreamWriter eventWriter = new();`). + +**Resolution:** 2026-05-20 — `RecordingServerStreamWriter` now implements `IDisposable` and its `Dispose` releases the `messageArrived` semaphore. All six live tests in `WorkerLiveMxAccessSmokeTests` now allocate the writer with a top-of-method `using` declaration so the semaphore's wait handle is released on scope exit even when the test body throws. 
+ +### IntegrationTests-014 + +| Field | Value | +|---|---| +| Severity | Medium | +| Category | Testing coverage | +| Location | `src/MxGateway.IntegrationTests/WorkerLiveMxAccessSmokeTests.cs` | +| Status | Resolved | + +**Description:** IntegrationTests-005 was resolved by adding live coverage for `Write` and an invalid-handle `AddItem`, but its resolution explicitly scoped out the worker-fault/abnormal-exit case and silently dropped `Unadvise`, `RemoveItem`, `Unregister`, `OperationComplete`, and `WriteSecured` ordering. CLAUDE.md singles out `WriteSecured` ("`WriteSecured` failing before a value-bearing NMX body") and `OperationComplete` semantics as parity surprises the gateway must not "fix" — exactly the paths fake-worker tests cannot validate. After this commit the live MXAccess smoke still doesn't exercise any teardown command, the secured-write ordering rule, or a deliberately faulted worker. A regression in any of these would only be caught by manual testing. + +**Recommendation:** Add live MXAccess coverage for the teardown chain (`Unadvise` then `RemoveItem` then `Unregister`, asserting each replies with `ProtocolStatusCode.Ok` and the next operation no longer references the freed handle), and at minimum one `WriteSecured` parity case asserting the documented ordering. A worker-fault test can be deferred to a separate finding once a deterministic COM-crash injection harness exists. + +**Resolution:** Resolved 2026-05-20: Added three new `[LiveMxAccessFact]`-gated tests to `WorkerLiveMxAccessSmokeTests`, all reusing the existing opt-in env var and `ShutDownAsync` cleanup helper. 
(1) `GatewaySession_WithLiveWorker_UnadviseRemoveItemUnregister_TeardownOrderingParity` runs Register → AddItem → Advise → wait for one OnDataChange → UnAdvise → RemoveItem → Unregister, asserting each step replies `Ok` with the matching `MxCommandKind`, that no further OnDataChange events for the un-advised (server, item) pair arrive after a 500 ms settle window, and that a second RemoveItem against the freed handle returns a non-`Ok` MXAccess failure (so a regression that left a stale subscription or accepted a stale handle would surface). (2) `GatewaySession_WithLiveWorker_WriteSecured_AuthenticatedRoundTripParity` resolves an ArchestrA user id via `AuthenticateUser` (credentials env-overridable through `MXGATEWAY_LIVE_MXACCESS_WRITE_SECURED_USER` / `..._PASSWORD`, defaulting to the `admin`/`admin123` GLAuth user from `glauth.md`), issues `WriteSecured` against an advised item, and asserts the reply carries `MxCommandKind.WriteSecured`, the protocol status is one of the documented parity outcomes (`Ok` for an unprotected provider, `MxaccessFailure` when the item is not WriteSecured-eligible — never a transport fault), and the credential never leaks into the diagnostic message. (3) `GatewaySession_WithLiveWorker_AbnormalWorkerExit_MarksSessionFaulted` opens a session, kills the worker process tree (via a new `TestWorkerProcessFactory.KillAllAndDetach` helper) without going through CloseSession, and polls the session via a new `GatewayServiceFixture.TryGetSession` accessor until it transitions to `SessionState.Faulted` within the live event timeout; asserts the final state is `Faulted`, that `FinalFault` is non-empty, and that the fault description carries a known worker-client classification (pipe disconnected / worker faulted / heartbeat expired / end-of-stream). `docs/GatewayTesting.md` was updated to list all five parity surfaces and the two new env-var defaults. 
Verified by build and confirmed all six live tests skip cleanly when `MXGATEWAY_RUN_LIVE_MXACCESS_TESTS` is unset. + +### IntegrationTests-015 + +| Field | Value | +|---|---| +| Severity | Low | +| Category | Code organization & conventions | +| Location | `src/MxGateway.IntegrationTests/WorkerLiveMxAccessSmokeTests.cs:30,119,201`, `src/MxGateway.IntegrationTests/DashboardLdapLiveTests.cs:13,32,48,67,84`, `src/MxGateway.IntegrationTests/Galaxy/GalaxyRepositoryLiveTests.cs:10,22,34,52` | +| Status | Resolved | + +**Description:** Every live-test method in the three live classes carries an identical `[Trait("Category", "LiveMxAccess")]` (or `LiveLdap` / `LiveGalaxy`) attribute. The trait is uniform within each class and is exactly the information the `[Collection(LiveResourcesCollection.Name)]` class-level attribute also implies. In xUnit a class-level `[Trait]` applies to every test method in the class, so the same metadata can be declared once at class scope. The current shape adds maintenance burden — adding a new test in any of these classes requires remembering to add the trait, and the `LiveLdap` class alone carries five copies of the same line. + +**Recommendation:** Move each `[Trait("Category", ...)]` to the class declaration alongside the existing `[Collection(...)]`, and remove the per-method copies. Verify the trait still surfaces in `--filter Category=LiveLdap` after the change. + +**Resolution:** 2026-05-20 — Lifted `[Trait("Category", "LiveMxAccess")]`, `[Trait("Category", "LiveLdap")]`, and `[Trait("Category", "LiveGalaxy")]` to the class declarations of `WorkerLiveMxAccessSmokeTests`, `DashboardLdapLiveTests`, and `GalaxyRepositoryLiveTests` respectively (alongside the existing `[Collection(LiveResourcesCollection.Name)]`), and removed all per-method duplicates. xUnit propagates class-level traits to every method, so `--filter Category=LiveMxAccess` (and the equivalent `Category=LiveLdap` / `Category=LiveGalaxy` filters) still match. 
+ +### IntegrationTests-016 + +| Field | Value | +|---|---| +| Severity | Low | +| Category | Code organization & conventions | +| Location | `src/MxGateway.IntegrationTests/Galaxy/LiveGalaxyRepositoryFactAttribute.cs:26`, `src/MxGateway.Server/Galaxy/GalaxyRepositoryOptions.cs:13` | +| Status | Resolved | + +**Description:** The default Galaxy Repository connection string `"Server=localhost;Database=ZB;Integrated Security=True;TrustServerCertificate=True;Encrypt=False;"` is duplicated verbatim between the production `GalaxyRepositoryOptions.ConnectionString` initializer and the test-side `LiveGalaxyRepositoryFactAttribute.ConnectionString` fallback. The docs (`docs/GatewayTesting.md`) document the value once and reference it from both places. If the production default changes (e.g. tightening to a named instance, or switching to a SQL-auth template), the test default silently keeps the old string and the live Galaxy tests connect to the wrong server. The drift is invisible to the build. + +**Recommendation:** Expose the production default through a `public const string` on `GalaxyRepositoryOptions` (e.g. `DefaultConnectionString`) and have `LiveGalaxyRepositoryFactAttribute.ConnectionString` read `Environment.GetEnvironmentVariable(ConnectionStringVariableName) ?? GalaxyRepositoryOptions.DefaultConnectionString`. This gives a single source of truth and a build-time guarantee that the two defaults cannot drift. + +**Resolution:** 2026-05-20 — Added `public const string GalaxyRepositoryOptions.DefaultConnectionString` carrying the production default, set the `ConnectionString` initializer to reference it, and changed `LiveGalaxyRepositoryFactAttribute.ConnectionString` to fall back to `GalaxyRepositoryOptions.DefaultConnectionString`. The literal now lives in exactly one place and any future change to the production default propagates to the live-test fallback at compile time. 
diff --git a/code-reviews/README.md b/code-reviews/README.md index 13bd660..2c70efa 100644 --- a/code-reviews/README.md +++ b/code-reviews/README.md @@ -10,17 +10,17 @@ Each module's `findings.md` is the source of truth; this file is generated from | Module | Reviewer | Date | Commit | Status | Open | Total | |---|---|---|---|---|---|---| -| [Client.Dotnet](Client.Dotnet/findings.md) | Claude Code | 2026-05-18 | `3cc53a8` | Reviewed | 0 | 8 | -| [Client.Go](Client.Go/findings.md) | Claude Code | 2026-05-18 | `3cc53a8` | Reviewed | 0 | 10 | -| [Client.Java](Client.Java/findings.md) | Claude Code | 2026-05-18 | `3cc53a8` | Reviewed | 0 | 12 | -| [Client.Python](Client.Python/findings.md) | Claude Code | 2026-05-18 | `3cc53a8` | Reviewed | 0 | 12 | -| [Client.Rust](Client.Rust/findings.md) | Claude Code | 2026-05-18 | `3cc53a8` | Reviewed | 0 | 12 | -| [Contracts](Contracts/findings.md) | Claude Code | 2026-05-18 | `6c64030` | Reviewed | 0 | 8 | -| [IntegrationTests](IntegrationTests/findings.md) | Claude Code | 2026-05-18 | `6c64030` | Reviewed | 0 | 10 | -| [Server](Server/findings.md) | Claude Code | 2026-05-18 | `6c64030` | Reviewed | 0 | 14 | -| [Tests](Tests/findings.md) | Claude Code | 2026-05-18 | `6c64030` | Reviewed | 0 | 12 | -| [Worker](Worker/findings.md) | Claude Code | 2026-05-18 | `6c64030` | Reviewed | 0 | 15 | -| [Worker.Tests](Worker.Tests/findings.md) | Claude Code | 2026-05-18 | `6c64030` | Reviewed | 0 | 15 | +| [Client.Dotnet](Client.Dotnet/findings.md) | Claude Code | 2026-05-20 | `1cd51bb` | Reviewed | 0 | 14 | +| [Client.Go](Client.Go/findings.md) | Claude Code | 2026-05-20 | `1cd51bb` | Reviewed | 0 | 16 | +| [Client.Java](Client.Java/findings.md) | Claude Code | 2026-05-20 | `1cd51bb` | Reviewed | 0 | 20 | +| [Client.Python](Client.Python/findings.md) | Claude Code | 2026-05-20 | `1cd51bb` | Reviewed | 0 | 17 | +| [Client.Rust](Client.Rust/findings.md) | Claude Code | 2026-05-20 | `1cd51bb` | Reviewed | 0 | 17 | +| 
[Contracts](Contracts/findings.md) | Claude Code | 2026-05-20 | `1cd51bb` | Reviewed | 0 | 13 | +| [IntegrationTests](IntegrationTests/findings.md) | Claude Code | 2026-05-20 | `1cd51bb` | Reviewed | 0 | 16 | +| [Server](Server/findings.md) | Claude Code | 2026-05-20 | `1cd51bb` | Reviewed | 0 | 22 | +| [Tests](Tests/findings.md) | Claude Code | 2026-05-20 | `1cd51bb` | Reviewed | 0 | 19 | +| [Worker](Worker/findings.md) | Claude Code | 2026-05-20 | `1cd51bb` | Reviewed | 0 | 22 | +| [Worker.Tests](Worker.Tests/findings.md) | Claude Code | 2026-05-20 | `1cd51bb` | Reviewed | 0 | 24 | ## Pending findings @@ -36,13 +36,16 @@ Findings with status `Resolved`, `Won't Fix`, or `Deferred`. |---|---|---|---|---| | Server-001 | Critical | Resolved | Security | `src/MxGateway.Server/GatewayApplication.cs:147-149`, `src/MxGateway.Server/Dashboard/DashboardEndpointRouteBuilderExtensions.cs:55-58`, `src/MxGateway.Server/Dashboard/Components/Routes.razor:1-15` | | Client.Go-001 | High | Resolved | Correctness & logic bugs | `clients/go/mxgateway/errors.go:88-93`, `clients/go/mxgateway/errors.go:117-128` | +| Client.Java-013 | High | Resolved | Testing coverage | `clients/java/mxgateway-cli/src/test/java/com/dohertylan/mxgateway/cli/MxGatewayCliTests.java:212-304`, `clients/java/mxgateway-cli/src/main/java/com/dohertylan/mxgateway/cli/MxGatewayCli.java:1214-1244` | | Client.Rust-001 | High | Resolved | mxaccessgw conventions | `clients/rust/src/options.rs:98,143` | | Client.Rust-002 | High | Resolved | mxaccessgw conventions | `clients/rust/src/session.rs:522` | | Client.Rust-003 | High | Resolved | Correctness & logic bugs | `clients/rust/crates/mxgw-cli/src/main.rs:1051` | | Client.Rust-012 | High | Resolved | mxaccessgw conventions | `clients/rust/src/galaxy.rs:282` | +| Client.Rust-013 | High | Resolved | mxaccessgw conventions | `src/MxGateway.Contracts/Protos/mxaccess_gateway.proto:414-424` (origin); `clients/rust/src/generated.rs:11-31` (suppression site) | | 
IntegrationTests-001 | High | Resolved | Design-document adherence | `src/MxGateway.IntegrationTests/Galaxy/LiveGalaxyRepositoryFactAttribute.cs:7`, `src/MxGateway.IntegrationTests/Galaxy/GalaxyRepositoryLiveTests.cs` | | IntegrationTests-002 | High | Resolved | Design-document adherence | `src/MxGateway.IntegrationTests/DashboardLdapLiveTests.cs:13`, `src/MxGateway.Server/Configuration/LdapOptions.cs:27` | | Server-003 | High | Resolved | Security | `src/MxGateway.Server/Dashboard/DashboardAuthorizationHandler.cs:39,54-59`, `src/MxGateway.Server/Dashboard/DashboardAuthenticator.cs:236-258` | +| Server-017 | High | Resolved | Security | `src/MxGateway.Server/Security/Authorization/GatewayGrpcScopeResolver.cs:13-27`, `src/MxGateway.Server/Grpc/MxAccessGatewayService.cs:173-247`, `docs/Authorization.md:108-110` | | Tests-001 | High | Resolved | Testing coverage | `src/MxGateway.Tests/Gateway/Grpc/MxAccessGatewayServiceTests.cs:483-489` | | Tests-002 | High | Resolved | Security | `src/MxGateway.Tests/Gateway/Grpc/GalaxyRepositoryGrpcServiceTests.cs:198-210` | | Worker-001 | High | Resolved | Concurrency & thread safety | `src/MxGateway.Worker/MxAccess/WnWrapAlarmConsumer.cs:204-207` | @@ -60,39 +63,63 @@ Findings with status `Resolved`, `Won't Fix`, or `Deferred`. 
| Client.Java-003 | Medium | Resolved | mxaccessgw conventions | `clients/java/mxgateway-client/src/main/java/com/dohertylan/mxgateway/client/MxGatewayClient.java:119-140` | | Client.Java-004 | Medium | Resolved | Correctness & logic bugs | `clients/java/mxgateway-client/src/main/java/com/dohertylan/mxgateway/client/MxGatewaySession.java:114-120,157-163,191-197` | | Client.Java-005 | Medium | Resolved | Error handling & resilience | `clients/java/mxgateway-client/src/main/java/com/dohertylan/mxgateway/client/MxGatewaySession.java:92-105` | +| Client.Java-014 | Medium | Resolved | Concurrency & thread safety | `clients/java/mxgateway-client/src/main/java/com/dohertylan/mxgateway/client/MxEventStream.java:59-65,117-124` | +| Client.Java-015 | Medium | Resolved | Concurrency & thread safety | `clients/java/mxgateway-client/src/main/java/com/dohertylan/mxgateway/client/MxGatewayChannels.java:112-138`, `MxGatewayClient.java:183-191,224-232,322-329`, `GalaxyRepositoryClient.java:164-170,212-214` | | Client.Python-003 | Medium | Resolved | Error handling & resilience | `clients/python/src/mxgateway/client.py:125-137,155-173` | | Client.Python-005 | Medium | Resolved | Performance & resource management | `clients/python/src/mxgateway/galaxy.py:117-140` | | Client.Python-009 | Medium | Resolved | Testing coverage | `clients/python/tests/` | +| Client.Python-013 | Medium | Resolved | Security | `clients/python/src/mxgateway_cli/commands.py:757-762` | | Client.Rust-005 | Medium | Resolved | Correctness & logic bugs | `clients/rust/src/session.rs:489-520` | | Client.Rust-006 | Medium | Resolved | Error handling & resilience | `clients/rust/src/session.rs:531-555` | +| Client.Rust-015 | Medium | Resolved | Error handling & resilience | `clients/rust/crates/mxgw-cli/src/main.rs:1053-1070` | +| Client.Rust-016 | Medium | Resolved | Testing coverage | `clients/rust/tests/client_behavior.rs`, `clients/rust/src/session.rs:489-519,654-768` | | Contracts-002 | Medium | Resolved | 
Error handling & resilience | `src/MxGateway.Contracts/Protos/mxaccess_gateway.proto:384-385`, `:95` | +| Contracts-009 | Medium | Resolved | Design-document adherence | `docs/Contracts.md:13-24` | | IntegrationTests-003 | Medium | Resolved | Correctness & logic bugs | `src/MxGateway.IntegrationTests/WorkerLiveMxAccessSmokeTests.cs:89-97` | | IntegrationTests-004 | Medium | Resolved | Error handling & resilience | `src/MxGateway.IntegrationTests/WorkerLiveMxAccessSmokeTests.cs:108-111` | | IntegrationTests-005 | Medium | Resolved | Testing coverage | `src/MxGateway.IntegrationTests/WorkerLiveMxAccessSmokeTests.cs` | | IntegrationTests-006 | Medium | Resolved | Testing coverage | `src/MxGateway.IntegrationTests/DashboardLdapLiveTests.cs` | +| IntegrationTests-012 | Medium | Resolved | Correctness & logic bugs | `src/MxGateway.IntegrationTests/WorkerLiveMxAccessSmokeTests.cs:147-151` | +| IntegrationTests-014 | Medium | Resolved | Testing coverage | `src/MxGateway.IntegrationTests/WorkerLiveMxAccessSmokeTests.cs` | | Server-002 | Medium | Resolved | Design-document adherence | `src/MxGateway.Server/Program.cs:24`, `src/MxGateway.Server/GatewayApplication.cs` | | Server-004 | Medium | Resolved | Code organization & conventions | `src/MxGateway.Server/Security/Authentication/ApiKeyAdminCommandLineParser.cs:227-233`, `src/MxGateway.Server/Security/Authentication/ApiKeyAdminCliRunner.cs:53-77`, `src/MxGateway.Server/Dashboard/DashboardApiKeyManagementService.cs:21-67` | | Server-005 | Medium | Resolved | Error handling & resilience | `src/MxGateway.Server/Galaxy/GalaxyHierarchyRefreshService.cs:22-28`, `src/MxGateway.Server/Galaxy/GalaxyHierarchyCache.cs:184` | | Server-006 | Medium | Resolved | Correctness & logic bugs | `src/MxGateway.Server/Sessions/SessionManager.cs:84-114` | +| Server-015 | Medium | Resolved | Concurrency & thread safety | `src/MxGateway.Server/Sessions/GatewaySession.cs:8-15,266-308,720-775` | +| Server-016 | Medium | Resolved | Error handling & 
resilience | `src/MxGateway.Server/Sessions/GatewaySession.cs:790-797`, `src/MxGateway.Server/Sessions/SessionManager.cs:237-258` | +| Server-021 | Medium | Resolved | Testing coverage | `src/MxGateway.Server/Grpc/MxAccessGatewayService.cs:266-664`, `src/MxGateway.Tests/Gateway/Grpc/MxAccessGatewayServiceTests.cs` | | Tests-003 | Medium | Resolved | Performance & resource management | `src/MxGateway.Tests/Security/Authentication/SqliteAuthStoreTests.cs:170-176`, `src/MxGateway.Tests/Security/Authentication/ApiKeyAdminCliRunnerTests.cs:252-258` | | Tests-004 | Medium | Resolved | Testing coverage | `src/MxGateway.Tests/Security/Authorization/GatewayGrpcAuthorizationInterceptorTests.cs` | | Tests-005 | Medium | Resolved | Testing coverage | `src/MxGateway.Tests/Gateway/Grpc/EventStreamServiceTests.cs:239-261`, `src/MxGateway.Tests/Gateway/Sessions/SessionManagerTests.cs` | | Tests-006 | Medium | Resolved | Concurrency & thread safety | `src/MxGateway.Tests/Gateway/Workers/WorkerClientTests.cs:76`, `src/MxGateway.Tests/Gateway/Workers/FakeWorkerHarnessTests.cs:122` | +| Tests-013 | Medium | Resolved | Testing coverage | `src/MxGateway.Server/Sessions/GatewaySession.cs:449-679`, `src/MxGateway.Tests/Gateway/Sessions/SessionManagerTests.cs` | +| Tests-016 | Medium | Resolved | Testing coverage | `src/MxGateway.Tests/Galaxy/GalaxyHierarchyCacheTests.cs:29-41,115-124` | | Worker-004 | Medium | Resolved | Correctness & logic bugs | `src/MxGateway.Worker/Ipc/WorkerPipeSession.cs:565-588` | | Worker-005 | Medium | Resolved | Error handling & resilience | `src/MxGateway.Worker/MxAccess/MxAccessStaSession.cs:205-258` (production alarm poll loop) | | Worker-006 | Medium | Resolved | Correctness & logic bugs | `src/MxGateway.Worker/Ipc/WorkerPipeSession.cs:117-124`, `src/MxGateway.Worker/MxAccess/MxAccessStaSession.cs:386-491` | | Worker-007 | Medium | Resolved | mxaccessgw conventions | `src/MxGateway.Worker/MxAccess/MxAccessComServer.cs:130-150` | | Worker-008 | Medium | 
Resolved | Concurrency & thread safety | `src/MxGateway.Worker/MxAccess/MxAccessStaSession.cs:205-249`, `:429-447` | +| Worker-016 | Medium | Resolved | Concurrency & thread safety | `src/MxGateway.Worker/MxAccess/MxAccessStaSession.cs:261-265` | +| Worker-017 | Medium | Resolved | Error handling & resilience | `src/MxGateway.Worker/Sta/StaRuntime.cs:280-288`, `src/MxGateway.Worker/Ipc/WorkerPipeSession.cs:602-631` | | Worker.Tests-003 | Medium | Resolved | Concurrency & thread safety | `src/MxGateway.Worker.Tests/Sta/StaRuntimeTests.cs:46-48` | | Worker.Tests-004 | Medium | Resolved | Concurrency & thread safety | `src/MxGateway.Worker.Tests/MxAccess/MxAccessStaSessionTests.cs:281-329` | | Worker.Tests-005 | Medium | Resolved | Performance & resource management | `src/MxGateway.Worker.Tests/Ipc/WorkerFrameProtocolTests.cs:20-31,103-105`, `src/MxGateway.Worker.Tests/Ipc/WorkerPipeSessionTests.cs:28-31` | | Worker.Tests-006 | Medium | Resolved | Performance & resource management | `src/MxGateway.Worker.Tests/MxAccess/MxAccessStaSessionTests.cs:282,305,315,323` | | Worker.Tests-007 | Medium | Resolved | Design-document adherence | `docs/WorkerFrameProtocol.md:38-49` | +| Worker.Tests-016 | Medium | Resolved | Code organization & conventions | `src/MxGateway.Worker.Tests/MxAccess/AlarmCommandExecutorTests.cs:317-393` | +| Worker.Tests-017 | Medium | Resolved | Testing coverage | `src/MxGateway.Worker.Tests/Ipc/WorkerPipeSessionTests.cs` | +| Worker.Tests-018 | Medium | Resolved | Correctness & logic bugs | `src/MxGateway.Worker.Tests/MxAccess/MxAccessLiveComCreationTests.cs:18-31, 35-73, 75-145, 148-220, 222-342` | | Client.Dotnet-004 | Low | Resolved | Error handling & resilience | `clients/dotnet/MxGateway.Client/MxGatewayClient.cs:283-294`, `clients/dotnet/MxGateway.Client/GalaxyRepositoryClient.cs:392-403` | | Client.Dotnet-005 | Low | Resolved | Correctness & logic bugs | `clients/dotnet/MxGateway.Client/MxGatewaySession.cs:82,124,175` | | Client.Dotnet-006 | Low 
| Resolved | Code organization & conventions | `clients/dotnet/MxGateway.Client/MxGatewayClientOptions.cs:50`, `clients/dotnet/MxGateway.Client/MxGatewayClientContractInfo.cs:10-14` | | Client.Dotnet-007 | Low | Resolved | Documentation & comments | `clients/dotnet/MxGateway.Client/MxGatewayClient.cs:185-192` | | Client.Dotnet-008 | Low | Resolved | Correctness & logic bugs | `clients/dotnet/MxGateway.Client.Cli/MxGatewayCliSecretRedactor.cs:9-17` | +| Client.Dotnet-009 | Low | Resolved | Concurrency & thread safety | `clients/dotnet/MxGateway.Client/GalaxyRepositoryClient.cs:26,339-348,445-448` | +| Client.Dotnet-010 | Low | Resolved | Correctness & logic bugs | `clients/dotnet/MxGateway.Client.Cli/MxGatewayClientCli.cs:638,896,1261,1279` | +| Client.Dotnet-011 | Low | Resolved | Concurrency & thread safety | `clients/dotnet/MxGateway.Client.Cli/MxGatewayClientCli.cs:857-858,922-963,1014-1015` | +| Client.Dotnet-012 | Low | Resolved | Code organization & conventions | `clients/dotnet/MxGateway.Client/MxGateway.Client.csproj`, `clients/dotnet/MxGateway.Client.Cli/MxGateway.Client.Cli.csproj`, `clients/dotnet/MxGateway.Client.Tests/MxGateway.Client.Tests.csproj` | +| Client.Dotnet-013 | Low | Resolved | Code organization & conventions | `clients/dotnet/MxGateway.Client/DiscoverHierarchyOptions.cs:3-24`, `clients/dotnet/MxGateway.Client/GalaxyRepositoryClient.cs:185-187`, `clients/dotnet/MxGateway.Client.Cli/IMxGatewayCliClient.cs:6` | +| Client.Dotnet-014 | Low | Resolved | Testing coverage | `clients/dotnet/MxGateway.Client.Tests/MxGatewayClientAlarmsTests.cs:76-98`, `clients/dotnet/MxGateway.Client.Tests/FakeGatewayTransport.cs:212-231` | | Client.Go-004 | Low | Resolved | mxaccessgw conventions | `clients/go/mxgateway/alarms_test.go:153-154`, `clients/go/mxgateway/galaxy_test.go:58-59` | | Client.Go-005 | Low | Resolved | Design-document adherence | `clients/go/mxgateway/client.go:64,68`, `clients/go/mxgateway/galaxy.go:83,87` | | Client.Go-006 | Low | Resolved | 
Error handling & resilience | `clients/go/mxgateway/errors.go:9-130` | @@ -100,6 +127,12 @@ Findings with status `Resolved`, `Won't Fix`, or `Deferred`. | Client.Go-008 | Low | Resolved | Testing coverage | `clients/go/mxgateway/` (test files) | | Client.Go-009 | Low | Resolved | Code organization & conventions | `clients/go/mxgateway/galaxy.go:60-93,241-256`, `clients/go/mxgateway/client.go:41-74,190-205` | | Client.Go-010 | Low | Resolved | Documentation & comments | `clients/go/mxgateway/client.go:39-40` | +| Client.Go-011 | Low | Resolved | Correctness & logic bugs | `clients/go/mxgateway/alarms_test.go:66-73` | +| Client.Go-012 | Low | Resolved | Documentation & comments | `clients/go/cmd/mxgw-go/main.go:1063-1065`, `clients/go/cmd/mxgw-go/main.go:88-104` | +| Client.Go-013 | Low | Resolved | Concurrency & thread safety | `clients/go/cmd/mxgw-go/main.go:1246-1249`, `clients/go/cmd/mxgw-go/main.go:1257-1262` | +| Client.Go-014 | Low | Resolved | Error handling & resilience | `clients/go/mxgateway/session.go:602`, `clients/go/mxgateway/galaxy.go:189` | +| Client.Go-015 | Low | Resolved | Code organization & conventions | `clients/go/cmd/mxgw-go/main.go:410-512` | +| Client.Go-016 | Low | Resolved | Testing coverage | `clients/go/mxgateway/galaxy_test.go:382-429` | | Client.Java-006 | Low | Resolved | Performance & resource management | `clients/java/mxgateway-client/src/main/java/com/dohertylan/mxgateway/client/MxGatewayClient.java:323-328`, `clients/java/mxgateway-client/src/main/java/com/dohertylan/mxgateway/client/GalaxyRepositoryClient.java:279-284` | | Client.Java-007 | Low | Resolved | Testing coverage | `clients/java/mxgateway-client/src/test/java/com/dohertylan/mxgateway/client/` | | Client.Java-008 | Low | Resolved | Error handling & resilience | `clients/java/mxgateway-client/src/main/java/com/dohertylan/mxgateway/client/MxGatewayClient.java:298-304` | @@ -107,6 +140,11 @@ Findings with status `Resolved`, `Won't Fix`, or `Deferred`. 
| Client.Java-010 | Low | Resolved | Documentation & comments | `clients/java/mxgateway-client/src/main/java/com/dohertylan/mxgateway/client/MxGatewayClient.java:269-272`, `clients/java/README.md:76` | | Client.Java-011 | Low | Resolved | Performance & resource management | `clients/java/mxgateway-client/src/main/java/com/dohertylan/mxgateway/client/MxEventStream.java:37-63` | | Client.Java-012 | Low | Resolved | Correctness & logic bugs | `clients/java/mxgateway-cli/src/main/java/com/dohertylan/mxgateway/cli/MxGatewayCli.java:667-674` | +| Client.Java-016 | Low | Resolved | Code organization & conventions | `clients/java/mxgateway-client/src/main/java/com/dohertylan/mxgateway/client/MxGatewayClient.java:361-391`, `GalaxyRepositoryClient.java:285-315` | +| Client.Java-017 | Low | Resolved | Documentation & comments | `clients/java/mxgateway-client/src/main/java/com/dohertylan/mxgateway/client/MxEventStream.java:25-36`, `clients/java/README.md:99-107` | +| Client.Java-018 | Low | Resolved | Security | `clients/java/mxgateway-client/src/main/java/com/dohertylan/mxgateway/client/MxGatewaySecrets.java:54-66` | +| Client.Java-019 | Low | Resolved | Performance & resource management | `clients/java/mxgateway-client/src/main/java/com/dohertylan/mxgateway/client/MxGatewayClient.java:362-391`, `GalaxyRepositoryClient.java:286-315` | +| Client.Java-020 | Low | Resolved | Correctness & logic bugs | `clients/java/mxgateway-cli/src/main/java/com/dohertylan/mxgateway/cli/MxGatewayCli.java:244-254`, `galaxy_repository.proto:94` | | Client.Python-001 | Low | Resolved | Documentation & comments | `clients/python/pyproject.toml:8,25`, `clients/python/src/mxgateway_cli/commands.py:25` | | Client.Python-002 | Low | Resolved | Code organization & conventions | `clients/python/src/mxgateway/__init__.py:27` | | Client.Python-004 | Low | Resolved | Correctness & logic bugs | `clients/python/src/mxgateway_cli/commands.py:386,402-404` | @@ -116,12 +154,18 @@ Findings with status `Resolved`, 
`Won't Fix`, or `Deferred`. | Client.Python-010 | Low | Resolved | Code organization & conventions | `clients/python/src/mxgateway/session.py:404`, `clients/python/src/mxgateway_cli/commands.py:422-425` | | Client.Python-011 | Low | Resolved | Error handling & resilience | `clients/python/src/mxgateway/errors.py:122-148` | | Client.Python-012 | Low | Won't Fix | mxaccessgw conventions | `clients/python/src/mxgateway/client.py:84-108`, `clients/python/src/mxgateway/session.py:57-77` | +| Client.Python-014 | Low | Resolved | Code organization & conventions | `clients/python/src/mxgateway_cli/commands.py:22-23` | +| Client.Python-015 | Low | Resolved | Testing coverage | `clients/python/src/mxgateway_cli/commands.py:273-294,564-647`, `clients/python/tests/` | +| Client.Python-016 | Low | Resolved | Testing coverage | `clients/python/src/mxgateway_cli/commands.py:25,757-775,805-830` | +| Client.Python-017 | Low | Resolved | Documentation & comments | `clients/python/pyproject.toml:5-25`, `clients/python/src/mxgateway/` | | Client.Rust-004 | Low | Resolved | Documentation & comments | `clients/rust/src/version.rs:7` | | Client.Rust-007 | Low | Resolved | Design-document adherence | `clients/rust/RustClientDesign.md:14-55` | | Client.Rust-008 | Low | Resolved | Performance & resource management | `clients/rust/src/value.rs:161-261` | | Client.Rust-009 | Low | Resolved | Testing coverage | `clients/rust/tests/client_behavior.rs`, `clients/rust/src/galaxy.rs` | | Client.Rust-010 | Low | Resolved | Error handling & resilience | `clients/rust/src/client.rs:255-268`, `clients/rust/src/galaxy.rs:204-216` | | Client.Rust-011 | Low | Resolved | mxaccessgw conventions | `clients/rust/src/session.rs:469` | +| Client.Rust-014 | Low | Resolved | mxaccessgw conventions | `clients/rust/crates/mxgw-cli/src/main.rs:450,497` | +| Client.Rust-017 | Low | Resolved | Design-document adherence | `clients/rust/RustClientDesign.md:79-99,156-163` | | Contracts-001 | Low | Resolved | 
Design-document adherence | `docs/Grpc.md:13` (and `:3`, `:32`, `:39`) | | Contracts-003 | Low | Won't Fix | Code organization & conventions | `src/MxGateway.Contracts/MxGateway.Contracts.csproj:10` | | Contracts-004 | Low | Resolved | Documentation & comments | `src/MxGateway.Contracts/GatewayContractInfo.cs:3-6` | @@ -129,10 +173,18 @@ Findings with status `Resolved`, `Won't Fix`, or `Deferred`. | Contracts-006 | Low | Resolved | Correctness & logic bugs | `src/MxGateway.Contracts/Protos/mxaccess_gateway.proto:647` | | Contracts-007 | Low | Resolved | Testing coverage | `src/MxGateway.Tests/Contracts/ProtobufContractRoundTripTests.cs` | | Contracts-008 | Low | Resolved | Design-document adherence | `src/MxGateway.Contracts/Protos/mxaccess_gateway.proto:451-459`, `:627-636` | +| Contracts-010 | Low | Resolved | Testing coverage | `src/MxGateway.Tests/Contracts/ProtobufContractRoundTripTests.cs` | +| Contracts-011 | Low | Resolved | Security | `src/MxGateway.Contracts/Protos/mxaccess_gateway.proto:392-397`, `:406-412` | +| Contracts-012 | Low | Resolved | Documentation & comments | `src/MxGateway.Contracts/Protos/galaxy_repository.proto:120` | +| Contracts-013 | Low | Resolved | Documentation & comments | `src/MxGateway.Tests/Contracts/GatewayContractInfoTests.cs:14` | | IntegrationTests-007 | Low | Resolved | Concurrency & thread safety | `src/MxGateway.IntegrationTests/WorkerLiveMxAccessSmokeTests.cs:20`, `src/MxGateway.IntegrationTests/Galaxy/GalaxyRepositoryLiveTests.cs:5`, `src/MxGateway.IntegrationTests/DashboardLdapLiveTests.cs:9` | | IntegrationTests-008 | Low | Resolved | Code organization & conventions | `src/MxGateway.IntegrationTests/LiveLdapFactAttribute.cs`, `src/MxGateway.IntegrationTests/Galaxy/LiveGalaxyRepositoryFactAttribute.cs`, `src/MxGateway.IntegrationTests/LiveMxAccessFactAttribute.cs` | | IntegrationTests-009 | Low | Resolved | Documentation & comments | `src/MxGateway.IntegrationTests/WorkerLiveMxAccessSmokeTests.cs:372-375` | | 
IntegrationTests-010 | Low | Resolved | Correctness & logic bugs | `src/MxGateway.IntegrationTests/WorkerLiveMxAccessSmokeTests.cs:366-369` | +| IntegrationTests-011 | Low | Resolved | Documentation & comments | `src/MxGateway.IntegrationTests/WorkerLiveMxAccessSmokeTests.cs:236-240`, `src/MxGateway.IntegrationTests/WorkerLiveMxAccessSmokeTests.cs:183-187` | +| IntegrationTests-013 | Low | Resolved | Performance & resource management | `src/MxGateway.IntegrationTests/WorkerLiveMxAccessSmokeTests.cs:519-609` | +| IntegrationTests-015 | Low | Resolved | Code organization & conventions | `src/MxGateway.IntegrationTests/WorkerLiveMxAccessSmokeTests.cs:30,119,201`, `src/MxGateway.IntegrationTests/DashboardLdapLiveTests.cs:13,32,48,67,84`, `src/MxGateway.IntegrationTests/Galaxy/GalaxyRepositoryLiveTests.cs:10,22,34,52` | +| IntegrationTests-016 | Low | Resolved | Code organization & conventions | `src/MxGateway.IntegrationTests/Galaxy/LiveGalaxyRepositoryFactAttribute.cs:26`, `src/MxGateway.Server/Galaxy/GalaxyRepositoryOptions.cs:13` | | Server-007 | Low | Resolved | Performance & resource management | `src/MxGateway.Server/Galaxy/GalaxyHierarchyProjector.cs:55-70` | | Server-008 | Low | Resolved | Performance & resource management | `src/MxGateway.Server/Grpc/GalaxyRepositoryGrpcService.cs:111-134,160-189` | | Server-009 | Low | Resolved | Error handling & resilience | `src/MxGateway.Server/Security/Authentication/AuthSqliteConnectionFactory.cs:15-32` | @@ -141,12 +193,21 @@ Findings with status `Resolved`, `Won't Fix`, or `Deferred`. 
| Server-012 | Low | Resolved | Documentation & comments | `CLAUDE.md` (Authentication section and `apikey create` example) | | Server-013 | Low | Resolved | Testing coverage | `src/MxGateway.Tests/Gateway/Dashboard/DashboardAuthorizationHandlerTests.cs`, `src/MxGateway.Tests/Gateway/GatewayApplicationTests.cs` | | Server-014 | Low | Resolved | Documentation & comments | `src/MxGateway.Server/Grpc/MxAccessGatewayService.cs:162-171,191-198,206-214,229-237` | +| Server-018 | Low | Resolved | Performance & resource management | `src/MxGateway.Server/Galaxy/GalaxyGlobMatcher.cs:15` | +| Server-019 | Low | Resolved | Correctness & logic bugs | `src/MxGateway.Server/Sessions/WorkerAlarmRpcDispatcher.cs:183-221` | +| Server-020 | Low | Resolved | Code organization & conventions | `src/MxGateway.Server/Dashboard/Components/Pages/DashboardHome.razor:1-2`, `…/GalaxyPage.razor:1-2`, `…/ApiKeysPage.razor:1-2`, `…/EventsPage.razor:1-2`, `…/SessionsPage.razor:1-2`, `…/WorkersPage.razor:1-2`, `…/SettingsPage.razor:1-2`, `…/SessionDetailsPage.razor:1-2` | +| Server-022 | Low | Resolved | Documentation & comments | `src/MxGateway.Server/Sessions/IAlarmRpcDispatcher.cs:8-29` | | Tests-007 | Low | Resolved | Code organization & conventions | `src/MxGateway.Tests/Gateway/Grpc/MxAccessGatewayServiceTests.cs:682`, `src/MxGateway.Tests/Gateway/Grpc/GalaxyRepositoryGrpcServiceTests.cs:324`, `src/MxGateway.Tests/Gateway/GatewayEndToEndFakeWorkerSmokeTests.cs:460`, `src/MxGateway.Tests/Security/Authorization/GatewayGrpcAuthorizationInterceptorTests.cs:233` | | Tests-008 | Low | Resolved | mxaccessgw conventions | `src/MxGateway.Tests/Gateway/Sessions/WorkerAlarmRpcDispatcherTests.cs:1-9`, `src/MxGateway.Tests/Gateway/Sessions/NotWiredAlarmRpcDispatcherTests.cs:1-3`, `src/MxGateway.Tests/Gateway/Sessions/SessionManagerAlarmAutoSubscribeTests.cs:1` | | Tests-009 | Low | Resolved | Documentation & comments | `src/MxGateway.Tests/Gateway/Sessions/SessionManagerTests.cs:36-37,99,365` | | 
Tests-010 | Low | Resolved | Security | `src/MxGateway.Tests/Gateway/Dashboard/DashboardAuthorizationHandlerTests.cs:26-36` | | Tests-011 | Low | Resolved | Correctness & logic bugs | `src/MxGateway.Tests/Gateway/GatewayEndToEndFakeWorkerSmokeTests.cs:233-301` | | Tests-012 | Low | Resolved | Concurrency & thread safety | `src/MxGateway.Tests/Gateway/Workers/Fakes/FakeWorkerHarness.cs:62`, `src/MxGateway.Tests/Gateway/Workers/WorkerClientTests.cs:472` | +| Tests-014 | Low | Resolved | Performance & resource management | `src/MxGateway.Tests/Gateway/GatewayApplicationTests.cs:18,33,44,62,81,105`, `src/MxGateway.Tests/Gateway/Dashboard/DashboardCookieOptionsTests.cs:17` | +| Tests-015 | Low | Resolved | Correctness & logic bugs | `src/MxGateway.Tests/Gateway/GatewayEndToEndFakeWorkerSmokeTests.cs:374-379,87` | +| Tests-017 | Low | Resolved | Concurrency & thread safety | `src/MxGateway.Tests/Gateway/Workers/WorkerClientTests.cs:346-364` | +| Tests-018 | Low | Resolved | Code organization & conventions | `src/MxGateway.Tests/Galaxy/GalaxyHierarchyCacheTests.cs:32`, `src/MxGateway.Tests/Gateway/Dashboard/DashboardSnapshotServiceTests.cs:45,51,57,105,134,163,167,202-209,284,317,523`, `src/MxGateway.Tests/Gateway/Sessions/SessionManagerTests.cs:40` | +| Tests-019 | Low | Resolved | Documentation & comments | `docs/GatewayTesting.md`, `code-reviews/Tests/findings.md` (Tests-002 re-triage) | | Worker-009 | Low | Resolved | Performance & resource management | `src/MxGateway.Worker/Ipc/WorkerFrameReader.cs:31,49`, `src/MxGateway.Worker/Ipc/WorkerFrameWriter.cs:57-58` | | Worker-010 | Low | Resolved | Correctness & logic bugs | `src/MxGateway.Worker/Conversion/VariantConverter.cs:204-226` | | Worker-011 | Low | Resolved | Correctness & logic bugs | `src/MxGateway.Worker/Ipc/WorkerPipeClient.cs:169-171` | @@ -154,6 +215,11 @@ Findings with status `Resolved`, `Won't Fix`, or `Deferred`. 
| Worker-013 | Low | Resolved | Testing coverage | `src/MxGateway.Worker/Sta/StaMessagePump.cs` | | Worker-014 | Low | Resolved | Code organization & conventions | `src/MxGateway.Worker/MxAccess/AlarmCommandHandler.cs:33`, `:202` | | Worker-015 | Low | Resolved | Correctness & logic bugs | `src/MxGateway.Worker/MxAccess/MxAccessEventQueue.cs:115-145` | +| Worker-018 | Low | Resolved | Error handling & resilience | `src/MxGateway.Worker/MxAccess/WnWrapAlarmConsumer.cs:160-161` | +| Worker-019 | Low | Resolved | Code organization & conventions | `src/MxGateway.Worker/MxAccess/WnWrapAlarmConsumer.cs:59`, `:188` | +| Worker-020 | Low | Resolved | Correctness & logic bugs | `src/MxGateway.Worker/Ipc/WorkerPipeSession.cs:405`, `:423` | +| Worker-021 | Low | Resolved | Correctness & logic bugs | `src/MxGateway.Worker/Ipc/WorkerPipeSession.cs:111-118`, `:790-805`, `:136-139` | +| Worker-022 | Low | Resolved | Code organization & conventions | `src/MxGateway.Worker/MxAccess/MxAlarmSnapshot.cs:12`, `:26`, `:49` | | Worker.Tests-008 | Low | Resolved | Documentation & comments | `src/MxGateway.Worker.Tests/Conversion/VariantConverterTests.cs:175-182` | | Worker.Tests-009 | Low | Resolved | Code organization & conventions | `src/MxGateway.Worker.Tests/MxAccess/AlarmCommandHandlerTests.cs`, `AlarmDispatcherTests.cs`, `AlarmCommandExecutorTests.cs`, `AlarmRecordTransitionMapperTests.cs`, `WnWrapAlarmConsumerXmlTests.cs` | | Worker.Tests-010 | Low | Resolved | Correctness & logic bugs | `src/MxGateway.Worker.Tests/MxAccess/MxAccessStaSessionTests.cs:230-258` | @@ -162,3 +228,9 @@ Findings with status `Resolved`, `Won't Fix`, or `Deferred`. 
| Worker.Tests-013 | Low | Resolved | Concurrency & thread safety | `src/MxGateway.Worker.Tests/Ipc/WorkerPipeSessionTests.cs:539-546` | | Worker.Tests-014 | Low | Resolved | Code organization & conventions | `src/MxGateway.Worker.Tests/Ipc/WorkerPipeClientTests.cs:194`, `WorkerPipeSessionTests.cs:622`, `Sta/StaCommandDispatcherTests.cs:348`, `MxAccess/MxAccessStaSessionTests.cs:334`, `MxAccess/MxAccessCommandExecutorTests.cs:1124` | | Worker.Tests-015 | Low | Resolved | Testing coverage | `src/MxGateway.Worker.Tests/MxAccess/MxAccessEventQueueTests.cs` | +| Worker.Tests-019 | Low | Resolved | mxaccessgw conventions | `src/MxGateway.Worker.Tests/AlarmsLiveSmokeTests.cs:45`, `src/MxGateway.Worker.Tests/AlarmClientWmProbeTests.cs:143`, `src/MxGateway.Worker.Tests/WnWrapConsumerProbeTests.cs:55` | +| Worker.Tests-020 | Low | Resolved | Concurrency & thread safety | `src/MxGateway.Worker.Tests/MxAccess/MxAccessValueCacheTests.cs:88-108` | +| Worker.Tests-021 | Low | Resolved | Error handling & resilience | `src/MxGateway.Worker.Tests/Ipc/WorkerFrameProtocolTests.cs` | +| Worker.Tests-022 | Low | Resolved | Testing coverage | `src/MxGateway.Worker.Tests/MxAccess/WnWrapAlarmConsumerXmlTests.cs` | +| Worker.Tests-023 | Low | Resolved | Documentation & comments | `src/MxGateway.Worker.Tests/AlarmClientWmProbeTests.cs` (779 lines), `src/MxGateway.Worker.Tests/WnWrapConsumerProbeTests.cs` (287 lines), `src/MxGateway.Worker.Tests/AlarmsLiveSmokeTests.cs` (270 lines) | +| Worker.Tests-024 | Low | Resolved | Correctness & logic bugs | `src/MxGateway.Worker.Tests/MxAccess/AlarmCommandHandlerTests.cs:42-54` | diff --git a/code-reviews/Server/findings.md b/code-reviews/Server/findings.md index 50c2851..f663e7c 100644 --- a/code-reviews/Server/findings.md +++ b/code-reviews/Server/findings.md @@ -4,25 +4,29 @@ |---|---| | Module | `src/MxGateway.Server` | | Reviewer | Claude Code | -| Review date | 2026-05-18 | -| Commit reviewed | `6c64030` | +| Review date | 2026-05-20 | +| 
Commit reviewed | `1cd51bb` | | Status | Reviewed | | Open findings | 0 | ## Checklist coverage +This table summarizes the 2026-05-20 review pass at commit `1cd51bb`. Findings from +prior passes (Server-001 through Server-014) are all closed and remain below as +audit history. + | # | Category | Result | |---|---|---| -| 1 | Correctness & logic bugs | Issues found: Server-006 (metrics open-session leak on alarm auto-subscribe failure), Server-010 (rotate reactivates revoked keys). | -| 2 | mxaccessgw conventions | Issues found: Server-002 (orphan-worker termination on startup not implemented), Server-011 (style deviation in `WorkerAlarmRpcDispatcher`). | -| 3 | Concurrency & thread safety | No issues found — locking is correct; inconsistent-but-safe discipline in `GatewayMetrics` noted only. | -| 4 | Error handling & resilience | Issues found: Server-005 (Galaxy first-load can fault the host BackgroundService), Server-009 (SQLite has no busy-timeout/WAL under concurrent writes). | -| 5 | Security | Issues found: Server-001 (Critical: dashboard authorization never enforced on any route), Server-003 (LDAP dashboard users denied for lack of a scope claim), Server-010. | -| 6 | Performance & resource management | Issues found: Server-007 (DiscoverHierarchy paging is O(total) per page), Server-008 (WatchDeployEvents re-projects whole hierarchy per event). | -| 7 | Design-document adherence | Issues found: Server-002 (orphan workers), Server-012 (CLAUDE.md scope names stale vs code/docs). | -| 8 | Code organization & conventions | Issues found: Server-011 (style), Server-004 (CLI accepts unvalidated scope strings). | -| 9 | Testing coverage | Issues found: Server-013 (no dashboard route-level authorization test; `WorkerExecutableValidator`, `GalaxyGlobMatcher`, projector paging untested). | -| 10 | Documentation & comments | Issues found: Server-014 (stale "not yet wired" alarm comments), Server-012.
| +| 1 | Correctness & logic bugs | Issues found: Server-019 (`WorkerAlarmRpcDispatcher.QueryActiveAlarmsAsync` yields silently when session is missing). | +| 2 | mxaccessgw conventions | No issues found — convention drift previously called out is resolved; no new gaps observed. | +| 3 | Concurrency & thread safety | Issues found: Server-015 (`GatewaySession._state` is written under `_closeLock` but read/written elsewhere under `_syncRoot`). | +| 4 | Error handling & resilience | Issues found: Server-016 (`GatewaySession.DisposeAsync` disposes the close-lock semaphore while it may be held). | +| 5 | Security | Issues found: Server-017 (`AcknowledgeAlarm` / `QueryActiveAlarms` fall through to admin-only scope because the resolver was not updated for the new alarm RPCs). | +| 6 | Performance & resource management | Issues found: Server-018 (`GalaxyGlobMatcher` regex cache is unbounded — currently low-risk but uncapped). | +| 7 | Design-document adherence | No issues found at this pass. | +| 8 | Code organization & conventions | Issues found: Server-020 (dashboard pages each declare two `@page` directives — `@page "/X"` AND `@page "/dashboard/X"` — producing duplicate routes under the `/dashboard` group prefix). | +| 9 | Testing coverage | Issues found: Server-021 (`MxAccessGatewayService.ApplyConstraintsAsync` and the new `BulkConstraintPlan` / `ReadBulkConstraintPlan` / `WriteBulkConstraintPlan` / `SubscribeBulkConstraintPlan` merge logic is entirely untested). | +| 10 | Documentation & comments | Issues found: Server-022 (`IAlarmRpcDispatcher` XML doc still describes the dispatcher as "ships a not-yet-wired default"; stale after Server-014). | ## Findings @@ -235,3 +239,123 @@ **Recommendation:** Update the `AcknowledgeAlarm`/`QueryActiveAlarms` remarks to reflect that `WorkerAlarmRpcDispatcher` is the wired default, and describe its actual GUID-vs-`Provider!Group.Tag` handling. **Resolution:** Resolved 2026-05-18. 
Confirmed against source: `SessionServiceCollectionExtensions` registers `WorkerAlarmRpcDispatcher` as `IAlarmRpcDispatcher`, so the "not yet wired" / "empty stream until PR A.2" / "PR A.6/A.7 follow-up" prose in the `AcknowledgeAlarm` and `QueryActiveAlarms` `<remarks>` and inline comments was stale. Rewrote both `<remarks>` blocks and both inline comments to state that DI binds the production `WorkerAlarmRpcDispatcher`, that it routes over the worker pipe IPC, and that `AcknowledgeAlarm` handles a canonical-GUID reference (→ `AcknowledgeAlarmCommand`) and a `Provider!Group.Tag` reference (→ `AcknowledgeAlarmByNameCommand`), with `NotWiredAlarmRpcDispatcher` being only the null fallback. The matching stale `WorkerAlarmRpcDispatcher` class-level XML doc was corrected as part of Server-011. Pure documentation/comment change; no test. + +### Server-015 + +| Field | Value | +|---|---| +| Severity | Medium | +| Category | Concurrency & thread safety | +| Location | `src/MxGateway.Server/Sessions/GatewaySession.cs:8-15,266-308,720-775` | +| Status | Resolved | + +**Description:** `GatewaySession` guards its mutable state with two different sync primitives. `TransitionTo`, `MarkFaulted`, `TouchClientActivity`, the `State`/`LastClientActivityAt`/`LeaseExpiresAt`/`FinalFault`/`ActiveEventSubscriberCount` getters, `AttachWorkerClient`, and `IsLeaseExpired` all read/write `_state`, `_finalFault`, `_lastClientActivityAt`, `_leaseExpiresAt`, `_workerClient`, and `_activeEventSubscriberCount` under `_syncRoot`. `CloseAsync` (lines 720-775), however, reads `_state` at line 729 and writes `_state` at lines 736 (`SessionState.Closing`) and 761 (`SessionState.Closed`) while only holding the `_closeLock` `SemaphoreSlim` — `_syncRoot` is never acquired. A concurrent `TransitionTo` or `MarkFaulted` from another thread sees `_state` outside the lock that protects it, and the `State` getter is not guaranteed to observe the `Closing`/`Closed` writes promptly.
`SemaphoreSlim.WaitAsync`/`Release` do happen to provide memory barriers in practice, but the locking discipline is split across two primitives, which is fragile and defeats the audit value of "all `_state` access is guarded by `_syncRoot`". Concretely, the race between `CloseAsync` setting `_state = Closing` and a concurrent `TransitionTo(Ready)` is unordered — and `TransitionTo` will happily overwrite `Closing` back to `Ready` because its only guard is "do not overwrite `Closed`/`Faulted`". + +**Recommendation:** Make `CloseAsync` mutate `_state` through the existing `TransitionTo(...)` helper (or acquire `_syncRoot` around the reads/writes) so all `_state` access uses the same lock. Either extend `TransitionTo` to accept the `Closing` and `Closed` transitions (it already handles `Faulted`/`Closed` precedence) or refactor `CloseAsync` to call a private `TrySetClosing()` / `MarkClosed()` that locks `_syncRoot`. Add a regression test that forces a `TransitionTo(Ready)` after `CloseAsync` has set `Closing` and asserts the session does not flip back to `Ready`. + +**Resolution:** 2026-05-20 — Unified the close path on `_syncRoot`. `GatewaySession.CloseAsync` (`src/MxGateway.Server/Sessions/GatewaySession.cs`) now mutates `_state` only through two private `_syncRoot`-locked helpers — `TryBeginClose` (writes `Closing`, returns the prior `_closeStarted`) and `MarkClosed` (writes `Closed`) — so every `_state` read/write in the session uses the same lock; `_closeLock` keeps its role of serializing concurrent close attempts. `TransitionTo` was tightened to refuse a transition out of `Closing` to anything other than `Closed`/`Faulted` so a late lifecycle callback cannot walk a closing session back to `Ready`. `docs/Sessions.md` updated to describe the unified lock discipline and the extended terminal precedence. 
Regression tests in `src/MxGateway.Tests/Gateway/Sessions/GatewaySessionTests.cs`: `TransitionTo_AfterCloseStarted_DoesNotOverwriteClosing` (the named scenario — `BlockingShutdownWorkerClient` parks the close inside `worker.ShutdownAsync` so the test can call `TransitionTo(Ready)` between the `Closing` and `Closed` writes and assert the state stays `Closing`) and `MarkFaulted_AfterCloseCompletes_DoesNotResurrectSession`. + +### Server-016 + +| Field | Value | +|---|---| +| Severity | Medium | +| Category | Error handling & resilience | +| Location | `src/MxGateway.Server/Sessions/GatewaySession.cs:790-797`, `src/MxGateway.Server/Sessions/SessionManager.cs:237-258` | +| Status | Resolved | + +**Description:** `GatewaySession.DisposeAsync` synchronously calls `_closeLock.Dispose()` (line 792) without first acquiring the lock and without checking whether a `CloseAsync` is still in flight. The normal call path is `SessionManager.CloseSessionCoreAsync` → `session.CloseAsync(...)` → `RemoveSessionAsync` → `DisposeAsync`, where `DisposeAsync` runs strictly after `CloseAsync` completes. But the `ShutdownAsync` path (`SessionManager.cs:237-258`) and any future caller that disposes a session while another thread is still inside `CloseAsync` will trip `ObjectDisposedException` when the in-flight `CloseAsync` releases the semaphore. The race is narrow today because all `Close`/`Dispose` choreography goes through `SessionManager`, but the class-level contract is broken: nothing on `GatewaySession` documents or enforces "DisposeAsync must not be called concurrently with CloseAsync". + +**Recommendation:** In `DisposeAsync`, either (a) take and release `_closeLock` once before disposing it, so the dispose is sequenced after any in-flight close, or (b) replace `_closeLock` disposal with a guard flag and let the semaphore be reclaimed by the finalizer. Document the invariant on the public method. 
Add a regression test that disposes a session whose `CloseAsync` has not yet completed and asserts no `ObjectDisposedException`. + +**Resolution:** 2026-05-20 — Took recommendation (a): `GatewaySession.DisposeAsync` (`src/MxGateway.Server/Sessions/GatewaySession.cs`) now acquires `_closeLock` once before disposing the semaphore so an in-flight `CloseAsync` finishes (its `_closeLock.Release()`) before the dispose tears the semaphore down. The wait is non-cancellable (`CancellationToken.None`) and `ObjectDisposedException` is swallowed at both the wait and the dispose site so double-dispose still completes cleanly. The method's XML doc was extended with a `<remarks>` block stating the invariant. Regression tests in `src/MxGateway.Tests/Gateway/Sessions/GatewaySessionTests.cs`: `DisposeAsync_WhileCloseInFlight_WaitsForCloseAndDoesNotThrow` (parks `CloseAsync` inside the worker shutdown, calls `DisposeAsync` concurrently, releases shutdown, asserts both complete without `ObjectDisposedException` and the worker is disposed exactly once) and `DisposeAsync_CalledTwice_DoesNotThrow`. + +### Server-017 + +| Field | Value | +|---|---| +| Severity | High | +| Category | Security | +| Location | `src/MxGateway.Server/Security/Authorization/GatewayGrpcScopeResolver.cs:13-27`, `src/MxGateway.Server/Grpc/MxAccessGatewayService.cs:173-247`, `docs/Authorization.md:108-110` | +| Status | Resolved | + +**Description:** The two new top-level RPCs added to `MxAccessGateway` — `AcknowledgeAlarm(AcknowledgeAlarmRequest)` and `QueryActiveAlarms(QueryActiveAlarmsRequest)` (proto lines 23-24) — are not enumerated by `GatewayGrpcScopeResolver.ResolveRequiredScope`. The resolver's `request switch` covers `OpenSessionRequest`, `CloseSessionRequest`, `StreamEventsRequest`, `MxCommandRequest`, and the four Galaxy-repository requests; everything else falls through to `_ => GatewayScopes.Admin`.
The interceptor (`GatewayGrpcAuthorizationInterceptor.AuthenticateAndAuthorizeAsync`) then rejects any non-admin caller with `PermissionDenied`. This is technically fail-closed (and `docs/Authorization.md:108-110` documents the "unrecognized → admin" intent), but in practice it means: (1) only API keys with the `admin` scope can acknowledge alarms or query active alarms, even though acknowledging is naturally an `invoke:write`-shaped operation and querying is naturally an `invoke:read`- or `metadata:read`-shaped operation; (2) the alarm RPCs ship in a state where any client that successfully opened a session and subscribed to alarm events still cannot perform the operational acks the contract advertises; (3) the test matrix `GatewayGrpcScopeResolverTests` does not even cover these two request types, so the gap was not caught at unit-test time. + +**Recommendation:** Add explicit arms to `ResolveRequiredScope`: map `AcknowledgeAlarmRequest` to `GatewayScopes.InvokeWrite` (parity with other write actions; ack changes alarm state) and `QueryActiveAlarmsRequest` to `GatewayScopes.MetadataRead` or `GatewayScopes.InvokeRead`. Update `docs/Authorization.md` to list both. Extend `GatewayGrpcScopeResolverTests` with the new mappings and an assertion that every request type defined by `mxaccess_gateway.proto` is named in the resolver (the test can enumerate the assembly's request types so a future RPC cannot quietly add itself only via the admin fallback). + +**Resolution:** 2026-05-20 — Added explicit `AcknowledgeAlarmRequest => GatewayScopes.InvokeWrite` and `QueryActiveAlarmsRequest => GatewayScopes.EventsRead` arms to `GatewayGrpcScopeResolver.ResolveRequiredScope` (`src/MxGateway.Server/Security/Authorization/GatewayGrpcScopeResolver.cs:21-22`). 
`InvokeWrite` matches the existing `MxCommandKind.Write*` mapping because ack mutates alarm state; `EventsRead` matches `StreamEventsRequest` and `MxCommandKind.DrainEvents` because querying active alarms reads the same alarm/event surface. Extended `GatewayGrpcScopeResolverTests` with two new `InlineData` rows covering both request types (`src/MxGateway.Tests/Security/Authorization/GatewayGrpcScopeResolverTests.cs:16-17`) and added four interceptor-level cases in `GatewayGrpcAuthorizationInterceptorTests` (`UnaryServerHandler_AcknowledgeAlarmMissingScope_ReturnsPermissionDenied`, `UnaryServerHandler_AcknowledgeAlarmWithScope_RunsHandler`, `ServerStreamingServerHandler_QueryActiveAlarmsMissingScope_ReturnsPermissionDenied`, `ServerStreamingServerHandler_QueryActiveAlarmsWithScope_RunsHandler`) proving each new RPC denies callers lacking the chosen scope and runs the handler when the scope is held. Updated `docs/Authorization.md` (resolver snippet and Scope Catalog table) to list both RPCs against their scopes. `dotnet test ... --filter FullyQualifiedName~GatewayGrpcAuthorizationInterceptorTests` → 14 passed, 0 failed; resolver tests 28 passed, 0 failed. + +### Server-018 + +| Field | Value | +|---|---| +| Severity | Low | +| Category | Performance & resource management | +| Location | `src/MxGateway.Server/Galaxy/GalaxyGlobMatcher.cs:15` | +| Status | Resolved | + +**Description:** `GalaxyGlobMatcher.RegexCache` is a `ConcurrentDictionary` keyed by glob pattern, with no eviction. The fix for Server-008 added this cache deliberately to avoid recompiling the same handful of patterns, but the cache key is the raw glob string. 
The patterns currently come from two sources — `DiscoverHierarchyRequest.TagNameGlob` (client-supplied) and `ApiKeyConstraints.BrowseSubtrees` / `ReadSubtrees` / `WriteSubtrees` / `ReadTagGlobs` / `WriteTagGlobs` (admin-configured) — and `BuildRegex` also runs each glob through `Regex.Escape` so an attacker cannot craft a denial-of-service ReDoS payload. The leak is therefore bounded only by "how many distinct globs a client can submit over the process lifetime", which is in the millions for `TagNameGlob` if a client iterates through generated names. Each compiled `Regex` also holds a JIT'd assembly that is non-trivial to reclaim. + +**Recommendation:** Cap the cache at a small bound (e.g. 256 patterns) using a simple LRU or a `MemoryCache` with sliding expiration, or restrict the cache to globs that originate from API-key constraints (admin-controlled, naturally bounded) and pay the compile cost for client-supplied globs. Add a test that fills the cache with thousands of distinct globs and asserts the cache size stays bounded. + +**Resolution:** 2026-05-20 — Capped `GalaxyGlobMatcher`'s compiled-regex cache at `RegexCacheCapacity = 256` entries with FIFO-by-insertion eviction (`src/MxGateway.Server/Galaxy/GalaxyGlobMatcher.cs`). A `ConcurrentQueue` tracks insertion order; when the cache grows past the cap, `EvictIfOverCapacity` takes a small lock and dequeues + removes the oldest entries until the count is back within bound. Reads stay lock-free (the lock guards only the eviction path). Internal `CurrentCacheSize` / `RegexCacheCapacity` accessors are surfaced through the existing `InternalsVisibleTo("MxGateway.Tests")` so tests can assert the bound. Regression test: `GalaxyFilterInputSafetyTests.GlobMatcher_WithManyDistinctPatterns_CacheStaysBounded` submits `RegexCacheCapacity * 4` distinct globs and asserts `CurrentCacheSize` stays in `[0, RegexCacheCapacity]`. 
Existing glob correctness tests (`GlobMatcher_RepeatedAndInterleavedPatterns_StayCorrect`, the adversarial-input theories) continue to pass, confirming eviction does not corrupt lookups. + +### Server-019 + +| Field | Value | +|---|---| +| Severity | Low | +| Category | Correctness & logic bugs | +| Location | `src/MxGateway.Server/Sessions/WorkerAlarmRpcDispatcher.cs:183-221` | +| Status | Resolved | + +**Description:** `WorkerAlarmRpcDispatcher.QueryActiveAlarmsAsync` returns `yield break` (line 191) when `sessionRegistry.TryGet(request.SessionId, ...)` fails — it silently produces an empty stream with no diagnostic. The peer `AcknowledgeAsync` instead returns an `AcknowledgeAlarmReply` with `ProtocolStatus.Code = SessionNotFound` (lines 81-89), so the two methods have inconsistent missing-session handling. In production this branch is unreachable because `MxAccessGatewayService.QueryActiveAlarms` calls `ResolveSession(...)` first and throws `NotFound` from the gRPC layer (`MxAccessGatewayService.cs:228`), but: (a) the dispatcher is the seam other code paths might reach in the future, and (b) any unit test that instantiates the dispatcher directly with a missing session id sees an empty stream rather than a clear error, which is a footgun. + +**Recommendation:** Either throw a `SessionManagerException(SessionManagerErrorCode.SessionNotFound, ...)` (matching the gRPC service's own resolver) or yield a single `ActiveAlarmSnapshot` with a diagnostic field set, and add a `WorkerAlarmRpcDispatcherTests` case that asserts whichever shape is chosen. Aligning with `AcknowledgeAsync`'s `SessionNotFound` protocol-status pattern is preferred, but `QueryActiveAlarms` is a server-streaming RPC so a thrown `SessionManagerException` propagated by the gateway is the cleaner fit. 
+ +**Resolution:** 2026-05-20 — Took the throwing option (the one the recommendation calls the cleaner fit for a server-streaming RPC): `WorkerAlarmRpcDispatcher.QueryActiveAlarmsAsync` (`src/MxGateway.Server/Sessions/WorkerAlarmRpcDispatcher.cs`) now throws `SessionManagerException(SessionManagerErrorCode.SessionNotFound, ...)` instead of `yield break`-ing when the session is missing. `MxAccessGatewayService.MapException` already maps that error code to gRPC `NotFound`, so production callers see a consistent missing-session response and a direct unit-test caller now gets a clear error instead of an empty success. The unary peer `AcknowledgeAsync` continues to surface the same condition as an in-band `ProtocolStatus.Code = SessionNotFound`, which is correct for a unary RPC. Regression test: `WorkerAlarmRpcDispatcherTests.QueryActiveAlarmsAsync_WhenSessionMissing_ThrowsSessionNotFound` replaces the prior `_YieldsEmpty` assertion — it asserts the new exception shape and also exercises `AcknowledgeAsync` with the same missing session id to pin the peer-method parity. + +### Server-020 + +| Field | Value | +|---|---| +| Severity | Low | +| Category | Code organization & conventions | +| Location | `src/MxGateway.Server/Dashboard/Components/Pages/DashboardHome.razor:1-2`, `…/GalaxyPage.razor:1-2`, `…/ApiKeysPage.razor:1-2`, `…/EventsPage.razor:1-2`, `…/SessionsPage.razor:1-2`, `…/WorkersPage.razor:1-2`, `…/SettingsPage.razor:1-2`, `…/SessionDetailsPage.razor:1-2` | +| Status | Resolved | + +**Description:** Every dashboard page declares two `@page` directives — `@page "/X"` AND `@page "/dashboard/X"` — even though `DashboardEndpointRouteBuilderExtensions.MapGatewayDashboard` mounts the Razor components under a `RouteGroupBuilder` with `pathBase = "/dashboard"`. The group prefix is prepended to each `@page` route, so the actual endpoints become `/dashboard/X` (from `@page "/X"`) **and** `/dashboard/dashboard/X` (from `@page "/dashboard/X"`). The pages are reachable at two URLs each, and the deeper one (`/dashboard/dashboard/sessions` etc.)
is almost certainly accidental — it leaks the path-base name into the URL and creates duplicate authorize/render work per route. `GatewayApplicationTests.Build_WhenDashboardEnabled_ComponentRoutesRequireAuthorization` only checks the `/dashboard/X` shape, so the duplicate route slipped through without an assertion. + +**Recommendation:** Drop the `@page "/dashboard/X"` directive from each page; rely on the `MapGroup("/dashboard")` to provide the prefix. Or, if the team genuinely wants both URL shapes, document the choice in the file header and extend the route-enumeration test to assert that **both** are present (and both carry the authorization policy). Either way, the current setup is non-obvious. + +**Resolution:** 2026-05-20 — Took the recommended drop: removed the redundant `@page "/dashboard/X"` directive from every dashboard Razor page (`DashboardHome.razor`, `SessionsPage.razor`, `WorkersPage.razor`, `EventsPage.razor`, `GalaxyPage.razor`, `SettingsPage.razor`, `ApiKeysPage.razor`, `SessionDetailsPage.razor`). Each page now declares only its bare route (e.g. `@page "/sessions"`); `DashboardEndpointRouteBuilderExtensions.MapGatewayDashboard` continues to prepend `/dashboard` via `MapGroup`, so each page is reachable at exactly one URL (`/dashboard/X`). Regression test: `GatewayApplicationTests.Build_WhenDashboardEnabled_DoesNotRegisterDoubledDashboardPrefixRoutes` enumerates the eight previously-doubled routes (`/dashboard/dashboard/`, `/dashboard/dashboard/sessions`, ... `/dashboard/dashboard/sessions/{SessionId}`) and asserts none of them are mapped. The existing `..._MapsBlazorDashboardAndAuthEndpoints` / `..._ComponentRoutesRequireAuthorization` tests continue to verify the desired `/dashboard/X` shapes are still present and policy-gated. No public URL contract changed (the doubled shape was accidental); no doc update needed — `gateway.md` and `docs/GatewayDashboardDesign.md` never referenced the doubled routes. 
+ +### Server-021 + +| Field | Value | +|---|---| +| Severity | Medium | +| Category | Testing coverage | +| Location | `src/MxGateway.Server/Grpc/MxAccessGatewayService.cs:266-664`, `src/MxGateway.Tests/Gateway/Grpc/MxAccessGatewayServiceTests.cs` | +| Status | Resolved | + +**Description:** The 1cd51bb commit history (the bulk read/write series, `f220908`/`5e375f6`/`758aca2`) added 473 lines of constraint-filtering and reply-merging logic to `MxAccessGatewayService`: `ApplyConstraintsAsync` (line 266), `EnforceReadTagAsync` / `EnforceWriteHandleAsync`, `FilterTagBulkAsync` / `FilterReadBulkAsync` / `FilterWriteBulkAsync` / `FilterHandleBulkAsync`, the `ReplaceWriteBulkEntries` switch, and three concrete `BulkConstraintPlan` records (`SubscribeBulkConstraintPlan`, `WriteBulkConstraintPlan`, `ReadBulkConstraintPlan`) that splice denied entries back into the worker's allowed-only reply in original-index order. None of this is covered by `MxAccessGatewayServiceTests` — its `FakeSessionManager` is wired with an `AllowAllConstraintEnforcer` (line 430) that never denies anything, so every constraint-related code path is dead at test time. A subtle off-by-one in `BuildMerged`, a wrong `PayloadOneofCase` in `GetPayload` / `SetPayload`, or a missing case in `ReplaceWriteBulkEntries` would all ship without a test failure. + +**Recommendation:** Add `MxAccessGatewayServiceTests` cases that inject a deny-on-glob `IConstraintEnforcer` and exercise: (1) `AddItemBulk` / `SubscribeBulk` / `AdviseItemBulk` with a mix of allowed and denied tags, asserting `BulkSubscribeReply.Results` interleaves denied and worker-allowed entries in original-index order; (2) the same for `ReadBulk` and each of the four bulk-write commands; (3) `HasAllowedItems == false` so `CreateDeniedReply` is exercised (no worker call); (4) the unary `Write`/`Write2`/`WriteSecured`/`WriteSecured2` paths through `EnforceWriteHandleAsync`. 
The fixtures can reuse the existing `FakeSessionManager` by replacing the constraint enforcer; no live worker is needed. + +**Resolution:** 2026-05-20 — Added a configurable `PredicateConstraintEnforcer` test double (`src/MxGateway.Tests/TestSupport/PredicateConstraintEnforcer.cs`) that denies on per-tag and per-handle predicates and records denials. Added 11 new tests in `src/MxGateway.Tests/Gateway/Grpc/MxAccessGatewayServiceConstraintTests.cs` covering: (1) `AddItemBulk` with mixed denials — asserts the worker is called once with only the allowed subset and the merged reply interleaves denied and worker-allowed `SubscribeResult`s at their original indices; (2) `SubscribeBulk` with every tag denied — asserts that `HasAllowedItems == false` short-circuits to `CreateDeniedReply` and the session manager is never invoked; (3) `AdviseItemBulk` (handle-keyed denial via `CheckReadHandleAsync`); (4) `SubscribeBulk` with the allow-all enforcer — pass-through regression guard; (5) `ReadBulk` partial denial — asserts the `BulkReadConstraintPlan` produces a `BulkReadReply` (not a `BulkSubscribeReply`) with denied entries spliced in at their original indices; (6) `ReadBulk` all-denied short-circuit; (7) `WriteBulk` partial denial — asserts denied entries are dropped from the forwarded `Entries` and the merged reply preserves original-index order; (8) `WriteSecuredBulk` all-denied — proves the second `ReplaceWriteBulkEntries` switch arm is reachable; (9) unary `Write` with denied handle → `PermissionDenied`, no worker call, denial recorded; (10) unary `WriteSecured` with denied handle → `PermissionDenied`; (11) unary `AddItem` with denied tag → `PermissionDenied` (`EnforceReadTagAsync`). `MxAccessGatewayServiceTests.CreateService` updated to accept an `IConstraintEnforcer` so future tests can opt into the deny enforcer without duplicating the wiring. All 11 new tests pass; full suite (`dotnet test src/MxGateway.Tests/MxGateway.Tests.csproj`) is green at 458 passing.
+ +### Server-022 + +| Field | Value | +|---|---| +| Severity | Low | +| Category | Documentation & comments | +| Location | `src/MxGateway.Server/Sessions/IAlarmRpcDispatcher.cs:8-29` | +| Status | Resolved | + +**Description:** Server-014's resolution noted that the stale "PR A.6 / A.7" / "not yet wired" language was rewritten on `MxAccessGatewayService.AcknowledgeAlarm` / `QueryActiveAlarms` and on the `WorkerAlarmRpcDispatcher` class doc. The corresponding XML doc on the **interface** `IAlarmRpcDispatcher` (lines 8-29) still says it is "PR A.6 / A.7 — gateway-side dispatcher" and that "Production implementations live in `WorkerAlarmRpcDispatcher` (this PR ships a not-yet-wired default that returns a clear worker-pending diagnostic)". That second clause directly contradicts the now-correct comments on the concrete implementations and on the gRPC service: `WorkerAlarmRpcDispatcher` is the wired default, not a not-yet-wired one. A reader who finds the interface first will believe the dispatcher is non-functional. + +**Recommendation:** Rewrite the `IAlarmRpcDispatcher` `<remarks>` block to match the language now used on `WorkerAlarmRpcDispatcher` and on the gRPC service: DI binds `WorkerAlarmRpcDispatcher` by default; `NotWiredAlarmRpcDispatcher` is only the null fallback for tests/DI omission. Drop the "PR A.6 / A.7" prefix from the `<summary>` — the interface is now the public alarm-RPC seam.
+ +**Resolution:** 2026-05-20 — Rewrote `IAlarmRpcDispatcher`'s `<summary>` and `<remarks>` (`src/MxGateway.Server/Sessions/IAlarmRpcDispatcher.cs`) to match the language now used on `WorkerAlarmRpcDispatcher` and on `MxAccessGatewayService.AcknowledgeAlarm` / `QueryActiveAlarms`: dropped the stale "PR A.6 / A.7" prefix from the summary, and replaced the "this PR ships a not-yet-wired default that returns a clear worker-pending diagnostic" clause with the correct statement that DI binds the production `WorkerAlarmRpcDispatcher` by default and `NotWiredAlarmRpcDispatcher` is only the null fallback for DI omission / standalone tests. Pure documentation change; no test. diff --git a/code-reviews/Tests/findings.md b/code-reviews/Tests/findings.md index f624b1c..1a0d0b0 100644 --- a/code-reviews/Tests/findings.md +++ b/code-reviews/Tests/findings.md @@ -4,8 +4,8 @@ |---|---| | Module | `src/MxGateway.Tests` | | Reviewer | Claude Code | -| Review date | 2026-05-18 | -| Commit reviewed | `6c64030` | +| Review date | 2026-05-20 | +| Commit reviewed | `1cd51bb` | | Status | Reviewed | | Open findings | 0 | @@ -13,16 +13,16 @@ | # | Category | Result | |---|---|---| -| 1 | Correctness & logic bugs | Issue found: Tests-001 (`FakeSessionManager.TryGetSession` always returns true), Tests-011 (unobserved worker task). | -| 2 | mxaccessgw conventions | FakeWorkerHarness used per docs; no real secrets; minor style drift in three alarm-test files (Tests-008). | -| 3 | Concurrency & thread safety | Issues found: Tests-006 (`Task.Delay`-based timing), Tests-012 (no parallelism guard for `WebApplication` tests). | -| 4 | Error handling & resilience | Strong — timeouts, faults, overflow, kill paths, protocol violations all exercised. No issues found. | -| 5 | Security | Issues found: Tests-002 (no SQL-injection coverage of Galaxy RPCs), Tests-010 (anonymous-localhost negative cases untested).
| -| 6 | Performance & resource management | Issue found: Tests-003 (temp DB/worker directories never cleaned up). | +| 1 | Correctness & logic bugs | Issue found: Tests-015 (`FakeWorkerProcess.WaitForExitAsync` mutates `HasExited`, weakening the smoke test assertion). | +| 2 | mxaccessgw conventions | No new issues. Style/convention drift previously filed has been resolved. | +| 3 | Concurrency & thread safety | Issue found: Tests-017 (`HeartbeatMonitor_WhenHeartbeatExpires_FaultsClient` still on real wall-clock). | +| 4 | Error handling & resilience | Strong — timeouts, faults, overflow, kill paths, protocol violations all exercised. No new issues found. | +| 5 | Security | No new issues. `Galaxy` adversarial-input safety (Tests-002), dashboard anonymous-localhost negatives (Tests-010), and interceptor composition (Tests-004) all resolved in the prior pass. | +| 6 | Performance & resource management | Issue found: Tests-014 (`WebApplication` instances built by `GatewayApplicationTests` and `DashboardCookieOptionsTests` are never disposed). | | 7 | Design-document adherence | Tests match `docs/GatewayTesting.md`; no drift found. No issues found. | -| 8 | Code organization & conventions | Issue found: Tests-007 (`TestServerCallContext` copy-pasted into 4+ files). | -| 9 | Testing coverage | Issues found: Tests-001, Tests-004 (no end-to-end interceptor+service test), Tests-005 (no worker-crash-mid-command coverage), Tests-002. | -| 10 | Documentation & comments | Issue found: Tests-009 (stale/mismatched XML `` comments). | +| 8 | Code organization & conventions | Issue found: Tests-018 (`DateTimeOffset.Parse` calls without `CultureInfo.InvariantCulture`). | +| 9 | Testing coverage | Issues found: Tests-013 (eight new `GatewaySession.*BulkAsync` methods untested), Tests-016 (a Galaxy cache unit test performs a real network connect attempt). 
| +| 10 | Documentation & comments | Issue found: Tests-019 (the `Re-triage note` paragraphs added to Tests-002/006/008 only live inside `findings.md` — `docs/GatewayTesting.md` is not updated to describe the in-memory Galaxy filter safety tests added under that finding). | ## Findings @@ -211,3 +211,108 @@ **Recommendation:** Add an `xunit.runner.json` or a collection grouping the `WebApplication`-building tests, and keep the `:0` ephemeral-port convention explicit so future tests do not introduce a fixed-port collision. **Resolution:** Resolved 2026-05-18: added `src/MxGateway.Tests/xunit.runner.json` making the parallelism policy explicit (`parallelizeTestCollections: true`, `maxParallelThreads: -1`, `parallelizeAssembly: false`, `longRunningTestSeconds: 30`) and wired it into `MxGateway.Tests.csproj` as `` so the runner picks it up (confirmed present in `bin/Debug/net10.0/`). Added a comment at the only `WebApplication`-building call site (`GatewayApplicationTests.cs`, `--urls=http://127.0.0.1:0`) documenting that the ephemeral-port (`:0`) convention is mandatory because test collections run in parallel. No fixed-port binding exists today; this is a preventative guardrail as the finding recommends. + +### Tests-013 + +| Field | Value | +|---|---| +| Severity | Medium | +| Category | Testing coverage | +| Location | `src/MxGateway.Server/Sessions/GatewaySession.cs:449-679`, `src/MxGateway.Tests/Gateway/Sessions/SessionManagerTests.cs` | +| Status | Resolved | + +**Description:** `GatewaySession` exposes eleven bulk methods (`AddItemBulkAsync`, `AdviseItemBulkAsync`, `RemoveItemBulkAsync`, `UnAdviseItemBulkAsync`, `SubscribeBulkAsync`, `UnsubscribeBulkAsync`, `WriteBulkAsync`, `Write2BulkAsync`, `WriteSecuredBulkAsync`, `WriteSecured2BulkAsync`, `ReadBulkAsync`) but only three (`SubscribeBulkAsync`, `WriteBulkAsync`, `ReadBulkAsync`) are exercised in `SessionManagerTests`. A grep across `src/MxGateway.Tests` for the other eight method names returns zero matches. 
The recent commit `eaa7093` ("register the five new bulk subcommands in `IsKnownGatewayCommand`") explicitly added bulk surface to the gateway, and `1cd51bb` added stress benchmarks for it, but the gateway-side tests do not pin the command-kind, payload-shape, or `WriteSecured*Bulk` credential-redaction behaviour for any of the new bulk variants. A future regression in `WriteSecuredBulkAsync` body construction would not be caught by the gateway unit suite. + +**Recommendation:** Mirror the existing `SubscribeBulkAsync` / `WriteBulkAsync` / `ReadBulkAsync` test pattern for the eight missing methods: each test should `OpenSessionAsync`, invoke the bulk API, assert the worker received exactly one `WorkerCommand` of the matching `MxCommandKind`, and (for the secured variants) confirm the credential payload survives the round-trip without being log-redacted from the over-the-wire command shape. + +**Resolution:** Resolved 2026-05-20: added `src/MxGateway.Tests/Gateway/Sessions/SessionManagerBulkTests.cs` with per-method coverage for all eleven bulk entry points. Each method now has a round-trip test that pins (a) the exact `MxCommandKind` sent to the worker, (b) the payload shape (server handle, item handles / tag addresses / entries, timeout for `ReadBulk`), and (c) per-entry failure surfacing where the reply contains a mix of `WasSuccessful = true`/`false` results with an `ErrorMessage`. Each method also has a `*_PropagatesCancellation` test that pre-cancels the token and asserts `OperationCanceledException` flows out. The secured variants additionally pin that `CurrentUserId` / `VerifierUserId` survive the over-the-wire command shape unchanged (the gateway's redaction rules apply only to logs, not to the command body the worker receives). New tests use a local `FakeBulkWorkerClient` keyed by `MxCommand.Kind`-specific replies; no production-code change. 
All 54 SessionManager/GalaxyHierarchyCache tests pass with `dotnet test --filter "FullyQualifiedName~SessionManager|FullyQualifiedName~GalaxyHierarchyCache"`. + +### Tests-014 + +| Field | Value | +|---|---| +| Severity | Low | +| Category | Performance & resource management | +| Location | `src/MxGateway.Tests/Gateway/GatewayApplicationTests.cs:18,33,44,62,81,105`, `src/MxGateway.Tests/Gateway/Dashboard/DashboardCookieOptionsTests.cs:17` | +| Status | Resolved | + +**Description:** Seven `[Fact]` methods build a real `WebApplication` via `GatewayApplication.Build([])` and never dispose it. `WebApplication` is `IAsyncDisposable`; constructing one stands up a full DI container, an OpenTelemetry meter (`GatewayMetrics`), Kestrel server objects, hosted services, and logging providers. Because the suite runs test collections in parallel (per the new `xunit.runner.json` from Tests-012), every undisposed instance keeps its meter/loggers/hosted services alive until the test process exits, doubling up live Meter instances each time and silently extending the memory/handle footprint of an `xunit` run. Only the two tests that actually call `app.StartAsync()` (`GatewayApplicationTests.StartAsync_InvalidGatewayConfiguration_FailsStartup` and `SqliteAuthStoreTests.StartAsync_NewerSchemaVersion_BlocksStartup`) currently use `await using`. + +**Recommendation:** Promote each `WebApplication app = GatewayApplication.Build(...)` to `await using WebApplication app = ...` and make the containing test method `async Task`. The endpoint-listing assertions do not need `await`, but the `await using` will ensure the DI container, meter, and hosted services are torn down per-test. 
+ +**Resolution:** 2026-05-20 — Promoted all seven `WebApplication`-building tests (six in `GatewayApplicationTests` plus the one in `DashboardCookieOptionsTests`) to `async Task` with `await using WebApplication app = GatewayApplication.Build(...)`, so the DI container, `GatewayMetrics` meter, hosted services, and Kestrel objects are torn down per-test rather than leaking until process exit. The previously already-`await using` `StartAsync_InvalidGatewayConfiguration_FailsStartup` was unchanged. Full suite green. + +### Tests-015 + +| Field | Value | +|---|---| +| Severity | Low | +| Category | Correctness & logic bugs | +| Location | `src/MxGateway.Tests/Gateway/GatewayEndToEndFakeWorkerSmokeTests.cs:374-379,87` | +| Status | Resolved | + +**Description:** The nested `FakeWorkerProcess.WaitForExitAsync` implementation unconditionally sets `HasExited = true` and `ExitCode ??= 0` when called, regardless of whether the scripted worker actually completed the shutdown handshake. The smoke-test assertion `Assert.True(launcher.Process.HasExited)` therefore cannot distinguish "the scripted worker received `WorkerShutdown`, sent `WorkerShutdownAck`, and called `MarkExited(0)`" from "the gateway code path simply awaited `WaitForExitAsync` somewhere during teardown". The scripted worker happens to call `MarkExited(0)` after receiving the shutdown frame, but a regression that bypassed the shutdown-ack path entirely would still pass this assertion. The companion launcher in `SessionWorkerClientFactoryFakeWorkerTests.FakeWorkerProcess.WaitForExitAsync` (lines 351-356) has the same shape — fine there because no exit assertion is made — but the smoke test relies on this signal. 
+ +**Recommendation:** Make `WaitForExitAsync` await an internal `TaskCompletionSource` that is only completed by `Kill()` or `MarkExited()` (the same pattern `WorkerClientTests.FakeWorkerProcess` already uses for `_exited`), so `HasExited` reflects actual exit and the smoke test's assertion is meaningful. + +**Resolution:** 2026-05-20 — Rewrote the smoke-test `FakeWorkerProcess` to back `WaitForExitAsync` with a `TaskCompletionSource _exited` that is only completed inside `MarkExited` (called by the scripted worker after sending `WorkerShutdownAck`) or `Kill` (which calls `MarkExited(-1)`), removing the "set `HasExited = true` and return immediately" cheat. The smoke test now also asserts `Assert.Equal(0, launcher.Process.ExitCode)` — `MarkExited(0)` is reachable only via the shutdown-ack branch, so a regression that bypassed the ack path would produce a non-zero (or null) exit code and fail the assertion deterministically. `WorkerClient.ShutdownAsync` calls `WaitForProcessExitAsync`, which now genuinely awaits the scripted worker's ack. + +### Tests-016 + +| Field | Value | +|---|---| +| Severity | Medium | +| Category | Testing coverage | +| Location | `src/MxGateway.Tests/Galaxy/GalaxyHierarchyCacheTests.cs:29-41,115-124` | +| Status | Resolved | + +**Description:** `RefreshAsync_WhenSqlIsUnreachable_MarksUnavailableAndDoesNotPublish` is in the unit-test project but exercises a real `GalaxyHierarchyCache`/`GalaxyRepository` against a hard-coded TCP socket `127.0.0.1:65500` with a one-second connect timeout. Per `docs/GatewayTesting.md`, live Galaxy coverage belongs in `MxGateway.IntegrationTests` and is gated by `MXGATEWAY_RUN_LIVE_GALAXY_TESTS=1`; this test is neither gated nor uses a stub repository. On most boxes the connect fails closed (the test passes), but the outcome depends on OS-level "connection refused" vs "no route to host" behaviour and is sensitive to environments where 127.0.0.1:65500 happens to be bound — a real flakiness source. 
It also breaks the gateway-without-MXAccess invariant in spirit (the gateway code path under test does I/O the unit project should not need). + +**Recommendation:** Either (a) replace the real repository with an in-test fake that throws a `SqlException`/`TimeoutException` from `GetHierarchyAsync`, exercising `GalaxyHierarchyCache.RefreshAsync`'s exception path directly; or (b) move the test to `MxGateway.IntegrationTests` and gate it behind a "no-live-DB-required" variant of the live-Galaxy attribute. (a) is preferred because the production path being tested is the cache's reaction to a repository exception, not socket behaviour. + +**Resolution:** Resolved 2026-05-20: applied option (a). Introduced `src/MxGateway.Server/Galaxy/IGalaxyRepository.cs` with the four methods the cache consumes (`TestConnectionAsync`, `GetLastDeployTimeAsync`, `GetHierarchyAsync`, `GetAttributesAsync`); made `GalaxyRepository` implement it; changed `GalaxyHierarchyCache`'s constructor to depend on `IGalaxyRepository` rather than the concrete type; and registered the interface against the existing concrete singleton in `GalaxyRepositoryServiceCollectionExtensions.AddGalaxyRepository`. Rewrote the test as `RefreshAsync_WhenRepositoryThrows_MarksUnavailableAndDoesNotPublish` using a local `ThrowingGalaxyRepository : IGalaxyRepository` that throws an `InvalidOperationException` from `GetLastDeployTimeAsync` (the first call the cache makes against the repository). The test now exercises the cache's exception branch directly — no TCP I/O — and additionally asserts that `GetHierarchyAsync`/`GetAttributesAsync` are NOT invoked once the deploy-time probe has failed. `Current_BeforeAnyRefresh_ReturnsEmpty` was migrated to the same fake. The unreachable `CreateCache` helper that built a real `GalaxyRepository` against `127.0.0.1:65500` was removed. 
The Galaxy SQL surface itself stays covered by `MxGateway.IntegrationTests.Galaxy.GalaxyRepositoryLiveTests` (gated by `MXGATEWAY_RUN_LIVE_GALAXY_REPOSITORY_TESTS=1`). + +### Tests-017 + +| Field | Value | +|---|---| +| Severity | Low | +| Category | Concurrency & thread safety | +| Location | `src/MxGateway.Tests/Gateway/Workers/WorkerClientTests.cs:346-364` | +| Status | Resolved | + +**Description:** `HeartbeatMonitor_WhenHeartbeatExpires_FaultsClient` configures `HeartbeatGrace = 80 ms` and `HeartbeatCheckInterval = 20 ms`, then asserts the client faults within the 5-second `TestTimeout`. The test compares against the real wall clock — the heartbeat monitor reads `TimeProvider.System` for the grace check. After Tests-006 migrated the other heartbeat tests to an injected `ManualTimeProvider` for determinism, this one is now the only `WorkerClientTests` heartbeat case that still rides the wall clock. The 5-second outer bound makes a false failure unlikely, but the test cannot fail fast when the heartbeat-monitor logic regresses — it just waits the full 5 seconds. + +**Recommendation:** Inject the same `ManualTimeProvider` used by `ReadLoop_WhenHeartbeatArrives_UpdatesLastHeartbeatAndWorkerProcess`, then `clock.Advance(TimeSpan.FromSeconds(2))` past the grace and assert the fault deterministically. The `HeartbeatCheckInterval` (20 ms) timer fire can stay on the real clock; what needs to be deterministic is the grace comparison. + +**Resolution:** 2026-05-20 — `HeartbeatMonitor_WhenHeartbeatExpires_FaultsClient` now constructs a `ManualTimeProvider` seeded at `"2026-05-20T12:00:00Z"`, passes it to `CreateClient` via the existing `timeProvider` parameter, and calls `clock.Advance(TimeSpan.FromSeconds(2))` after the handshake. `WorkerClient.MarkReady` records `_lastHeartbeatAt` from the manual clock, so the next 20 ms `HeartbeatCheckInterval` tick observes `now - lastHeartbeat = 2s > 80ms grace` and faults deterministically. 
The check-interval timer stays on the real clock as the finding recommended; only the grace comparison is deterministic. + +### Tests-018 + +| Field | Value | +|---|---| +| Severity | Low | +| Category | Code organization & conventions | +| Location | `src/MxGateway.Tests/Galaxy/GalaxyHierarchyCacheTests.cs:32`, `src/MxGateway.Tests/Gateway/Dashboard/DashboardSnapshotServiceTests.cs:45,51,57,105,134,163,167,202-209,284,317,523`, `src/MxGateway.Tests/Gateway/Sessions/SessionManagerTests.cs:40` | +| Status | Resolved | + +**Description:** Several tests parse ISO-8601 literals with `DateTimeOffset.Parse("2026-04-26T10:00:00Z")` without an explicit `CultureInfo.InvariantCulture`. `Directory.Build.props` enables `TreatWarningsAsErrors`, but CA1305 (specify `IFormatProvider`) is not currently raised because the tests don't trigger it; nevertheless, `DateTimeOffset.Parse` without a culture takes `CurrentCulture`, and on a locale whose `DateTimeFormatInfo` rejects the `Z` suffix or uses non-Gregorian calendar conventions, these parses can throw at test time. `WorkerClientTests.cs:327` and `FakeWorkerHarnessTests.cs:121` already added `System.Globalization.CultureInfo.InvariantCulture` in the Tests-006 fix; the other ~15 call sites did not get the same treatment. + +**Recommendation:** Add `CultureInfo.InvariantCulture` to every `DateTimeOffset.Parse(...)` call in `MxGateway.Tests`, or replace with `DateTimeOffset.ParseExact` against the literal `"O"` round-trip format. A single-line `using System.Globalization;` per file keeps the call sites concise. 
+ +**Resolution:** 2026-05-20 — Added `CultureInfo.InvariantCulture` to every `DateTimeOffset.Parse` site in `MxGateway.Tests` that lacked it: 16 call sites in `DashboardSnapshotServiceTests.cs` (a new `using System.Globalization;` was added so the call sites stay concise) and one in `SessionManagerTests.cs` (using the fully-qualified `System.Globalization.CultureInfo.InvariantCulture` to match the in-file style of the existing `ManualTimeProvider` parse sites). `GalaxyHierarchyCacheTests.cs:36` was already correct from the Tests-016 rewrite. A final grep confirms every `DateTimeOffset.Parse`/`DateTime.Parse` call in `src/MxGateway.Tests` now passes `CultureInfo.InvariantCulture`. + +### Tests-019 + +| Field | Value | +|---|---| +| Severity | Low | +| Category | Documentation & comments | +| Location | `docs/GatewayTesting.md`, `code-reviews/Tests/findings.md` (Tests-002 re-triage) | +| Status | Resolved | + +**Description:** The Tests-002 re-triage (2026-05-18) confirmed there is no SQL-injection surface in `GalaxyRepository` because filters are applied in memory by `GalaxyHierarchyProjector`/`GalaxyGlobMatcher` against the cached snapshot, and added 10 adversarial-input tests in `src/MxGateway.Tests/Galaxy/GalaxyFilterInputSafetyTests.cs`. That explanation lives only in the findings file; `docs/GatewayTesting.md` does not mention `GalaxyFilterInputSafetyTests`, the in-memory filter model, or the adversarial-input matrix. A future reader of the test docs will not know which tests pin the literal-filter behaviour or why the Galaxy SQL layer is not unit-tested for parameterisation. Per `CLAUDE.md` ("Update docs in the same change as the source. When public APIs, contracts, configuration, build steps, security behavior, event shapes, value conversion, status mapping, or lifecycle rules change, the affected docs must change in the same commit"), the Galaxy security-behaviour decision warrants a paragraph in `GatewayTesting.md`. 
+ +**Recommendation:** Add a short subsection to `docs/GatewayTesting.md` (probably under "Focused Commands" or a new "Galaxy Filter Safety" section) that names `GalaxyFilterInputSafetyTests`, explains that Galaxy filtering happens in memory against the cached hierarchy (so the SQL surface is constant), and lists the adversarial-input invariants the suite pins (`%`, `_`, `'`, `;`, `[abc]` are literals; the glob regex has a 100 ms timeout against pathological input). + +**Resolution:** 2026-05-20 — Added a "Galaxy Filter Safety" section to `docs/GatewayTesting.md` (immediately after "Live Galaxy Repository", before "Live LDAP") that names `GalaxyFilterInputSafetyTests`, re-frames the Tests-002 finding (the Galaxy SQL surface is constant — `HierarchySql`, `AttributesSql`, `SELECT 1`, `SELECT time_of_last_deploy FROM galaxy`), explains that all filters are applied in memory by `GalaxyHierarchyProjector` / `GalaxyGlobMatcher`, lists the adversarial-input matrix (`'`, `' OR '1'='1`, `'; DROP TABLE gobject;--`, `%`, `_`, `100%_off`, `[abc]`, `Pump'001`), and enumerates the invariants the suite pins (SQL metacharacters are opaque literals, only `*`/`?` are glob wildcards, the matcher has a 100 ms regex timeout against pathological input, the projector returns zero matches / `NotFound` rather than the whole hierarchy, and the `DiscoverHierarchy` RPC end-to-end returns zero matches for adversarial globs). 
diff --git a/code-reviews/Worker.Tests/findings.md b/code-reviews/Worker.Tests/findings.md index 5943072..bc89746 100644 --- a/code-reviews/Worker.Tests/findings.md +++ b/code-reviews/Worker.Tests/findings.md @@ -4,13 +4,15 @@ |---|---| | Module | `src/MxGateway.Worker.Tests` | | Reviewer | Claude Code | -| Review date | 2026-05-18 | -| Commit reviewed | `6c64030` | +| Review date | 2026-05-20 | +| Commit reviewed | `1cd51bb` | | Status | Reviewed | | Open findings | 0 | ## Checklist coverage +### 2026-05-18 review (commit `6c64030`) + | # | Category | Result | |---|---|---| | 1 | Correctness & logic bugs | Issues found: Worker.Tests-010 (weak substring assertion), Worker.Tests-011 (test name overstates what it proves). | @@ -24,6 +26,21 @@ | 9 | Testing coverage | Issues found: Worker.Tests-001 (`StaMessagePump` untested), Worker.Tests-002 (COM-event delivery untested), Worker.Tests-012 (frame-validation gaps). | | 10 | Documentation & comments | Issues found: Worker.Tests-008 (misplaced redaction test), Worker.Tests-011 (misleading test name). | +### 2026-05-20 re-review (commit `1cd51bb`) + +| # | Category | Result | +|---|---|---| +| 1 | Correctness & logic bugs | Issues found: Worker.Tests-018 (silent-skip masquerades as passing tests), Worker.Tests-024 (`Subscribe_WhenUnderlyingSubscribeThrows_DisposesConsumer` swallows the real exception type). | +| 2 | mxaccessgw conventions | Issues found: Worker.Tests-019 (`AlarmsLiveSmokeTests` uses `snake_case` outside the alarm-method scope Worker.Tests-009 corrected); pre-existing `LiveMxAccessFactAttribute` is not consumed by `MxAccessLiveComCreationTests` (Worker.Tests-018). | +| 3 | Concurrency & thread safety | Issues found: Worker.Tests-020 (`MxAccessValueCacheTests.TryWaitForUpdate_ReturnsFalseAfterDeadline_WhenNoSetOccurs` asserts wall-clock floor and pump-call lower bound). 
| +| 4 | Error handling & resilience | Issues found: Worker.Tests-021 (`WorkerFrameProtocolErrorCode.EndOfStream` and the writer-side `MessageTooLarge`/`InvalidEnvelope` branches are uncovered). | +| 5 | Security | Redaction coverage is sound; no new issues. | +| 6 | Performance & resource management | No new issues — `MemoryStream`/session-disposal hygiene fixes from the prior pass hold; `WorkerFrameReader` `ArrayPool` rent/return path is now regression-tested. | +| 7 | Design-document adherence | No new issues. | +| 8 | Code organization & conventions | Issues found: Worker.Tests-016 (the now-shared `MxAccessSession` reflection construction in `AlarmCommandExecutorTests` duplicates the testable surface the consolidated TestSupport folder was meant to host). | +| 9 | Testing coverage | Issues found: Worker.Tests-017 (`WorkerCancel` envelope-dispatch path untested), Worker.Tests-022 (`WnWrapAlarmConsumer.PollOnce` transition-delta computation untested at the snapshot-to-transitions level). | +| 10 | Documentation & comments | Issues found: Worker.Tests-023 (`AlarmClientWmProbeTests` and `WnWrapConsumerProbeTests` are unit-test classes carrying 1000+ lines of probe-only code; their `[Fact(Skip=...)]` status is documented but the probe scaffolding is mixed into the same test assembly as regression tests). | + ## Findings ### Worker.Tests-001 @@ -250,3 +267,138 @@ **Recommendation:** Add a `Drain(0)` drain-all test and an empty-queue drain test. **Resolution:** 2026-05-18 — Added three tests to `MxAccessEventQueueTests`. `Drain_WithZeroMaxEvents_DrainsAllEvents` covers the `maxEvents == 0` drain-all branch in `MxAccessEventQueue.Drain` (verified at `src/MxGateway.Worker/MxAccess/MxAccessEventQueue.cs:174`) — three events enqueued, `Drain(0)` returns all three in order and empties the queue. `Drain_WhenQueueIsEmpty_ReturnsEmptyList` covers the `drainCount == 0` early-return branch for both `Drain(0)` and `Drain(5)` on an empty queue. 
`Enqueue_AfterRecordFault_ThrowsInvalidOperationException` covers the backpressure contract gap the finding flagged — after a manual `RecordFault`, `Enqueue` throws `InvalidOperationException` ("outbound event queue is faulted") and the event is not queued. + +### Worker.Tests-016 + +| Field | Value | +|---|---| +| Severity | Medium | +| Category | Code organization & conventions | +| Location | `src/MxGateway.Worker.Tests/MxAccess/AlarmCommandExecutorTests.cs:317-393` | +| Status | Resolved | + +**Description:** `AlarmCommandExecutorTests` reaches into `MxAccessSession` via reflection (`typeof(MxAccessSession).GetConstructor(BindingFlags.NonPublic | BindingFlags.Instance, ..., new[] { typeof(object), typeof(IMxAccessServer), typeof(IMxAccessEventSink), typeof(MxAccessHandleRegistry), typeof(MxAccessValueCache), typeof(int) }, ...)`) and provides an inline `NullMxAccessServer` no-op implementing every `IMxAccessServer` method. The XML doc admits the reflection-based path is fragile (`"MxAccessSession private ctor signature changed; update the test seam."`). The same `NullMxAccessServer` shape is reinventable wherever an executor is exercised in isolation; the consolidated `TestSupport` namespace introduced in Worker.Tests-014 was the natural home for it, but the no-op server lives in a single test file's private nested class instead. A future change to the private ctor signature breaks this one test in a way that requires re-reading the reflection call to diagnose, and a second test that wants the same no-op surface will reflectively duplicate it. + +**Recommendation:** Either (a) add a non-reflective seam — a constructor or static factory marked `internal`-with-`InternalsVisibleTo` that takes `IMxAccessServer` + the existing dependencies, removing the reflection — or (b) move the `NullMxAccessServer` no-op and the reflection helper into `TestSupport/NoopMxAccessSession.cs` so any future test can share it and a ctor change is fixed in one place. 
+ +**Resolution:** 2026-05-20 — Took option (a) plus option (b). Added a non-reflective `internal static MxAccessSession.CreateForTesting(IMxAccessServer, IMxAccessEventSink, MxAccessHandleRegistry?, MxAccessValueCache?, int?)` factory in `src/MxGateway.Worker/MxAccess/MxAccessSession.cs` (lines 61-88), gated through the pre-existing `InternalsVisibleTo` item in `src/MxGateway.Worker/MxGateway.Worker.csproj`. `AlarmCommandExecutorTests.NewExecutor` now calls `MxAccessSession.CreateForTesting(new NoopMxAccessServer(), new NoopEventSink())` — no `GetConstructor`/`Invoke`/`BindingFlags` anywhere in the file. The previously per-file `NullMxAccessServer` no-op was extracted to the shared `src/MxGateway.Worker.Tests/TestSupport/NoopMxAccessServer.cs` (matching the `TestSupport` consolidation introduced in Worker.Tests-014); the XML doc on the new file explicitly cites Worker.Tests-016 for the rationale. A future change to the `MxAccessSession` private ctor signature now updates `CreateForTesting` in one place; the test file does not need to be edited. + +### Worker.Tests-017 + +| Field | Value | +|---|---| +| Severity | Medium | +| Category | Testing coverage | +| Location | `src/MxGateway.Worker.Tests/Ipc/WorkerPipeSessionTests.cs` | +| Status | Resolved | + +**Description:** `WorkerPipeSession.DispatchGatewayEnvelopeAsync` (`src/MxGateway.Worker/Ipc/WorkerPipeSession.cs:365-385`) has three documented branches: `WorkerCommand`, `WorkerShutdown`, and `WorkerCancel`. `WorkerPipeSessionTests` exercises the first two but never sends a `WorkerCancel` envelope, so the `_runtimeSession?.CancelCommand(envelope.CorrelationId)` path and the contract that the session forwards a cancel without faulting the pipe are uncovered. The `default:` arm (`UnexpectedEnvelopeBody` exception) is also uncovered — a gateway sending the wrong body case (e.g. another `GatewayHello` after the handshake) should produce a `ProtocolViolation` fault but no test asserts this. 
+ +**Recommendation:** Add two tests: one that writes a `WorkerCancel` envelope with a known correlation id and asserts `FakeRuntimeSession.CancelCommand` was called with that id (extend the shared `FakeRuntimeSession` to record cancel-correlation-ids); one that writes a post-handshake `GatewayHello` envelope and asserts the session writes a `WorkerFault` with category `ProtocolViolation` and exits the message loop. + +**Resolution:** 2026-05-20 — Added two `[Fact]`s to `WorkerPipeSessionTests` and the supporting state to the shared `FakeRuntimeSession`. (1) `RunAsync_WhenGatewaySendsWorkerCancel_ForwardsCorrelationIdToRuntimeSession` writes a `WorkerCancel` envelope with correlation id `"cancel-correlation-1"` after the handshake, then drives a normal shutdown via `SendShutdownAndWaitAsync` — observing the shutdown ack proves the message loop kept running (no fault, no exit) and `Assert.Contains("cancel-correlation-1", runtime.CancelledCorrelationIds)` proves the cancel reached `IWorkerRuntimeSession.CancelCommand`. The shared `FakeRuntimeSession` was extended with a `CancelledCorrelationIds` snapshot list and an optional `CancelCommandReturnValue` (defaulting to `false`, preserving the prior behaviour). (2) `RunAsync_WhenGatewaySendsUnexpectedEnvelopeBodyAfterHandshake_ThrowsAndExitsMessageLoop` writes a second `GatewayHello` envelope post-handshake — valid envelope, invalid body case for the message-loop state — and asserts `Assert.ThrowsAsync<WorkerFrameProtocolException>(async () => await runTask)` with `ErrorCode == WorkerFrameProtocolErrorCode.UnexpectedEnvelopeBody`. Re-triage: the original recommendation said "the session writes a `WorkerFault` with category `ProtocolViolation`", but the source at `src/MxGateway.Worker/Ipc/WorkerPipeSession.cs:380-384` shows the `default:` arm throws `WorkerFrameProtocolException`; `RunMessageLoopAsync` has no fault-writing catch (only `CompleteStartupHandshakeAsync` writes faults during the handshake). 
The test XML doc records this — the contract pinned is the exception type/error-code and the message-loop exit, not a fault frame. + +### Worker.Tests-018 + +| Field | Value | +|---|---| +| Severity | Medium | +| Category | Correctness & logic bugs | +| Location | `src/MxGateway.Worker.Tests/MxAccess/MxAccessLiveComCreationTests.cs:18-31, 35-73, 75-145, 148-220, 222-342` | +| Status | Resolved | + +**Description:** Every `[Fact]` in `MxAccessLiveComCreationTests` gates on `RunLiveMxAccessTests()` and `return`s silently when the opt-in env var is not set. xUnit reports a `Fact` that returns normally as **passed**, so a CI run without `MXGATEWAY_RUN_LIVE_MXACCESS_TESTS=1` shows five green "live MXAccess" tests that did not run a single line of MXAccess code. `docs/GatewayTesting.md` and the `IntegrationTests` project already provide the correct pattern — `LiveMxAccessFactAttribute` (in `src/MxGateway.IntegrationTests/LiveMxAccessFactAttribute.cs`) emits xUnit's native `Skipped` status when the env var is absent — but `MxAccessLiveComCreationTests` does not consume it, so the gate is invisible in test output. The first test (`StartAsync_WhenOptedIn_CreatesInstalledMxAccessComObjectOnSta`) additionally inlines the env-var check (`string.Equals(Environment.GetEnvironmentVariable(...), "1", StringComparison.Ordinal)`) instead of using the local `RunLiveMxAccessTests()` helper, so the convention is inconsistent even within the same file. + +**Recommendation:** Move `LiveMxAccessFactAttribute` into a shared location both projects can reference (e.g. `MxGateway.Contracts.TestSupport` or a new `MxGateway.TestSupport` shared project), and decorate the five `MxAccessLiveComCreationTests` methods with `[LiveMxAccessFact]` instead of `[Fact]`. Drop the inline env-var checks. Skipped runs will then report `Skipped` rather than `Passed`, and CI will distinguish "live MXAccess unavailable" from "live MXAccess opted in, succeeded". 
+ +**Resolution:** 2026-05-20 — Added a self-contained `LiveMxAccessFactAttribute` at `src/MxGateway.Worker.Tests/TestSupport/LiveMxAccessFactAttribute.cs` (namespace `MxGateway.Worker.Tests.TestSupport`) that mirrors the `MxGateway.IntegrationTests` attribute: when `MXGATEWAY_RUN_LIVE_MXACCESS_TESTS` is not `1`, the attribute sets `Skip` so xUnit emits a native `Skipped` result rather than a misleading `Passed`. All five `MxAccessLiveComCreationTests` methods now use `[LiveMxAccessFact]`; the inline env-var check at the top of `StartAsync_WhenOptedIn_CreatesInstalledMxAccessComObjectOnSta` and the per-method `if (!RunLiveMxAccessTests()) return;` silent-returns were deleted. The worker tests target net48/x86 and the integration tests target net10.0, so introducing a cross-project shared assembly was not practical; the Worker.Tests attribute is a near-duplicate of the IntegrationTests attribute and the XML doc on the new file calls this out so the next reviewer understands why two copies exist. xUnit output now reports the five live tests as `[SKIP]` when the env var is absent — `dotnet test ...` shows `Skipped: 9, Total: 274`, with the five `MxAccessLiveComCreationTests` correctly counted as skipped rather than passed. 
+ +### Worker.Tests-019 + +| Field | Value | +|---|---| +| Severity | Low | +| Category | mxaccessgw conventions | +| Location | `src/MxGateway.Worker.Tests/AlarmsLiveSmokeTests.cs:45`, `src/MxGateway.Worker.Tests/AlarmClientWmProbeTests.cs:143`, `src/MxGateway.Worker.Tests/WnWrapConsumerProbeTests.cs:55` | +| Status | Resolved | + +**Description:** Worker.Tests-009 renamed every `snake_case` alarm-test method to the project's `Method_Scenario_Expectation` convention, but the rename missed the dev-rig probe and live-smoke `[Fact]`s in the `MxGateway.Worker.Tests` root (not under `MxAccess/`): `AlarmsLiveSmokeTests.Alarms_full_pipeline_round_trip`, `AlarmClientWmProbeTests.Probe_AlarmClient_for_alarm_messages` (and its helpers), and `WnWrapConsumerProbeTests.ProbeWnWrapConsumer`. These are `[Fact(Skip=...)]` so they never execute in normal CI, but they still drift from `docs/style-guides/CSharpStyleGuide.md` and contradict the resolution claim in Worker.Tests-009 that "every `[Fact]`/`[Theory]` method in the five alarm test files" was renamed. + +**Recommendation:** Rename `Alarms_full_pipeline_round_trip` → `Alarms_FullPipelineRoundTrip_RaisesAndAcknowledges` (or similar `Method_Scenario_Expectation` form) and apply the same convention to the two probe methods. xUnit discovers by attribute, not name, so renames are behaviour-neutral. + +**Resolution:** 2026-05-20 — Renamed the three `snake_case` probe/smoke `[Fact]` methods to the project's `Method_Scenario_Expectation` PascalCase convention: `Alarms_full_pipeline_round_trip` → `Alarms_FullPipelineRoundTrip_RaisesAndAcknowledges` (in `Probes/AlarmsLiveSmokeTests.cs`), `ProbeAlarmClientWmMessages` → `ProbeAlarmClient_OnDevRig_LogsAlarmWindowMessages` (in `Probes/AlarmClientWmProbeTests.cs`), and `ProbeWnWrapConsumer` → `ProbeWnWrapConsumer_OnDevRig_LogsXmlAlarmStream` (in `Probes/WnWrapConsumerProbeTests.cs`). 
The three files have moved to `Probes/` as part of Worker.Tests-023; the location columns above predate that move. xUnit discovers tests by attribute, so the renames are behaviour-neutral and the `Skip` strings still apply unchanged. + +### Worker.Tests-020 + +| Field | Value | +|---|---| +| Severity | Low | +| Category | Concurrency & thread safety | +| Location | `src/MxGateway.Worker.Tests/MxAccess/MxAccessValueCacheTests.cs:88-108` | +| Status | Resolved | + +**Description:** `TryWaitForUpdate_ReturnsFalseAfterDeadline_WhenNoSetOccurs` asserts both a lower wall-clock bound (`stopwatch.ElapsedMilliseconds >= 60`, deadline was 80ms) and `pumpCalls > 1`. The 60ms floor is the same class of timing race Worker.Tests-003/004/013 corrected elsewhere: on a loaded CI agent a `Task.Run` scheduling delay can push the wait's start past the deadline so the loop runs zero or one iteration, the wait returns slightly *early* of the 60ms floor, and the test fails through no fault of the production code. The `pumpCalls > 1` check additionally races against the same scheduler — if the agent stalls the wait thread, `pumpStep` might fire only once before the deadline. The test purpose (verifying the timeout is honoured and pump-step is invoked) is sound but the assertions are wall-clock floors rather than deterministic checks. + +**Recommendation:** Drop the elapsed-time floor and the `pumpCalls > 1` assertion; verify only that `result` is false, `value` is default, and `pumpCalls >= 1` (the pump must fire at least once, but not "more than once"). The fact that `TryWaitForUpdate` returned false after the deadline is the contract the test exists to pin; the timing strictness is incidental. + +**Resolution:** 2026-05-20 — Eliminated the wall-clock dependency entirely (the equivalent of a manual time source for the `DateTime.UtcNow`-based deadline). 
The test now passes `DateTime.UtcNow.AddMilliseconds(-1)` — a deadline already in the past — so `TryWaitForUpdate`'s loop pumps once, immediately observes the elapsed deadline, and returns false with zero `Thread.Sleep`. The `Stopwatch`/`stopwatch.ElapsedMilliseconds >= 60` floor and the `pumpCalls > 1` strict-inequality assertions are gone. With an already-expired deadline the contract is deterministic: exactly one pump call (the loop must pump before checking the deadline so MXAccess messages can dispatch on the calling thread even when the deadline has just expired), `result == false`, `value` is default. Matches the pattern Worker.Tests-003/004/013 used — drop wall-clock floor checks in favour of a deterministic signal. + +### Worker.Tests-021 + +| Field | Value | +|---|---| +| Severity | Low | +| Category | Error handling & resilience | +| Location | `src/MxGateway.Worker.Tests/Ipc/WorkerFrameProtocolTests.cs` | +| Status | Resolved | + +**Description:** `WorkerFrameProtocolTests` covers `MalformedLength`, `MessageTooLarge` (read-side, added in Worker.Tests-012), `ProtocolVersionMismatch`, `SessionMismatch`, and `InvalidEnvelope` on `WorkerFrameReader`. Three documented protocol-error branches remain uncovered: (1) `WorkerFrameProtocolErrorCode.EndOfStream` from `WorkerFrameReader.ReadExactlyOrThrowAsync` (`src/MxGateway.Worker/Ipc/WorkerFrameReader.cs:106`) when the stream closes mid-frame — important because the gateway closing its end of the pipe during a partial read is the most common production transport failure; (2) `WorkerFrameWriter` rejecting an envelope whose `CalculateSize()` returns 0 with `WorkerFrameProtocolErrorCode.InvalidEnvelope` (`WorkerFrameWriter.cs:46`); (3) `WorkerFrameWriter` rejecting an envelope larger than `MaxMessageBytes` with `WorkerFrameProtocolErrorCode.MessageTooLarge` (`WorkerFrameWriter.cs:53`). 
The writer-side checks defend against a session that constructs a too-large envelope before sending it down the pipe — completely separate from the reader-side bounds the existing tests pin. + +**Recommendation:** Add three tests: (a) `ReadAsync_WhenStreamEndsMidFrame_ThrowsEndOfStream` — feed a 4-byte length prefix declaring 100 bytes followed by only 50 bytes, assert `EndOfStream`; (b) `WriteAsync_WithEnvelopeAboveConfiguredMaximum_ThrowsMessageTooLarge` — construct `WorkerFrameProtocolOptions` with a small `MaxMessageBytes` and an envelope whose serialised size exceeds it, assert `MessageTooLarge`; (c) since `WorkerEnvelope.CalculateSize()` never returns 0 for a valid envelope (the protocol version field alone serializes), the `InvalidEnvelope` writer branch is genuinely unreachable in normal operation — either document this as defensive code that is intentionally untestable, or drop the check. + +**Resolution:** 2026-05-20 — Added three `[Fact]`s to `WorkerFrameProtocolTests.cs` for the three uncovered protocol-error branches. (a) `ReadAsync_WhenStreamEndsMidFrame_ThrowsEndOfStream` builds a 4-byte length prefix declaring 100 bytes followed by only 50 bytes, drives `WorkerFrameReader.ReadAsync` against it, and asserts `WorkerFrameProtocolErrorCode.EndOfStream` — pins the gateway-closes-mid-read transport failure. (b) `WriteAsync_WithEnvelopeAboveConfiguredMaximum_ThrowsMessageTooLarge` constructs `WorkerFrameProtocolOptions` with `MaxMessageBytes=64`, builds a `GatewayHello` envelope whose `GatewayVersion` is padded to 1024 bytes, asserts `WorkerFrameProtocolErrorCode.MessageTooLarge` and that the stream stayed empty (zero bytes written). 
(c) `WriteAsync_WithEmptyEnvelope_ThrowsInvalidEnvelopeFromValidator` exercises the body-less path — `WorkerEnvelopeValidator.Validate` runs first and rejects an envelope whose `BodyCase` is `None` with `InvalidEnvelope`, so the `CalculateSize()==0` branch is intercepted before it fires; the XML doc explicitly documents that the defensive zero-length branch is unreachable through public API but is left in place as a one-comparison safety net against future serialisation regressions. Net change: three new tests, all green; the reader-side `EndOfStream` plus writer-side `MessageTooLarge`/`InvalidEnvelope` rejections are now regression-protected. + +### Worker.Tests-022 + +| Field | Value | +|---|---| +| Severity | Low | +| Category | Testing coverage | +| Location | `src/MxGateway.Worker.Tests/MxAccess/WnWrapAlarmConsumerXmlTests.cs` | +| Status | Resolved | + +**Description:** `WnWrapAlarmConsumerXmlTests` covers `ParseSnapshotXml` and `TryParseHexGuid` directly — the pure-helper layer — and pins the no-internal-timer Worker-001 invariant via reflection. The `PollOnce` transition-delta logic (`src/MxGateway.Worker/MxAccess/WnWrapAlarmConsumer.cs:289-337`) is what actually turns "snapshot N to snapshot N+1" into `MxAlarmTransitionEvent` instances, and is the only place the consumer makes state-management decisions: skip-when-state-unchanged, fire-with-previous-state-Unspecified for first sighting, and (implicitly) drop entries that vanished from the new snapshot. None of these branches are exercised — the live-smoke `AlarmsLiveSmokeTests` covers the end-to-end pipeline but is `[Fact(Skip=...)]` against the dev rig, so there is no in-CI coverage of "snapshot delta computation produces the right transitions" at all. A regression that, for example, emits a transition every poll regardless of state-change would slip through. 
+ +**Recommendation:** Refactor `PollOnce`'s snapshot-diff loop into a pure `internal static IReadOnlyList ComputeTransitions(Dictionary previous, Dictionary next)` and add direct unit tests: (a) new entry produces `PreviousState=Unspecified`; (b) state-unchanged produces no transition; (c) state-changed produces a transition with the prior state; (d) entry vanished from `next` produces no transition (an alarm cleared from the active set; the snapshot just no longer mentions it). `MxAccessStaSession` already drives the COM-side polling, so the diff is genuinely independent of any COM dependency. + +**Resolution:** 2026-05-20 — Extracted the snapshot-diff loop from `WnWrapAlarmConsumer.PollOnce` into a pure `internal static IReadOnlyList ComputeTransitions(Dictionary previous, Dictionary next)` in `src/MxGateway.Worker/MxAccess/WnWrapAlarmConsumer.cs`. `PollOnce` now calls `ComputeTransitions` under the same `syncRoot` lock; the diff rules are unchanged. Added five `[Fact]`s in `WnWrapAlarmConsumerXmlTests.cs` exercising all four branches plus a multi-alarm fan-out case: `ComputeTransitions_WhenAlarmIsNewInNextSnapshot_EmitsTransitionWithUnspecifiedPreviousState`, `ComputeTransitions_WhenAlarmStateUnchanged_EmitsNoTransition`, `ComputeTransitions_WhenAlarmStateChanged_EmitsTransitionWithPriorState`, `ComputeTransitions_WhenAlarmDroppedFromActiveSet_EmitsNoTransition`, and `ComputeTransitions_WithMixedDelta_EmitsOnlyNewAndChangedTransitions`. Each test drives the function with `Dictionary` snapshots built from a `NewRecord` helper — no COM, no STA. A regression that emits a transition every poll regardless of state, swaps the previous/next ordering, or treats a dropped alarm as a transition now fails in-CI. 
+ +### Worker.Tests-023 + +| Field | Value | +|---|---| +| Severity | Low | +| Category | Documentation & comments | +| Location | `src/MxGateway.Worker.Tests/AlarmClientWmProbeTests.cs` (779 lines), `src/MxGateway.Worker.Tests/WnWrapConsumerProbeTests.cs` (287 lines), `src/MxGateway.Worker.Tests/AlarmsLiveSmokeTests.cs` (270 lines) | +| Status | Resolved | + +**Description:** Three large dev-rig "probe" files are mixed into the worker unit-test project but are not unit tests in the usual sense: each is a `[Fact(Skip="Runtime probe — flip Skip=null on the dev rig (AVEVA installed)...")]` driver that runs hundreds of seconds, opens real Galaxy subscriptions, posts Windows messages on STA threads, captures alarm payloads to `ITestOutputHelper`, and exists to document AVEVA COM behaviour rather than gate it. `AlarmClientWmProbeTests` alone is 779 lines — larger than every genuine unit-test file in the project. At build time these files contribute 1300+ lines of probe scaffolding that anyone inspecting the project to answer "what is `Worker.Tests` for?" has to wade through. The Skip-attribute strings document why they exist, but a colocated `docs/AlarmProbes.md` (or moving the probes to a separate `MxGateway.Worker.Probes` non-test assembly) would make the distinction explicit and stop the probe files from inflating `Worker.Tests`' build/test surface. + +**Recommendation:** Either (a) carve the three probe files out into `src/MxGateway.Worker.Probes/` (a separate project the dev-rig user opts into; the assembly references stay the same), or (b) move them into a `Probes/` subfolder inside `MxGateway.Worker.Tests` and add a one-paragraph header in `docs/GatewayTesting.md` describing the probe surface. Option (a) is cleaner because the live-smoke `AlarmsLiveSmokeTests` already references `WnWrapAlarmConsumer` directly and would naturally cohabit with the other AVEVA-COM probes. 
+ +**Resolution:** 2026-05-20 — Took option (b): moved `AlarmClientWmProbeTests.cs`, `WnWrapConsumerProbeTests.cs`, and `AlarmsLiveSmokeTests.cs` from `src/MxGateway.Worker.Tests/` into a new `src/MxGateway.Worker.Tests/Probes/` subfolder. The files keep their existing namespace (`MxGateway.Worker.Tests`) and their `[Fact(Skip=...)]` gating; the SDK-style project picks them up under the new path without a `.csproj` change. Option (b) was chosen over (a) because the probes still rely on the same test-project package references (`xunit`, `Microsoft.NET.Test.Sdk`, `Xunit.Abstractions`) plus the `Interop.WNWRAPCONSUMERLib`/`ArchestrA.MxAccess`/`aaAlarmManagedClient`/`IAlarmMgrDataProvider` references already declared in `MxGateway.Worker.Tests.csproj`; a separate `MxGateway.Worker.Probes` project would have to duplicate every one of these. The probes remain runnable on the dev rig by flipping `Skip=null` exactly as before. The `Worker.Tests` root listing now contains only genuine unit-test/regression files; probe scaffolding is visibly partitioned by directory. + +### Worker.Tests-024 + +| Field | Value | +|---|---| +| Severity | Low | +| Category | Correctness & logic bugs | +| Location | `src/MxGateway.Worker.Tests/MxAccess/AlarmCommandHandlerTests.cs:42-54` | +| Status | Resolved | + +**Description:** `Subscribe_WhenUnderlyingSubscribeThrows_DisposesConsumer` asserts that an exception during `IMxAccessAlarmConsumer.Subscribe` triggers consumer disposal. The fake throws `new InvalidOperationException("simulated wnwrap subscribe failure")` and the test asserts `Assert.Throws<InvalidOperationException>(() => handler.Subscribe(...))`. But `AlarmCommandHandler.Subscribe` (`src/MxGateway.Worker/MxAccess/AlarmCommandHandler.cs:65-93`) wraps the underlying call and re-throws — so an `InvalidOperationException` from any code path inside `Subscribe` (e.g. its own "already subscribed" guard at line 73) would also satisfy the assertion. 
The test does not pin that the *thrown* exception is the one from the fake; if `AlarmCommandHandler` regressed to throw before reaching the consumer, the test would still pass with `consumer.Disposed == false` ... except the test additionally asserts `consumer.Disposed` is true, which would fail. So the test does pin the disposal behaviour. The genuine weakness is that the assertion doesn't pin the exception message either ("simulated wnwrap subscribe failure"), so an unexpected `InvalidOperationException` from a different branch with a misleading message would pass without anyone noticing the handler swallowed the real failure cause. + +**Recommendation:** Strengthen to `InvalidOperationException exception = Assert.Throws<InvalidOperationException>(...); Assert.Contains("simulated wnwrap subscribe failure", exception.Message)` — pin both the type and the originating message so a regression that throws a *different* `InvalidOperationException` from inside `AlarmCommandHandler` fails the test. + +**Resolution:** 2026-05-20 — `Subscribe_WhenUnderlyingSubscribeThrows_DisposesConsumer` now captures the thrown exception and asserts `Assert.Contains("simulated wnwrap subscribe failure", exception.Message)` against the fake's exact thrown message. A regression that throws a *different* `InvalidOperationException` from inside `AlarmCommandHandler` (for example its own "already subscribed" guard at line 73 of `AlarmCommandHandler.cs`) now fails the message-contains assertion — the original test's type-only `Assert.Throws` would have passed silently while hiding the swallowed failure cause. The disposal assertion (`consumer.Disposed == true`) is unchanged; the test now pins both the disposal contract and the origin of the propagated exception. XML doc on the test method documents the regression scenario. 
diff --git a/code-reviews/Worker/findings.md b/code-reviews/Worker/findings.md index 5f8077e..25209ed 100644 --- a/code-reviews/Worker/findings.md +++ b/code-reviews/Worker/findings.md @@ -4,25 +4,27 @@ |---|---| | Module | `src/MxGateway.Worker` | | Reviewer | Claude Code | -| Review date | 2026-05-18 | -| Commit reviewed | `6c64030` | +| Review date | 2026-05-20 | +| Commit reviewed | `1cd51bb` | | Status | Reviewed | | Open findings | 0 | ## Checklist coverage +This row reflects the 2026-05-20 re-review at commit `1cd51bb`. Worker-001..015 are all closed; the row only summarises new findings filed against this branch. + | # | Category | Result | |---|---|---| -| 1 | Correctness & logic bugs | Issues found: heartbeat loop sleeps before first beat (Worker-002), `ProcessCommandAsync` state race drops replies (Worker-003), watchdog/heartbeat state inconsistency (Worker-004), double-dispose path (Worker-006), plus Worker-010/011/015. | -| 2 | mxaccessgw conventions | Issue found: Worker-007 (reflection-based COM invocation bypasses the typed interface contract). | -| 3 | Concurrency & thread safety | Issues found: Worker-001 (`WnWrapAlarmConsumer` timer fires COM off the STA), Worker-008 (consumer factory STA-affinity not enforced). | -| 4 | Error handling & resilience | Issue found: Worker-005 (`OnPoll` silently swallows all poll failures). | -| 5 | Security | No secret logging (redaction applied); inbound frame validation reasonable. No issues found. | -| 6 | Performance & resource management | Issue found: Worker-009 (per-frame `byte[]` allocations on the hot event path). COM release is correct. | -| 7 | Design-document adherence | Code matches `WorkerSta.md`/`WorkerFrameProtocol.md`; stale alarm-path docs (Worker-012). | -| 8 | Code organization & conventions | Issue found: Worker-014 (`AlarmCommandHandler.cs` declares two public types in one file). 
| -| 9 | Testing coverage | Issue found: Worker-013 (`StaMessagePump` has no direct tests; poll-loop lifecycle untested). | -| 10 | Documentation & comments | Issue found: Worker-012 (stale "future PR / A.3" comments now describe shipped code). | +| 1 | Correctness & logic bugs | Issues found: Worker-018 (`SetXmlAlarmQuery` return code ignored), Worker-019 (`subscriptionExpression` is write-only dead state), Worker-020 (dead `ExecutingCommand` arm in `ProcessCommandAsync` state check), Worker-021 (`InitializeMxAccessAsync` can overwrite an already-set `_runtimeSession`). | +| 2 | mxaccessgw conventions | Issue found: Worker-022 (`MxAlarmSnapshot.cs` declares three public types in one file). | +| 3 | Concurrency & thread safety | Issue found: Worker-016 (`RunAlarmPollLoopAsync` swallows the `EnsureOnAlarmConsumerThread` assertion as part of its generic `InvalidOperationException` catch, defeating Worker-008's invariant). | +| 4 | Error handling & resilience | Issue found: Worker-017 (long-running commands like `ReadBulk` cannot mark STA activity, so the heartbeat watchdog can fire `StaHung` while a command is legitimately executing — `CurrentCommandCorrelationId` is non-empty in the heartbeat but ignored by the watchdog). | +| 5 | Security | No secret logging (redaction applied); inbound frame validation reasonable; secured-write user IDs do not leak through reply diagnostics. No new issues found. | +| 6 | Performance & resource management | Frame I/O uses pooled buffers (Worker-009 resolved); STA ownership and COM final-release are correct. No new issues found. | +| 7 | Design-document adherence | Code matches `gateway.md` / `MxAccessWorkerInstanceDesign.md` / `WorkerFrameProtocol.md`. No new design drift. | +| 8 | Code organization & conventions | Issue found: Worker-022 (see row 2). 
| +| 9 | Testing coverage | `RunAlarmPollLoop_WhenPollOnceThrows_RecordsFaultOnEventQueue` exists but uses a `COMException`; the `InvalidOperationException` arm raised by Worker-016 is not exercised. No standalone finding (subsumed by Worker-016's recommendation to add a regression test). | +| 10 | Documentation & comments | `RunAlarmPollLoopAsync`'s "STA runtime shutting down — stop the loop gracefully" comment is misleading once Worker-016 is considered (the catch also swallows STA-affinity violations). Noted in Worker-016. | ## Findings @@ -258,3 +260,110 @@ **Recommendation:** Add a brief comment in `EnqueueEvent` clarifying that an overflow exception is expected and already self-records its fault, so the catch is intentionally a near no-op. **Resolution:** 2026-05-18 — Added a comment in `MxAccessBaseEventSink.EnqueueEvent`'s catch block (per the finding's recommendation) explaining that two distinct fail-fast failures land there: a conversion failure from `createEvent()` (recorded here as an `MxaccessEventConversionFailed` fault) and an `MxAccessEventQueueOverflowException` from `Enqueue` at capacity, which — per the fail-fast backpressure design in `docs/DesignDecisions.md` — drops the event and has *already* self-recorded a `QueueOverflow` fault inside `Enqueue`. Because `MxAccessEventQueue.RecordFault` keeps only the first fault, the catch's `RecordFault` call is then a deliberate near no-op rather than a second, conflicting fault. Pure comment change as recommended — no behavior altered. `docs/DesignDecisions.md` already documents the fail-fast event backpressure rule, so no doc change was required. 
+ +### Worker-016 + +| Field | Value | +|---|---| +| Severity | Medium | +| Category | Concurrency & thread safety | +| Location | `src/MxGateway.Worker/MxAccess/MxAccessStaSession.cs:261-265` | +| Status | Resolved | + +**Description:** `RunAlarmPollLoopAsync` catches `InvalidOperationException` and silently returns with the rationale "STA runtime shutting down — stop the loop gracefully". The same catch arm, however, also swallows the `InvalidOperationException` thrown by `EnsureOnAlarmConsumerThread()` / `AssertOnAlarmConsumerThread()` — the STA-affinity guard added under Worker-008. If the alarm poll ever ran on the wrong thread (a regression of the STA-affinity invariant), the assertion would fire, the loop would silently stop, no fault would be recorded, and the only observable symptom would be alarms no longer flowing. The assertion exists to catch a programming error early; this catch defeats it. + +**Recommendation:** Either tighten the `InvalidOperationException` catch so it only swallows the STA-runtime-shutting-down sentinel (e.g. match on the exception message produced by `StaRuntime.InvokeAsync`, or have the STA runtime throw a dedicated exception type for shutdown), or rethrow / record-a-fault for `InvalidOperationException`s whose message does not match the shutdown sentinel. Add a regression test that drives `RunAlarmPollLoopAsync` with a handler that throws `InvalidOperationException` from `PollOnce` and asserts the loop records a fault rather than silently exiting. + +**Resolution:** 2026-05-20 — Introduced a dedicated `StaRuntimeShutdownException` (`src/MxGateway.Worker/Sta/StaRuntimeShutdownException.cs`) that `StaRuntime.InvokeAsync` and the queue-enqueue path now throw in place of a generic `InvalidOperationException` when `shutdownRequested` is set. 
`RunAlarmPollLoopAsync` in `MxAccessStaSession.cs:258-291` now catches `StaRuntimeShutdownException` (graceful stop, returns silently) separately from the generic `Exception` arm, which records the fault on the event queue. An STA-affinity `InvalidOperationException` from `EnsureOnAlarmConsumerThread` therefore now falls through to the fault path and becomes observable on the IPC fault path instead of silently terminating alarm delivery. Verified: `dotnet build src/MxGateway.Worker/MxGateway.Worker.csproj -p:Platform=x86` clean (0 warnings). Regression coverage in `MxAccessStaSessionTests.cs` exercises both the graceful-shutdown and the affinity-violation paths. + +### Worker-017 + +| Field | Value | +|---|---| +| Severity | Medium | +| Category | Error handling & resilience | +| Location | `src/MxGateway.Worker/Sta/StaRuntime.cs:280-288`, `src/MxGateway.Worker/Ipc/WorkerPipeSession.cs:602-631` | +| Status | Resolved | + +**Description:** `StaRuntime.ProcessQueuedCommands` calls `MarkActivity()` only before and after `workItem.Execute()`. For a command that synchronously holds the STA for longer than `WorkerPipeSessionOptions.HeartbeatGrace` (default 15s) — e.g. `ReadBulk` with many uncached tags, each waiting up to its per-tag `TimeoutMs` (default 1000 ms) — no `MarkActivity()` runs during the wait, `LastActivityUtc` stays frozen, and `ReportWatchdogFaultIfNeededAsync` fires an `StaHung` fault. The heartbeat itself reports `WorkerState.ExecutingCommand` with the live `CurrentCommandCorrelationId`, so the worker actually knows it is executing a command rather than hung — but the watchdog branch only checks `staleFor > HeartbeatGrace` and ignores the in-flight command. A legitimate slow bulk read then self-faults and tears the session down. 
+ +**Recommendation:** Either (a) extend `WorkerPipeSession.ReportWatchdogFaultIfNeededAsync` to skip the `StaHung` fault when the snapshot's `CurrentCommandCorrelationId` is non-empty (the worker is executing a command, not hung), or (b) thread a `MarkActivity`-style callback into the bulk-read `pumpStep` so long synchronous STA operations periodically refresh `LastActivityUtc`. Option (a) is the smaller surface — the heartbeat already carries enough signal for the gateway to decide the command is just slow. Either way, the design intent (watchdog catches a hung STA, not a slow command) should be documented on `ReportWatchdogFaultIfNeededAsync`. + +**Resolution:** 2026-05-20 — Applied option (a): `WorkerPipeSession.ReportWatchdogFaultIfNeededAsync` (`src/MxGateway.Worker/Ipc/WorkerPipeSession.cs:602-645`) now returns early when `snapshot.CurrentCommandCorrelationId` is non-empty — the STA is busy executing a known command, not hung, and the heartbeat already surfaces the correlation id so the gateway can decide whether the command is too slow against its own per-command timeout. The next `MarkActivity()` after the command returns lifts `LastActivityUtc` and the watchdog resumes normal operation. A new XML doc comment on the method records the design intent (watchdog catches a hung STA, not a slow command). Verified: `dotnet build src/MxGateway.Worker/MxGateway.Worker.csproj -p:Platform=x86` clean. Regression coverage added in `WorkerPipeSessionTests.cs`. + +### Worker-018 + +| Field | Value | +|---|---| +| Severity | Low | +| Category | Error handling & resilience | +| Location | `src/MxGateway.Worker/MxAccess/WnWrapAlarmConsumer.cs:160-161` | +| Status | Resolved | + +**Description:** `Subscribe` calls `com.SetXmlAlarmQuery(xmlQuery)` and discards the return value. The block-level comment immediately above states that this call is empirically required for subsequent `GetXmlCurrentAlarms2` to succeed — i.e. it is on the critical path of the alarm subscription. 
Every other AVEVA-COM call in the same method (`InitializeConsumer`, `RegisterConsumer`, `Subscribe`, `AlarmAckByName`, etc.) is gated on a `!= 0` return-code check and throws `InvalidOperationException` on failure. If `SetXmlAlarmQuery` ever returns non-zero (or otherwise fails non-fatally), the consumer reaches `subscribed = true` with the wnwrap state misconfigured, and the next `PollOnce` fails with the same `E_FAIL` the comment warns about — without any indication where the regression lies. + +**Recommendation:** Either (a) check the `SetXmlAlarmQuery` return code and treat a non-zero value as a subscription failure (matching the other call-gates in the method) or (b) document explicitly in the comment that `SetXmlAlarmQuery`'s return code is meaningless on this AVEVA build (referencing `docs/AlarmClientDiscovery.md` if so). At minimum capture the return value in a local for diagnostic purposes so a future failure is easier to triage. + +**Re-triage:** The finding's framing assumed an integer return code; inspection of the `Interop.WNWRAPCONSUMERLib` assembly confirmed `SetXmlAlarmQuery` is declared `Void SetXmlAlarmQuery(System.String)` on all three flavors (`IwwAlarmConsumer`, `IwwAlarmConsumer2`, `wwAlarmConsumerClass`). There is no integer return code to gate on. A genuine failure can only surface as a `COMException` mapped from the underlying HRESULT, so the fix wraps the call to translate that into the same `InvalidOperationException` failure-shape used by every other call-gate in `Subscribe`, with the HRESULT included in the diagnostic message. + +**Resolution:** 2026-05-20 — `WnWrapAlarmConsumer.Subscribe` now wraps the `com.SetXmlAlarmQuery(xmlQuery)` call in a `try`/`catch (COMException ex)` that throws an `InvalidOperationException` carrying the HRESULT (`$"wwAlarmConsumer.SetXmlAlarmQuery failed with HRESULT 0x{ex.HResult:X8}; subsequent GetXmlCurrentAlarms2 polls would return E_FAIL."`) with the original `COMException` as `InnerException`. 
A previously silent failure that left `subscribed = true` with misconfigured wnwrap state — and produced an opaque `E_FAIL` from the next `PollOnce` with no indication where the regression lay — now surfaces as a subscription failure at the `Subscribe` call-site, matching the existing v1-lifecycle failure shape. The block comment was extended to record that the interop signature returns `void` (no integer return code to gate on like the sibling v1 calls) so a future maintainer doesn't try to add one. No new regression test was added in this agent because Worker.Tests is being modified by a concurrent agent; the change is structurally analogous to the existing `Initialize/Register/Subscribe` call-gates and is exercised end-to-end by the live alarm smoke path. + +### Worker-019 + +| Field | Value | +|---|---| +| Severity | Low | +| Category | Code organization & conventions | +| Location | `src/MxGateway.Worker/MxAccess/WnWrapAlarmConsumer.cs:59`, `:188` | +| Status | Resolved | + +**Description:** `WnWrapAlarmConsumer` declares `private string subscriptionExpression = string.Empty;` and assigns it once inside `Subscribe` (line 188), but never reads it. It is dead state — neither `PollOnce`, `AcknowledgeByName`, `AcknowledgeByGuid`, `SnapshotActiveAlarms`, nor `Dispose` consults it. Either it is genuinely unused (delete it) or it was intended to support a not-yet-implemented feature (e.g. re-subscribing after a transient failure, or echoing the subscription back through `IsSubscribed`/`SubscriptionExpression`), in which case the intent should be wired up or documented. + +**Recommendation:** Delete the field. This is the safest option: the `TreatWarningsAsErrors=true` build will continue to permit the field only as long as it is still assigned somewhere. Alternatively, promote it to a read-only exposed property `SubscriptionExpression` so smoke tests can assert which subscription is active without touching wnwrap state. If a future use is expected, file a follow-up issue. 
+ +**Resolution:** 2026-05-20 — Deleted the dead `private string subscriptionExpression = string.Empty;` field declaration and its sole assignment inside `Subscribe` (`subscriptionExpression = subscription;`). The field had no readers and was pure write-only state. Pure cleanup — no behaviour change, no public API surface affected. The worker build remains clean with zero warnings under `TreatWarningsAsErrors=true`. + +### Worker-020 + +| Field | Value | +|---|---| +| Severity | Low | +| Category | Correctness & logic bugs | +| Location | `src/MxGateway.Worker/Ipc/WorkerPipeSession.cs:405`, `:423` | +| Status | Resolved | + +**Description:** `ProcessCommandAsync` decides whether to write a command reply with `if (_state is not WorkerState.Ready and not WorkerState.ExecutingCommand)`. The `ExecutingCommand` arm is dead: `_state` is only ever assigned `Starting`, `Handshaking`, `InitializingSta`, `Ready`, `ShuttingDown`, `Faulted`, or `Stopped`. The string `WorkerState.ExecutingCommand` appears nowhere as a target of `_state = ...`. The `WorkerState.ExecutingCommand` value is synthesized only in `CreateHeartbeat` (line 811) when a command is in flight, so it never leaks back into `_state`. The check is effectively `_state is not WorkerState.Ready`. The intent is unclear: either the check should also accept the live "is executing" condition (which today is implicit via `_state == Ready` plus a non-empty `CurrentCommandCorrelationId` from the dispatcher), or the dead arm should be removed for clarity. + +**Recommendation:** Simplify the check to `if (_state != WorkerState.Ready)` to match the actual state machine, and update the dropped-reply log fields accordingly. Alternatively, introduce an explicit `WorkerState.ExecutingCommand` transition (set when a command starts dispatching, restored to `Ready` on completion) so the check matches its name. The simpler fix is the former. 
+ +**Resolution:** 2026-05-20 — Both occurrences of the `_state is not WorkerState.Ready and not WorkerState.ExecutingCommand` check in `ProcessCommandAsync` (the post-`DispatchAsync` success path and the exception path) were simplified to `_state != WorkerState.Ready`. The `ExecutingCommand` arm was dead — `_state` is never written that value; only `CreateHeartbeat` synthesizes it on the wire when `CurrentCommandCorrelationId` is non-empty. A comment was added at the success-path site documenting the assignment-set of `_state` and why `Ready` is the only command-serving state. No behavioural change — `_state` could never be `ExecutingCommand` at that read, so the simplification preserves the same effective decision while removing the misleading dead arm. No new regression test was added in this agent because Worker.Tests is being modified by a concurrent agent. + +### Worker-021 + +| Field | Value | +|---|---| +| Severity | Low | +| Category | Correctness & logic bugs | +| Location | `src/MxGateway.Worker/Ipc/WorkerPipeSession.cs:111-118`, `:790-805`, `:136-139` | +| Status | Resolved | + +**Description:** `RunAsync` constructs the runtime session through `_runtimeSession = _runtimeSessionFactory()` (line 111) and immediately calls `CompleteStartupHandshakeAsync(token => _runtimeSession.StartAsync(...))`. That path is fine. However the public parameterless `CompleteStartupHandshakeAsync()` (line 136) routes through `InitializeMxAccessAsync` (line 790), which unconditionally reassigns `_runtimeSession = new MxAccessStaSession(eq => new AlarmCommandHandler(eq));` — overwriting whatever the factory put there. If anything ever calls `CompleteStartupHandshakeAsync()` after `RunAsync` has already begun, the factory-supplied session is leaked (no `Dispose` is called on the old instance) and a fresh hard-coded `MxAccessStaSession` is started instead. 
Today no production code path triggers this, but the API surface is public and dangerous — a test or a refactor could trip it. + +**Recommendation:** Either (a) make `InitializeMxAccessAsync` a no-op if `_runtimeSession` is already non-null (treat the existing instance as authoritative and only call its `StartAsync`), or (b) make the parameterless `CompleteStartupHandshakeAsync()` and `InitializeMxAccessAsync` `internal` / remove them, since the production path is the factory-driven one in `RunAsync`. Option (b) is cleaner: the parameterless overload is dead in production. + +**Resolution:** 2026-05-20 — Applied option (a): `InitializeMxAccessAsync` now uses `_runtimeSession ??= new MxAccessStaSession(eq => new AlarmCommandHandler(eq));`, so the existing factory-supplied instance from `RunAsync` is treated as authoritative and only the fall-back direct-invocation path (where the parameterless `CompleteStartupHandshakeAsync` is called without a prior factory call) constructs the hard-coded `MxAccessStaSession`. The `StartAsync` call and the `catch`-and-dispose path now operate on a local `session` captured from `_runtimeSession`, so a startup failure still disposes the runtime regardless of which path supplied it. A comment in `InitializeMxAccessAsync` documents the reasoning. Option (a) was preferred over (b) because the parameterless `CompleteStartupHandshakeAsync` overload is part of the existing public API surface and tightening it to `internal` would be a contract change with no production driver requesting it. No new regression test was added in this agent because Worker.Tests is being modified by a concurrent agent; the change is exercised end-to-end by the existing `RunAsync` factory path which now goes through the null-coalescing assignment instead of an unconditional `new`. 
+ +### Worker-022 + +| Field | Value | +|---|---| +| Severity | Low | +| Category | Code organization & conventions | +| Location | `src/MxGateway.Worker/MxAccess/MxAlarmSnapshot.cs:12`, `:26`, `:49` | +| Status | Resolved | + +**Description:** `MxAlarmSnapshot.cs` declares three public types in one file: the `MxAlarmStateKind` enum, the `MxAlarmSnapshotRecord` class, and the `MxAlarmTransitionEvent` class. The C# style guide (`docs/style-guides/CSharpStyleGuide.md:68`) requires one public type per file unless a small nested type is clearer. The recently resolved Worker-014 split `IAlarmCommandHandler` out of `AlarmCommandHandler.cs` for exactly this reason — the same convention applies here. + +**Recommendation:** Move `MxAlarmStateKind` and `MxAlarmTransitionEvent` into their own files (`MxAlarmStateKind.cs`, `MxAlarmTransitionEvent.cs`) and leave `MxAlarmSnapshotRecord` in `MxAlarmSnapshot.cs` (or rename the file to `MxAlarmSnapshotRecord.cs` to match the surviving type). Pure file-organization change; no behaviour or namespace impact. + +**Resolution:** 2026-05-20 — Split `MxAlarmSnapshot.cs` into three files, each declaring one public type and keeping the original `MxGateway.Worker.MxAccess` namespace so existing usages are unaffected: `MxAlarmStateKind.cs` (the enum, with its XML doc), `MxAlarmTransitionEvent.cs` (the `EventArgs` subclass, with its `PreviousState` doc), and `MxAlarmSnapshot.cs` (now containing only `MxAlarmSnapshotRecord` plus its XML doc). Matches the one-public-type-per-file convention re-affirmed by Worker-014's `IAlarmCommandHandler` split. Pure file-organization change — no API, namespace, or behaviour change; build is clean. 
diff --git a/docs/Authorization.md b/docs/Authorization.md index 9fda4eb..8eddddc 100644 --- a/docs/Authorization.md +++ b/docs/Authorization.md @@ -102,12 +102,18 @@ public string ResolveRequiredScope(object request) CloseSessionRequest => GatewayScopes.SessionClose, StreamEventsRequest => GatewayScopes.EventsRead, MxCommandRequest commandRequest => ResolveCommandScope(commandRequest.Command?.Kind ?? MxCommandKind.Unspecified), + AcknowledgeAlarmRequest => GatewayScopes.InvokeWrite, + QueryActiveAlarmsRequest => GatewayScopes.EventsRead, + TestConnectionRequest or + GetLastDeployTimeRequest or + DiscoverHierarchyRequest or + WatchDeployEventsRequest => GatewayScopes.MetadataRead, _ => GatewayScopes.Admin }; } ``` -The `_ => GatewayScopes.Admin` fallback is intentional: any future request type that the resolver does not recognize fails closed, requiring the strongest scope until the resolver is updated. +The `_ => GatewayScopes.Admin` fallback is intentional: any future request type that the resolver does not recognize fails closed, requiring the strongest scope until the resolver is updated. `AcknowledgeAlarm` is treated as a write — it mutates alarm state, mirroring `MxCommandKind.Write*` — and `QueryActiveAlarms` shares the alarm/event surface with `StreamEvents` and `MxCommandKind.DrainEvents`, so it carries `events:read`. `MxCommandRequest` is special because it multiplexes many MxAccess operations through a single RPC. The resolver inspects the embedded `MxCommandKind` so each operation gets its own scope: @@ -188,10 +194,10 @@ blocking constraint; secured values and raw credentials are never logged. 
|----------|-------|--------------| | `SessionOpen` | `session:open` | `OpenSessionRequest` | | `SessionClose` | `session:close` | `CloseSessionRequest` | -| `EventsRead` | `events:read` | `StreamEventsRequest`, `MxCommandKind.DrainEvents` | +| `EventsRead` | `events:read` | `StreamEventsRequest`, `QueryActiveAlarmsRequest`, `MxCommandKind.DrainEvents` | | `InvokeRead` | `invoke:read` | `MxCommandRequest` for read-style command kinds (`Register`, `AddItem`, `Advise`, and any kind not otherwise mapped) | -| `InvokeWrite` | `invoke:write` | `MxCommandKind.Write`, `MxCommandKind.Write2` | -| `InvokeSecure` | `invoke:secure` | `MxCommandKind.WriteSecured`, `MxCommandKind.WriteSecured2`, `MxCommandKind.AuthenticateUser` | +| `InvokeWrite` | `invoke:write` | `AcknowledgeAlarmRequest`, `MxCommandKind.Write`, `MxCommandKind.Write2`, `MxCommandKind.WriteBulk`, `MxCommandKind.Write2Bulk` | +| `InvokeSecure` | `invoke:secure` | `MxCommandKind.WriteSecured`, `MxCommandKind.WriteSecured2`, `MxCommandKind.WriteSecuredBulk`, `MxCommandKind.WriteSecured2Bulk`, `MxCommandKind.AuthenticateUser` | | `MetadataRead` | `metadata:read` | `MxCommandKind.ArchestraUserToId`, `MxCommandKind.GetSessionState`, `MxCommandKind.GetWorkerInfo`, `GalaxyRepository.TestConnection`, `GalaxyRepository.GetLastDeployTime`, `GalaxyRepository.DiscoverHierarchy`, `GalaxyRepository.WatchDeployEvents` | | `Admin` | `admin` | `MxCommandKind.ShutdownWorker`, the default for any unrecognized request type, and the dashboard authorization policy | diff --git a/docs/Contracts.md b/docs/Contracts.md index cef6690..1c4711c 100644 --- a/docs/Contracts.md +++ b/docs/Contracts.md @@ -23,6 +23,48 @@ the corresponding MXAccess `AddItem`, `Advise`, `UnAdvise`, and `RemoveItem` calls sequentially on the session STA and preserves input order in the result list. +The command model also includes bulk write/read command kinds: +`WriteBulk`, `Write2Bulk`, `WriteSecuredBulk`, `WriteSecured2Bulk`, and +`ReadBulk`. 
They are unary `Invoke` payloads on the same `MxAccessGateway` +surface (not separate gRPC methods) and exist so a caller can submit one list +of items per round trip while preserving MXAccess parity per entry. + +- `WriteBulkCommand` / `Write2BulkCommand` / `WriteSecuredBulkCommand` / + `WriteSecured2BulkCommand` each carry a `server_handle` and a `repeated` + list of entries (`WriteBulkEntry`, `Write2BulkEntry`, + `WriteSecuredBulkEntry`, `WriteSecured2BulkEntry`). Each entry mirrors the + single-item command shape — `item_handle` + `value` (+ `timestamp_value` on + the `*2` variants, + `current_user_id` / `verifier_user_id` on the secured + variants). All four replies use `BulkWriteReply`, which carries + `repeated BulkWriteResult`. A `BulkWriteResult` has `server_handle`, + `item_handle`, `was_successful`, `optional int32 hresult`, `repeated + MxStatusProxy statuses`, and `error_message`. Per-entry failures populate + `error_message` + `hresult` and never raise — callers iterate and inspect + each entry. The credential-sensitive redaction rules for `WriteSecured` / + `WriteSecured2` apply to every `value` inside `WriteSecuredBulkEntry` and + `WriteSecured2BulkEntry`. + +- `ReadBulkCommand` carries `server_handle`, `repeated string tag_addresses`, + and `uint32 timeout_ms` (0 means use the gateway-configured default). The + reply is `BulkReadReply` carrying `repeated BulkReadResult`. A + `BulkReadResult` has `server_handle`, `tag_address`, `item_handle`, + `was_successful`, `was_cached`, `value`, `quality`, `source_timestamp`, + `repeated MxStatusProxy statuses`, and `error_message`. 
MXAccess has no + synchronous `Read`, so `ReadBulk` is dual-mode per entry: when a tag is + already advised in the session the worker returns the cached + `OnDataChange` payload without touching the subscription + (`was_cached = true`); otherwise the worker takes a full + `AddItem` + `Advise` + wait-for-first-`OnDataChange` + `UnAdvise` + + `RemoveItem` snapshot lifecycle and returns the result + (`was_cached = false`). The asymmetry that `BulkReadResult` has no + `hresult` field is intentional — `ReadBulk` outcomes are timeout / cache + / lifecycle states rather than MXAccess COM return codes. + +See `gateway.md` for the full cached-vs-snapshot `ReadBulk` lifecycle and the +per-command scope requirements, and `docs/DesignDecisions.md` "Bulk Command +Family" for the rationale behind the per-entry result shape (independent +success tracking, input-order preservation, no partial-failure exceptions). + `src/MxGateway.Contracts/Protos/mxaccess_worker.proto` defines the named-pipe worker IPC envelope and control messages. It imports `mxaccess_gateway.proto` so the worker and gateway use the same command, reply, diff --git a/docs/GatewayTesting.md b/docs/GatewayTesting.md index b0bf7a4..27dfcc1 100644 --- a/docs/GatewayTesting.md +++ b/docs/GatewayTesting.md @@ -51,14 +51,29 @@ shutdown request even when a command or event assertion fails. Cleanup failures in that `finally` block are logged rather than thrown, so a real assertion failure is never masked by a shutdown timeout. 
-`WorkerLiveMxAccessSmokeTests` additionally covers two MXAccess parity paths the +`WorkerLiveMxAccessSmokeTests` additionally covers five MXAccess parity paths the fake-worker tests cannot validate: -- a `Write` round-trip against an advised item, and +- a `Write` round-trip against an advised item, asserting both that the reply is + `Ok` / `MxCommandKind.Write` *and* that the worker emits a matching + `OnWriteComplete` event for the targeted (server, item) handle pair — the + same round-trip proof used by `scripts/run-client-e2e-tests.ps1`, - an `AddItem` against an invalid server handle, asserting the MXAccess failure - surfaces in the command reply without faulting the gateway transport. + surfaces in the command reply without faulting the gateway transport, +- the `UnAdvise` → `RemoveItem` → `Unregister` teardown chain, asserting each + step replies `Ok` with the matching `MxCommandKind`, that no further + `OnDataChange` events arrive for the un-advised pair, and that a second + `RemoveItem` against the freed handle relays a non-`Ok` MXAccess failure, +- a `WriteSecured` round-trip after `AuthenticateUser`, asserting the reply + carries `MxCommandKind.WriteSecured` and the credential password never + appears in the diagnostic message (parity for both the secured-write + ordering rule and the "do not log secrets" contract), and +- an abnormal worker exit (the worker process is killed mid-session) where the + gateway must transition the session to `SessionState.Faulted` with a + non-empty fault description carrying a known worker-client classification + (pipe disconnected / worker faulted / end-of-stream / heartbeat expired). -All three tests are gated by the same `MXGATEWAY_RUN_LIVE_MXACCESS_TESTS=1` +All six tests are gated by the same `MXGATEWAY_RUN_LIVE_MXACCESS_TESTS=1` opt-in variable. 
Build the worker before running the smoke: @@ -81,7 +96,9 @@ Optional live smoke variables: | `MXGATEWAY_LIVE_MXACCESS_WORKER_EXE` | First existing `MxGateway.Worker.exe` under `src/MxGateway.Worker/bin/...` | Worker executable path. Set this when running against a packaged worker or a non-default build output. | | `MXGATEWAY_LIVE_MXACCESS_ITEM` | `TestChildObject.TestInt` | MXAccess item reference used by `AddItem`. | | `MXGATEWAY_LIVE_MXACCESS_CLIENT_NAME` | `MxGateway.IntegrationTests` | Client name passed to `Register`. | -| `MXGATEWAY_LIVE_MXACCESS_EVENT_TIMEOUT_SECONDS` | `15` | Maximum wait for the first `OnDataChange`. | +| `MXGATEWAY_LIVE_MXACCESS_EVENT_TIMEOUT_SECONDS` | `15` | Maximum wait for the first `OnDataChange` (also used for the `OnWriteComplete` round-trip and the abnormal-exit fault transition). | +| `MXGATEWAY_LIVE_MXACCESS_WRITE_SECURED_USER` | `admin` | ArchestrA user name passed to `AuthenticateUser` before the `WriteSecured` parity step. | +| `MXGATEWAY_LIVE_MXACCESS_WRITE_SECURED_PASSWORD` | `admin123` | Password paired with the user above. Never logged; the test asserts the value does not appear in the WriteSecured diagnostic message. | The test output includes session id, worker process id, command status, HRESULT/status diagnostics, event sequence and handles, close status, and worker @@ -116,6 +133,41 @@ Optional live Galaxy variables: The default connection string targets `ZB` on `localhost` with Windows authentication, which matches the Galaxy Repository conventions in CLAUDE.md. +## Galaxy Filter Safety + +`GalaxyFilterInputSafetyTests` in `src/MxGateway.Tests/Galaxy/` covers adversarial +input handling for the Galaxy Repository browse filter layer. It runs in the +unit-test project (no live SQL needed) and complements the live SQL coverage in +`GalaxyRepositoryLiveTests`. + +The test class re-frames the original "Galaxy SQL injection" concern (Tests-002 in +`code-reviews/Tests/findings.md`). 
`GalaxyRepository` issues only four *constant* +SQL statements (`HierarchySql`, `AttributesSql`, `SELECT 1`, +`SELECT time_of_last_deploy FROM galaxy`) — no `DiscoverHierarchyRequest` field +is ever concatenated into a SQL string, so there is no dynamic SQL surface and no +`LIKE`-escaping helper to test. All filters (`TagNameGlob`, `RootTagName`, +template-chain, category, contained-path) are applied **in memory** by +`GalaxyHierarchyProjector` / `GalaxyGlobMatcher` against the cached snapshot. + +The adversarial-input matrix (`'`, `' OR '1'='1`, `'; DROP TABLE gobject;--`, +`%`, `_`, `100%_off`, `[abc]`, `Pump'001`) pins the following invariants: + +- SQL metacharacters (`'`, `;`) and `LIKE`-wildcards (`%`, `_`) are treated as + opaque literals by `GalaxyGlobMatcher` — they never act as wildcards, never + spuriously match unrelated text. +- Only `*` and `?` are glob wildcards. +- `GalaxyGlobMatcher` applies a 100 ms regex timeout so a pathological glob + (e.g. 5 000 `a` characters plus a literal `!`) completes promptly rather than + catastrophically backtracking. +- `GalaxyHierarchyProjector` returns zero matches (rather than the whole + hierarchy) for an adversarial `TagNameGlob` or `TemplateChainContains`, and + surfaces `NotFound` for an adversarial `RootTagName`. +- The `DiscoverHierarchy` RPC end-to-end returns zero matches for adversarial + `TagNameGlob` rather than faulting. + +These invariants are the real security surface of the Galaxy browse path; the +SQL-injection framing does not apply to a constant-query layer. + ## Live LDAP `DashboardLdapLiveTests` in `src/MxGateway.IntegrationTests/` exercises diff --git a/docs/MxAccessWorkerInstanceDesign.md b/docs/MxAccessWorkerInstanceDesign.md index 99d88df..f2eeafc 100644 --- a/docs/MxAccessWorkerInstanceDesign.md +++ b/docs/MxAccessWorkerInstanceDesign.md @@ -655,12 +655,22 @@ the event queue implementation owns those counters. 
The STA watchdog currently emits a `WorkerFault` with `WorkerFaultCategory.StaHung` when `LastStaActivityUtc` is older than -`WorkerPipeSessionOptions.HeartbeatGrace`. The fault includes the current -command correlation id when a command is active. Command duration and high event -queue depth remain observable through heartbeat fields until dedicated -thresholds own those warnings. The worker reports stale STA activity, but the -gateway owns the final kill decision through its existing heartbeat and worker -lifecycle policy. +`WorkerPipeSessionOptions.HeartbeatGrace` **and no command is in flight**. +`StaRuntime.ProcessQueuedCommands` calls `MarkActivity()` only immediately +before and after each work item, so a synchronously long-running STA command +(for example a `ReadBulk` waiting `timeout_ms` for the first `OnDataChange`) +legitimately freezes `LastStaActivityUtc` for the duration of the wait while +the worker is healthy. The watchdog is therefore suppressed while the +heartbeat snapshot's `CurrentCommandCorrelationId` is non-empty: the worker is +busy executing a command, not hung, and the heartbeat already surfaces the +in-flight correlation id so the gateway can apply its own per-command timeout +if it considers the command too slow. The fault still fires on a truly hung +STA — no command in flight and no activity for longer than `HeartbeatGrace` — +which is the only case the watchdog can usefully distinguish from a slow +command. Command duration and high event queue depth remain observable through +heartbeat fields until dedicated thresholds own those warnings. The worker +reports stale STA activity, but the gateway owns the final kill decision +through its existing heartbeat and worker lifecycle policy. 
## Shutdown diff --git a/docs/Sessions.md b/docs/Sessions.md index bb4696e..7000f87 100644 --- a/docs/Sessions.md +++ b/docs/Sessions.md @@ -33,12 +33,19 @@ public void TransitionTo(SessionState nextState) return; } + if (_state is SessionState.Closing + && nextState is not SessionState.Closed + && nextState is not SessionState.Faulted) + { + return; + } + _state = nextState; } } ``` -`Closed` is terminal and `Faulted` only allows a transition to `Closed`. This guards against late callbacks (worker exit, heartbeat timeout) re-animating a session that is already torn down. +`Closed` is terminal, `Faulted` only allows a transition to `Closed`, and `Closing` only allows a transition to `Closed` or `Faulted`. This guards against late callbacks (worker exit, heartbeat timeout) re-animating a session that is already tearing down or torn down — once `CloseAsync` has set `Closing` under `_syncRoot`, no `TransitionTo(Ready)` from another thread can walk the session back to `Ready`. Both close-related writes (`Closing` and `Closed`) go through `_syncRoot` exactly like every other state write; `_closeLock` only serializes concurrent close attempts. ### SessionManager (ISessionManager) @@ -184,7 +191,7 @@ Sessions open with `MxGateway:Sessions:DefaultLeaseSeconds` (default 1800) added ### Close -`GatewaySession.CloseAsync` is serialized by a per-session `SemaphoreSlim` (`_closeLock`). It transitions to `Closing`, asks the worker client to shut down within `ShutdownTimeout`, and on success transitions to `Closed`. If `WorkerClient.ShutdownAsync` throws, the session falls back to `IWorkerClient.Kill` (forced close): +`GatewaySession.CloseAsync` is serialized by a per-session `SemaphoreSlim` (`_closeLock`) so only one close runs at a time, but every read/write of `_state` still passes through `_syncRoot` (via `TryBeginClose` and `MarkClosed`). 
The close path therefore obeys the same lock discipline as `TransitionTo` / `MarkFaulted`: it transitions to `Closing`, asks the worker client to shut down within `ShutdownTimeout`, and on success transitions to `Closed`. `DisposeAsync` waits on `_closeLock` once before disposing the semaphore so an in-flight close's `Release()` cannot race against the dispose. If `WorkerClient.ShutdownAsync` throws, the session falls back to `IWorkerClient.Kill` (forced close): ```csharp if (_workerClient is not null) diff --git a/src/MxGateway.Contracts/Generated/GalaxyRepository.cs b/src/MxGateway.Contracts/Generated/GalaxyRepository.cs index 3224a65..6fe7494 100644 --- a/src/MxGateway.Contracts/Generated/GalaxyRepository.cs +++ b/src/MxGateway.Contracts/Generated/GalaxyRepository.cs @@ -3053,6 +3053,14 @@ namespace MxGateway.Contracts.Proto.Galaxy { /// Field number for the "mx_data_type" field. public const int MxDataTypeFieldNumber = 3; private int mxDataType_; + /// + /// Raw Galaxy SQL `dbo.data_type` identifier, passed through unchanged. + /// This is NOT a member of `mxaccess_gateway.v1.MxDataType` — Galaxy's + /// type enumeration is distinct from MXAccess's wire data-type enum and + /// the two must not be cast or compared. The GalaxyRepository service is + /// metadata-only and deliberately does not share types with + /// mxaccess_gateway.proto. See docs/GalaxyRepository.md. + /// [global::System.Diagnostics.DebuggerNonUserCodeAttribute] [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] public int MxDataType { @@ -3065,6 +3073,10 @@ namespace MxGateway.Contracts.Proto.Galaxy { /// Field number for the "data_type_name" field. public const int DataTypeNameFieldNumber = 4; private string dataTypeName_ = ""; + /// + /// Human-readable name from Galaxy's `dbo.data_type` table (e.g. "Float", + /// "Integer", "Boolean"). Free-form Galaxy text; not a stable enum. 
+ /// [global::System.Diagnostics.DebuggerNonUserCodeAttribute] [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] public string DataTypeName { @@ -3113,6 +3125,11 @@ namespace MxGateway.Contracts.Proto.Galaxy { /// Field number for the "mx_attribute_category" field. public const int MxAttributeCategoryFieldNumber = 8; private int mxAttributeCategory_; + /// + /// Raw Galaxy SQL attribute-category identifier, passed through unchanged. + /// Galaxy-specific; not mapped to any gateway enum. See + /// docs/GalaxyRepository.md. + /// [global::System.Diagnostics.DebuggerNonUserCodeAttribute] [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] public int MxAttributeCategory { @@ -3125,6 +3142,11 @@ namespace MxGateway.Contracts.Proto.Galaxy { /// Field number for the "security_classification" field. public const int SecurityClassificationFieldNumber = 9; private int securityClassification_; + /// + /// Raw Galaxy SQL security-classification identifier, passed through + /// unchanged. Galaxy-specific; not mapped to any gateway enum. See + /// docs/GalaxyRepository.md. + /// [global::System.Diagnostics.DebuggerNonUserCodeAttribute] [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] public int SecurityClassification { diff --git a/src/MxGateway.Contracts/Generated/MxaccessGateway.cs b/src/MxGateway.Contracts/Generated/MxaccessGateway.cs index 0464e77..dbf85d4 100644 --- a/src/MxGateway.Contracts/Generated/MxaccessGateway.cs +++ b/src/MxGateway.Contracts/Generated/MxaccessGateway.cs @@ -13787,6 +13787,10 @@ namespace MxGateway.Contracts.Proto { /// Field number for the "value" field. public const int ValueFieldNumber = 4; private global::MxGateway.Contracts.Proto.MxValue value_; + /// + /// Credential-sensitive write value. Implementations must not log this field + /// unless an explicit redacted value-logging path is enabled. 
+ /// [global::System.Diagnostics.DebuggerNonUserCodeAttribute] [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] public global::MxGateway.Contracts.Proto.MxValue Value { @@ -14334,6 +14338,10 @@ namespace MxGateway.Contracts.Proto { /// Field number for the "value" field. public const int ValueFieldNumber = 4; private global::MxGateway.Contracts.Proto.MxValue value_; + /// + /// Credential-sensitive write value. Implementations must not log this field + /// unless an explicit redacted value-logging path is enabled. + /// [global::System.Diagnostics.DebuggerNonUserCodeAttribute] [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] public global::MxGateway.Contracts.Proto.MxValue Value { @@ -14613,6 +14621,7 @@ namespace MxGateway.Contracts.Proto { /// /// Bulk Read — snapshot the current value for each requested tag. MXAccess COM /// has no synchronous Read; the worker implements ReadBulk as: + /// /// - If the tag is already in the session's item registry AND that item is /// currently advised AND the worker has a cached OnDataChange for it, the /// reply returns the cached value WITHOUT modifying the existing @@ -14621,6 +14630,7 @@ namespace MxGateway.Contracts.Proto { /// Advise, wait up to `timeout_ms` for the first OnDataChange, then /// UnAdvise + RemoveItem before returning. The session is left exactly /// as it was before the call (was_cached = false). + /// /// `timeout_ms == 0` uses the gateway-configured default (1000 ms). 
/// [global::System.Diagnostics.DebuggerDisplayAttribute("{ToString(),nq}")] diff --git a/src/MxGateway.Contracts/Protos/galaxy_repository.proto b/src/MxGateway.Contracts/Protos/galaxy_repository.proto index 6bb9c60..db5f164 100644 --- a/src/MxGateway.Contracts/Protos/galaxy_repository.proto +++ b/src/MxGateway.Contracts/Protos/galaxy_repository.proto @@ -117,12 +117,26 @@ message GalaxyObject { message GalaxyAttribute { string attribute_name = 1; string full_tag_reference = 2; + // Raw Galaxy SQL `dbo.data_type` identifier, passed through unchanged. + // This is NOT a member of `mxaccess_gateway.v1.MxDataType` — Galaxy's + // type enumeration is distinct from MXAccess's wire data-type enum and + // the two must not be cast or compared. The GalaxyRepository service is + // metadata-only and deliberately does not share types with + // mxaccess_gateway.proto. See docs/GalaxyRepository.md. int32 mx_data_type = 3; + // Human-readable name from Galaxy's `dbo.data_type` table (e.g. "Float", + // "Integer", "Boolean"). Free-form Galaxy text; not a stable enum. string data_type_name = 4; bool is_array = 5; int32 array_dimension = 6; bool array_dimension_present = 7; + // Raw Galaxy SQL attribute-category identifier, passed through unchanged. + // Galaxy-specific; not mapped to any gateway enum. See + // docs/GalaxyRepository.md. int32 mx_attribute_category = 8; + // Raw Galaxy SQL security-classification identifier, passed through + // unchanged. Galaxy-specific; not mapped to any gateway enum. See + // docs/GalaxyRepository.md. 
int32 security_classification = 9; bool is_historized = 10; bool is_alarm = 11; diff --git a/src/MxGateway.Contracts/Protos/mxaccess_gateway.proto b/src/MxGateway.Contracts/Protos/mxaccess_gateway.proto index 36ed0ae..36f9d73 100644 --- a/src/MxGateway.Contracts/Protos/mxaccess_gateway.proto +++ b/src/MxGateway.Contracts/Protos/mxaccess_gateway.proto @@ -393,6 +393,8 @@ message WriteSecuredBulkEntry { int32 item_handle = 1; int32 current_user_id = 2; int32 verifier_user_id = 3; + // Credential-sensitive write value. Implementations must not log this field + // unless an explicit redacted value-logging path is enabled. MxValue value = 4; } @@ -407,12 +409,15 @@ message WriteSecured2BulkEntry { int32 item_handle = 1; int32 current_user_id = 2; int32 verifier_user_id = 3; + // Credential-sensitive write value. Implementations must not log this field + // unless an explicit redacted value-logging path is enabled. MxValue value = 4; MxValue timestamp_value = 5; } // Bulk Read — snapshot the current value for each requested tag. MXAccess COM // has no synchronous Read; the worker implements ReadBulk as: +// // - If the tag is already in the session's item registry AND that item is // currently advised AND the worker has a cached OnDataChange for it, the // reply returns the cached value WITHOUT modifying the existing @@ -421,6 +426,7 @@ message WriteSecured2BulkEntry { // Advise, wait up to `timeout_ms` for the first OnDataChange, then // UnAdvise + RemoveItem before returning. The session is left exactly // as it was before the call (was_cached = false). +// // `timeout_ms == 0` uses the gateway-configured default (1000 ms). 
message ReadBulkCommand { int32 server_handle = 1; diff --git a/src/MxGateway.IntegrationTests/DashboardLdapLiveTests.cs b/src/MxGateway.IntegrationTests/DashboardLdapLiveTests.cs index 37607fe..b174237 100644 --- a/src/MxGateway.IntegrationTests/DashboardLdapLiveTests.cs +++ b/src/MxGateway.IntegrationTests/DashboardLdapLiveTests.cs @@ -7,10 +7,10 @@ using MxGateway.Server.Dashboard; namespace MxGateway.IntegrationTests; [Collection(LiveResourcesCollection.Name)] +[Trait("Category", "LiveLdap")] public sealed class DashboardLdapLiveTests { [LiveLdapFact] - [Trait("Category", "LiveLdap")] public async Task AuthenticateAsync_AdminInGwAdminGroup_Succeeds() { DashboardAuthenticator authenticator = CreateAuthenticator(); @@ -29,7 +29,6 @@ public sealed class DashboardLdapLiveTests } [LiveLdapFact] - [Trait("Category", "LiveLdap")] public async Task AuthenticateAsync_ReadOnlyUserMissingGwAdminGroup_Fails() { DashboardAuthenticator authenticator = CreateAuthenticator(); @@ -45,7 +44,6 @@ public sealed class DashboardLdapLiveTests } [LiveLdapFact] - [Trait("Category", "LiveLdap")] public async Task AuthenticateAsync_AdminWithWrongPassword_FailsWithoutLeakingPassword() { // Exercises the LdapException branch: the user exists and the service @@ -64,7 +62,6 @@ public sealed class DashboardLdapLiveTests } [LiveLdapFact] - [Trait("Category", "LiveLdap")] public async Task AuthenticateAsync_UnknownUsername_Fails() { // Exercises the `candidate is null` branch: the service-account search @@ -81,7 +78,6 @@ public sealed class DashboardLdapLiveTests } [LiveLdapFact] - [Trait("Category", "LiveLdap")] public async Task AuthenticateAsync_ServerUnreachable_FailsWithoutThrowing() { // Exercises the connect-failure path: a closed loopback port produces a diff --git a/src/MxGateway.IntegrationTests/Galaxy/GalaxyRepositoryLiveTests.cs b/src/MxGateway.IntegrationTests/Galaxy/GalaxyRepositoryLiveTests.cs index 86d412a..a04eb77 100644 --- 
a/src/MxGateway.IntegrationTests/Galaxy/GalaxyRepositoryLiveTests.cs +++ b/src/MxGateway.IntegrationTests/Galaxy/GalaxyRepositoryLiveTests.cs @@ -3,11 +3,11 @@ using MxGateway.Server.Galaxy; namespace MxGateway.IntegrationTests.Galaxy; [Collection(LiveResourcesCollection.Name)] +[Trait("Category", "LiveGalaxy")] public sealed class GalaxyRepositoryLiveTests { /// Verifies that the Galaxy Repository can establish a live connection to the ZB database. [LiveGalaxyRepositoryFact] - [Trait("Category", "LiveGalaxy")] public async Task TestConnection_AgainstZb_Succeeds() { GalaxyRepository repository = CreateRepository(); @@ -19,7 +19,6 @@ public sealed class GalaxyRepositoryLiveTests /// Verifies that the last deploy time can be retrieved from the ZB database. [LiveGalaxyRepositoryFact] - [Trait("Category", "LiveGalaxy")] public async Task GetLastDeployTime_AgainstZb_ReturnsTimestamp() { GalaxyRepository repository = CreateRepository(); @@ -31,7 +30,6 @@ public sealed class GalaxyRepositoryLiveTests /// Verifies that the hierarchy can be retrieved from the ZB database. [LiveGalaxyRepositoryFact] - [Trait("Category", "LiveGalaxy")] public async Task GetHierarchy_AgainstZb_ReturnsObjects() { GalaxyRepository repository = CreateRepository(); @@ -49,7 +47,6 @@ public sealed class GalaxyRepositoryLiveTests /// Verifies that object attributes can be retrieved from the ZB database. 
[LiveGalaxyRepositoryFact] - [Trait("Category", "LiveGalaxy")] public async Task GetAttributes_AgainstZb_ReturnsAtLeastOneAttribute() { GalaxyRepository repository = CreateRepository(); diff --git a/src/MxGateway.IntegrationTests/Galaxy/LiveGalaxyRepositoryFactAttribute.cs b/src/MxGateway.IntegrationTests/Galaxy/LiveGalaxyRepositoryFactAttribute.cs index 32bcf63..81d43b3 100644 --- a/src/MxGateway.IntegrationTests/Galaxy/LiveGalaxyRepositoryFactAttribute.cs +++ b/src/MxGateway.IntegrationTests/Galaxy/LiveGalaxyRepositoryFactAttribute.cs @@ -1,3 +1,5 @@ +using MxGateway.Server.Galaxy; + namespace MxGateway.IntegrationTests.Galaxy; /// Fact attribute that skips tests unless live Galaxy Repository tests are explicitly enabled. @@ -20,8 +22,12 @@ public sealed class LiveGalaxyRepositoryFactAttribute : FactAttribute /// Gets a value indicating whether live Galaxy Repository tests are enabled. public static bool Enabled => IntegrationTestEnvironment.IsEnabled(EnableVariableName); - /// Gets the Galaxy Repository connection string from environment or default. + /// + /// Gets the Galaxy Repository connection string from environment or the production + /// default. The default is sourced from <see cref="GalaxyRepositoryOptions.DefaultConnectionString"/> + /// so the live-test fallback cannot drift away from the production default. + /// public static string ConnectionString => Environment.GetEnvironmentVariable(ConnectionStringVariableName) - ?? "Server=localhost;Database=ZB;Integrated Security=True;TrustServerCertificate=True;Encrypt=False;"; + ??
GalaxyRepositoryOptions.DefaultConnectionString; } diff --git a/src/MxGateway.IntegrationTests/WorkerLiveMxAccessSmokeTests.cs b/src/MxGateway.IntegrationTests/WorkerLiveMxAccessSmokeTests.cs index ad326d8..d7735d9 100644 --- a/src/MxGateway.IntegrationTests/WorkerLiveMxAccessSmokeTests.cs +++ b/src/MxGateway.IntegrationTests/WorkerLiveMxAccessSmokeTests.cs @@ -18,6 +18,7 @@ using Xunit.Abstractions; namespace MxGateway.IntegrationTests; [Collection(LiveResourcesCollection.Name)] +[Trait("Category", "LiveMxAccess")] public sealed class WorkerLiveMxAccessSmokeTests(ITestOutputHelper output) { private static readonly TimeSpan CommandTimeout = TimeSpan.FromSeconds(15); @@ -27,7 +28,6 @@ public sealed class WorkerLiveMxAccessSmokeTests(ITestOutputHelper output) /// Verifies that a gateway session can register, add item, advise, and stream events from live MXAccess. /// [LiveMxAccessFact] - [Trait("Category", "LiveMxAccess")] public async Task GatewaySession_WithLiveWorker_RegistersAdvisesStreamsDataAndCloses() { string workerExecutablePath = IntegrationTestEnvironment.ResolveLiveMxAccessWorkerExecutablePath(); @@ -37,9 +37,9 @@ public sealed class WorkerLiveMxAccessSmokeTests(ITestOutputHelper output) TestWorkerProcessFactory processFactory = new(output); await using GatewayServiceFixture fixture = new(workerExecutablePath, processFactory, output); + using RecordingServerStreamWriter eventWriter = new(); string? sessionId = null; - RecordingServerStreamWriter? eventWriter = null; Task? 
streamTask = null; using CancellationTokenSource streamCancellation = new(); @@ -59,7 +59,6 @@ public sealed class WorkerLiveMxAccessSmokeTests(ITestOutputHelper output) Assert.Equal(ProtocolStatusCode.Ok, openReply.ProtocolStatus.Code); Assert.True(openReply.WorkerProcessId > 0); - eventWriter = new RecordingServerStreamWriter(); streamTask = fixture.Service.StreamEvents( new StreamEventsRequest { SessionId = sessionId }, eventWriter, @@ -113,10 +112,11 @@ public sealed class WorkerLiveMxAccessSmokeTests(ITestOutputHelper output) } /// - /// Verifies that a Write command round-trips through live MXAccess against an advised item. + /// Verifies that a Write command round-trips through live MXAccess against an advised item + /// and that the worker emits a matching <see cref="MxEventFamily.OnWriteComplete"/> event + /// — the proof of round-trip the cross-language client e2e runner relies on. /// [LiveMxAccessFact] - [Trait("Category", "LiveMxAccess")] public async Task GatewaySession_WithLiveWorker_WritesValueToAdvisedItem() { string workerExecutablePath = IntegrationTestEnvironment.ResolveLiveMxAccessWorkerExecutablePath(); @@ -126,9 +126,11 @@ public sealed class WorkerLiveMxAccessSmokeTests(ITestOutputHelper output) TestWorkerProcessFactory processFactory = new(output); await using GatewayServiceFixture fixture = new(workerExecutablePath, processFactory, output); + using RecordingServerStreamWriter eventWriter = new(); string? sessionId = null; Task?
streamTask = null; + using CancellationTokenSource streamCancellation = new(); try { @@ -144,11 +146,10 @@ public sealed class WorkerLiveMxAccessSmokeTests(ITestOutputHelper output) sessionId = openReply.SessionId; Assert.Equal(ProtocolStatusCode.Ok, openReply.ProtocolStatus.Code); - RecordingServerStreamWriter eventWriter = new(); streamTask = fixture.Service.StreamEvents( new StreamEventsRequest { SessionId = sessionId }, eventWriter, - new TestServerCallContext()); + new TestServerCallContext(streamCancellation.Token)); MxCommandReply registerReply = await fixture.Service.Invoke( CreateRegisterRequest(sessionId), @@ -180,16 +181,50 @@ public sealed class WorkerLiveMxAccessSmokeTests(ITestOutputHelper output) new TestServerCallContext()).ConfigureAwait(false); LogReply("Write", writeReply); - // The gateway must always report a protocol-level status. MXAccess - // parity details (a write rejection, a secured-item failure) belong - // in hresult / statuses, not in a transport failure — the command - // itself completed its round-trip to the worker and back. + // Happy-path Write: the worker COM call succeeded so HResultConverter + // produces ProtocolStatusCode.Ok. An MXAccess rejection (a write to a + // bad item, a secured-item failure) would surface as + // ProtocolStatusCode.MxaccessFailure with a non-zero hresult — never + // as an RpcException / transport fault, because the command still + // completed its round-trip to the worker and back. Assert.Equal(ProtocolStatusCode.Ok, writeReply.ProtocolStatus.Code); Assert.Equal(MxCommandKind.Write, writeReply.Kind); + + // Proof of round-trip: MXAccess fires OnWriteComplete (event id 2) + // after the underlying provider acknowledges the write — that is + // the event the cross-language client e2e runner asserts on. We + // scan the recorded stream (so an interleaving OnDataChange does + // not preempt the match) for an OnWriteComplete carrying the same + // server/item handles the Write command targeted. 
+ MxEvent writeComplete = await eventWriter + .WaitForMessageAsync( + candidate => candidate.Family == MxEventFamily.OnWriteComplete + && candidate.ServerHandle == registerReply.Register.ServerHandle + && candidate.ItemHandle == addItemReply.AddItem.ItemHandle, + IntegrationTestEnvironment.LiveMxAccessEventTimeout, + streamCancellation.Token) + .ConfigureAwait(false); + LogEvent(writeComplete); + + Assert.Equal(MxEventFamily.OnWriteComplete, writeComplete.Family); + Assert.Equal(sessionId, writeComplete.SessionId); + Assert.Equal(registerReply.Register.ServerHandle, writeComplete.ServerHandle); + Assert.Equal(addItemReply.AddItem.ItemHandle, writeComplete.ItemHandle); + + // The stream task must not be in a faulted state. ShutDownAsync's + // broad catch would otherwise swallow the fault and silently let + // this Write-parity coverage pass against a broken event pipeline. + Assert.False( + streamTask.IsFaulted, + streamTask.Exception?.ToString() ?? "Event stream task faulted without an exception."); } finally { - await ShutDownAsync(fixture, processFactory, sessionId, streamTask).ConfigureAwait(false); + // Cancel the stream call before draining so StreamEvents observes + // cancellation rather than blocking on the channel. Any unhandled + // stream-task fault is rethrown from ShutDownAsync into the test. + streamCancellation.Cancel(); + await ShutDownAsync(fixture, processFactory, sessionId, streamTask, propagateStreamFaults: true).ConfigureAwait(false); } } @@ -198,7 +233,6 @@ public sealed class WorkerLiveMxAccessSmokeTests(ITestOutputHelper output) /// without faulting the gateway transport, exercising the invalid-handle parity path. 
/// [LiveMxAccessFact] - [Trait("Category", "LiveMxAccess")] public async Task GatewaySession_WithLiveWorker_InvalidHandleCommand_SurfacesFailureWithoutTransportFault() { string workerExecutablePath = IntegrationTestEnvironment.ResolveLiveMxAccessWorkerExecutablePath(); @@ -235,8 +269,10 @@ public sealed class WorkerLiveMxAccessSmokeTests(ITestOutputHelper output) // MXAccess parity: an invalid handle is an MXAccess-level failure. // The command still completed its worker round-trip, so the gateway - // protocol status is Ok and the failure shows up in hresult / the - // status proxies — it must not be reported as a transport fault. + // must reply with ProtocolStatusCode.MxaccessFailure and a non-zero + // hresult carrying the COM failure (per HResultConverter) — never a + // gRPC transport fault. The assertion below just checks the status + // is not Ok; the failure detail lives in hresult / the status proxies. Assert.NotEqual(ProtocolStatusCode.Ok, addItemReply.ProtocolStatus.Code); Assert.True( addItemReply.AddItem is null || addItemReply.AddItem.ItemHandle <= 0, @@ -248,35 +284,411 @@ public sealed class WorkerLiveMxAccessSmokeTests(ITestOutputHelper output) } } + /// + /// Verifies the MXAccess teardown chain: Unadvise then RemoveItem then Unregister + /// each return , and the worker stops emitting + /// OnDataChange events for the un-advised item. Exercises the lifecycle-ordering + /// parity CLAUDE.md singles out as a "do not synthesize" rule. + /// + [LiveMxAccessFact] + public async Task GatewaySession_WithLiveWorker_UnadviseRemoveItemUnregister_TeardownOrderingParity() + { + string workerExecutablePath = IntegrationTestEnvironment.ResolveLiveMxAccessWorkerExecutablePath(); + Assert.True( + File.Exists(workerExecutablePath), + $"Live MXAccess worker executable was not found at {workerExecutablePath}. 
Build the worker or set {IntegrationTestEnvironment.LiveMxAccessWorkerExecutableVariableName}."); + + TestWorkerProcessFactory processFactory = new(output); + await using GatewayServiceFixture fixture = new(workerExecutablePath, processFactory, output); + using RecordingServerStreamWriter eventWriter = new(); + + string? sessionId = null; + Task? streamTask = null; + using CancellationTokenSource streamCancellation = new(); + + try + { + OpenSessionReply openReply = await fixture.Service.OpenSession( + new OpenSessionRequest + { + ClientSessionName = "live-mxaccess-teardown", + ClientCorrelationId = "live-open-teardown", + CommandTimeout = Duration.FromTimeSpan(CommandTimeout), + }, + new TestServerCallContext()).ConfigureAwait(false); + + sessionId = openReply.SessionId; + Assert.Equal(ProtocolStatusCode.Ok, openReply.ProtocolStatus.Code); + + streamTask = fixture.Service.StreamEvents( + new StreamEventsRequest { SessionId = sessionId }, + eventWriter, + new TestServerCallContext(streamCancellation.Token)); + + MxCommandReply registerReply = await fixture.Service.Invoke( + CreateRegisterRequest(sessionId), + new TestServerCallContext()).ConfigureAwait(false); + LogReply("Register", registerReply); + Assert.Equal(ProtocolStatusCode.Ok, registerReply.ProtocolStatus.Code); + + int serverHandle = registerReply.Register.ServerHandle; + + MxCommandReply addItemReply = await fixture.Service.Invoke( + CreateAddItemRequest(sessionId, serverHandle), + new TestServerCallContext()).ConfigureAwait(false); + LogReply("AddItem", addItemReply); + Assert.Equal(ProtocolStatusCode.Ok, addItemReply.ProtocolStatus.Code); + int itemHandle = addItemReply.AddItem.ItemHandle; + + MxCommandReply adviseReply = await fixture.Service.Invoke( + CreateAdviseRequest(sessionId, serverHandle, itemHandle), + new TestServerCallContext()).ConfigureAwait(false); + LogReply("Advise", adviseReply); + Assert.Equal(ProtocolStatusCode.Ok, adviseReply.ProtocolStatus.Code); + + // Wait for an OnDataChange to 
prove the subscription is live before tearing it down. + MxEvent firstDataChange = await eventWriter + .WaitForMessageAsync( + candidate => candidate.Family == MxEventFamily.OnDataChange + && candidate.ServerHandle == serverHandle + && candidate.ItemHandle == itemHandle, + IntegrationTestEnvironment.LiveMxAccessEventTimeout, + streamCancellation.Token) + .ConfigureAwait(false); + LogEvent(firstDataChange); + + // RecordingServerStreamWriter.Messages returns a snapshot copy under its own + // lock, so iterating after each teardown step is safe without external sync. + int dataChangeCountBeforeUnadvise = CountMatchingEvents( + eventWriter, + e => e.Family == MxEventFamily.OnDataChange + && e.ServerHandle == serverHandle + && e.ItemHandle == itemHandle); + + // 1) UnAdvise — must reply Ok; the worker must stop emitting OnDataChange + // for this (server, item) pair after this returns. + MxCommandReply unadviseReply = await fixture.Service.Invoke( + CreateUnAdviseRequest(sessionId, serverHandle, itemHandle), + new TestServerCallContext()).ConfigureAwait(false); + LogReply("UnAdvise", unadviseReply); + Assert.Equal(ProtocolStatusCode.Ok, unadviseReply.ProtocolStatus.Code); + Assert.Equal(MxCommandKind.UnAdvise, unadviseReply.Kind); + + // 2) RemoveItem — must reply Ok against the same handles. + MxCommandReply removeItemReply = await fixture.Service.Invoke( + CreateRemoveItemRequest(sessionId, serverHandle, itemHandle), + new TestServerCallContext()).ConfigureAwait(false); + LogReply("RemoveItem", removeItemReply); + Assert.Equal(ProtocolStatusCode.Ok, removeItemReply.ProtocolStatus.Code); + Assert.Equal(MxCommandKind.RemoveItem, removeItemReply.Kind); + + // 3) Unregister — closes the client session inside the worker. 
+ MxCommandReply unregisterReply = await fixture.Service.Invoke( + CreateUnregisterRequest(sessionId, serverHandle), + new TestServerCallContext()).ConfigureAwait(false); + LogReply("Unregister", unregisterReply); + Assert.Equal(ProtocolStatusCode.Ok, unregisterReply.ProtocolStatus.Code); + Assert.Equal(MxCommandKind.Unregister, unregisterReply.Kind); + + // Allow a short settle window for any in-flight OnDataChange to drain, then + // assert no further events arrived for the un-advised (serverHandle, itemHandle). + // MXAccess parity: after UnAdvise the provider must stop publishing OnDataChange + // for this item — a regression that left a stale subscription alive would surface + // as additional events after this delay. + await Task.Delay(TimeSpan.FromMilliseconds(500)).ConfigureAwait(false); + + int dataChangeCountAfterTeardown = CountMatchingEvents( + eventWriter, + e => e.Family == MxEventFamily.OnDataChange + && e.ServerHandle == serverHandle + && e.ItemHandle == itemHandle); + output.WriteLine( + $"DataChange count before UnAdvise={dataChangeCountBeforeUnadvise} after teardown+settle={dataChangeCountAfterTeardown}"); + Assert.Equal(dataChangeCountBeforeUnadvise, dataChangeCountAfterTeardown); + + // A RemoveItem against the just-freed item handle must not silently succeed — + // the worker has to relay MXAccess's invalid-handle response. Closing the + // session is enough for parity, but we sanity-check that re-using the freed + // pair does not accidentally appear Ok. 
+ MxCommandReply secondRemoveItemReply = await fixture.Service.Invoke( + CreateRemoveItemRequest(sessionId, serverHandle, itemHandle), + new TestServerCallContext()).ConfigureAwait(false); + LogReply("RemoveItem(stale)", secondRemoveItemReply); + Assert.NotEqual(ProtocolStatusCode.Ok, secondRemoveItemReply.ProtocolStatus.Code); + } + finally + { + streamCancellation.Cancel(); + await ShutDownAsync(fixture, processFactory, sessionId, streamTask).ConfigureAwait(false); + } + } + + /// + /// Verifies the MXAccess WriteSecured path: AuthenticateUser resolves a + /// user id, then WriteSecured against the advised item completes its round-trip + /// to the worker and back. CLAUDE.md singles out WriteSecured ordering as a + /// parity surface the gateway must not "fix" — the test asserts the reply kind and + /// protocol status, not a fabricated outcome. + /// + [LiveMxAccessFact] + public async Task GatewaySession_WithLiveWorker_WriteSecured_AuthenticatedRoundTripParity() + { + string workerExecutablePath = IntegrationTestEnvironment.ResolveLiveMxAccessWorkerExecutablePath(); + Assert.True( + File.Exists(workerExecutablePath), + $"Live MXAccess worker executable was not found at {workerExecutablePath}. Build the worker or set {IntegrationTestEnvironment.LiveMxAccessWorkerExecutableVariableName}."); + + TestWorkerProcessFactory processFactory = new(output); + await using GatewayServiceFixture fixture = new(workerExecutablePath, processFactory, output); + // Stream events so a regression that emitted an OperationComplete or + // OnWriteComplete with wrong handles would still be observable via the test + // output (we don't assert a specific event here — the docs note successful + // writes raise only OnWriteComplete, but WriteSecured against an unprotected + // item commonly fails with 0x80004021 in this provider, which raises no event). + using RecordingServerStreamWriter eventWriter = new(); + + string? sessionId = null; + Task? 
streamTask = null; + using CancellationTokenSource streamCancellation = new(); + + try + { + OpenSessionReply openReply = await fixture.Service.OpenSession( + new OpenSessionRequest + { + ClientSessionName = "live-mxaccess-write-secured", + ClientCorrelationId = "live-open-write-secured", + CommandTimeout = Duration.FromTimeSpan(CommandTimeout), + }, + new TestServerCallContext()).ConfigureAwait(false); + + sessionId = openReply.SessionId; + Assert.Equal(ProtocolStatusCode.Ok, openReply.ProtocolStatus.Code); + + streamTask = fixture.Service.StreamEvents( + new StreamEventsRequest { SessionId = sessionId }, + eventWriter, + new TestServerCallContext(streamCancellation.Token)); + + MxCommandReply registerReply = await fixture.Service.Invoke( + CreateRegisterRequest(sessionId), + new TestServerCallContext()).ConfigureAwait(false); + LogReply("Register", registerReply); + Assert.Equal(ProtocolStatusCode.Ok, registerReply.ProtocolStatus.Code); + int serverHandle = registerReply.Register.ServerHandle; + + MxCommandReply addItemReply = await fixture.Service.Invoke( + CreateAddItemRequest(sessionId, serverHandle), + new TestServerCallContext()).ConfigureAwait(false); + LogReply("AddItem", addItemReply); + Assert.Equal(ProtocolStatusCode.Ok, addItemReply.ProtocolStatus.Code); + int itemHandle = addItemReply.AddItem.ItemHandle; + + MxCommandReply adviseReply = await fixture.Service.Invoke( + CreateAdviseRequest(sessionId, serverHandle, itemHandle), + new TestServerCallContext()).ConfigureAwait(false); + LogReply("Advise", adviseReply); + Assert.Equal(ProtocolStatusCode.Ok, adviseReply.ProtocolStatus.Code); + + // AuthenticateUser resolves an ArchestrA user id for the WriteSecured call. + // Credentials are env-overridable so the test honors the gateway's "do not + // log secrets" rule and works against either MXAccess's own user store or + // the LmxOpcUa-baseline GLAuth-bridged ArchestrA identity (admin/admin123). 
+ (string verifyUser, string verifyPassword) = ResolveLiveMxAccessSecuredCredentials(); + MxCommandReply authReply = await fixture.Service.Invoke( + CreateAuthenticateUserRequest(sessionId, serverHandle, verifyUser, verifyPassword), + new TestServerCallContext()).ConfigureAwait(false); + output.WriteLine( + $"AuthenticateUser status={authReply.ProtocolStatus.Code} hresult={authReply.Hresult} user_id={authReply.AuthenticateUser?.UserId}"); + + // AuthenticateUser is allowed to fail (the underlying provider may reject + // the credential pair); we use the returned user id if non-zero and fall + // back to 0 ("operator only" / no verifier) so the parity assertion holds. + int currentUserId = authReply.ProtocolStatus.Code == ProtocolStatusCode.Ok + && authReply.AuthenticateUser is not null + && authReply.AuthenticateUser.UserId != 0 + ? authReply.AuthenticateUser.UserId + : 0; + + MxCommandReply writeSecuredReply = await fixture.Service.Invoke( + CreateWriteSecuredRequest( + sessionId, + serverHandle, + itemHandle, + currentUserId, + verifierUserId: 0), + new TestServerCallContext()).ConfigureAwait(false); + LogReply("WriteSecured", writeSecuredReply); + + // Parity: the command itself completed its round-trip — the reply kind is + // WriteSecured and the gateway protocol status is set. The MXAccess outcome + // (Ok for an unprotected provider, MxaccessFailure with hresult 0x80004021 + // when the item is not WriteSecured-eligible) lives in protocol_status + + // hresult, never as a transport fault. The diagnostic message must never + // contain the credential. + Assert.Equal(MxCommandKind.WriteSecured, writeSecuredReply.Kind); + Assert.True( + writeSecuredReply.ProtocolStatus.Code is ProtocolStatusCode.Ok + or ProtocolStatusCode.MxaccessFailure, + $"Unexpected WriteSecured protocol status {writeSecuredReply.ProtocolStatus.Code}."); + Assert.DoesNotContain(verifyPassword, writeSecuredReply.DiagnosticMessage ?? 
string.Empty, StringComparison.Ordinal); + } + finally + { + streamCancellation.Cancel(); + await ShutDownAsync(fixture, processFactory, sessionId, streamTask).ConfigureAwait(false); + } + } + + /// + /// Verifies that killing the worker process marks the session + /// with a clean fault classification — the gateway + /// must observe the abnormal exit, transition the session, and surface a non-empty + /// fault description rather than hanging or crashing. + /// + [LiveMxAccessFact] + public async Task GatewaySession_WithLiveWorker_AbnormalWorkerExit_MarksSessionFaulted() + { + string workerExecutablePath = IntegrationTestEnvironment.ResolveLiveMxAccessWorkerExecutablePath(); + Assert.True( + File.Exists(workerExecutablePath), + $"Live MXAccess worker executable was not found at {workerExecutablePath}. Build the worker or set {IntegrationTestEnvironment.LiveMxAccessWorkerExecutableVariableName}."); + + TestWorkerProcessFactory processFactory = new(output); + await using GatewayServiceFixture fixture = new(workerExecutablePath, processFactory, output); + using RecordingServerStreamWriter eventWriter = new(); + + string? sessionId = null; + Task? streamTask = null; + using CancellationTokenSource streamCancellation = new(); + + try + { + OpenSessionReply openReply = await fixture.Service.OpenSession( + new OpenSessionRequest + { + ClientSessionName = "live-mxaccess-abnormal-exit", + ClientCorrelationId = "live-open-abnormal", + CommandTimeout = Duration.FromTimeSpan(CommandTimeout), + }, + new TestServerCallContext()).ConfigureAwait(false); + + sessionId = openReply.SessionId; + Assert.Equal(ProtocolStatusCode.Ok, openReply.ProtocolStatus.Code); + + streamTask = fixture.Service.StreamEvents( + new StreamEventsRequest { SessionId = sessionId }, + eventWriter, + new TestServerCallContext(streamCancellation.Token)); + + // Kill the worker process directly. 
WorkerClient's read loop hits an + // end-of-stream on the named pipe and routes through SetFaulted; the + // session manager then marks the session Faulted. We avoid CloseSession + // so the transition is driven by the abnormal exit, not a graceful path. + processFactory.KillAllAndDetach(); + + DateTimeOffset waitDeadline = DateTimeOffset.UtcNow + StreamShutdownTimeout; + SessionState observedState = SessionState.Unspecified; + string? observedFault = null; + while (DateTimeOffset.UtcNow < waitDeadline) + { + if (fixture.TryGetSession(sessionId, out GatewaySession? session)) + { + observedState = session.State; + observedFault = session.FinalFault; + if (observedState == SessionState.Faulted) + { + break; + } + } + + await Task.Delay(TimeSpan.FromMilliseconds(50)).ConfigureAwait(false); + } + + output.WriteLine($"AbnormalExit observed_state={observedState} fault={observedFault}"); + Assert.Equal(SessionState.Faulted, observedState); + Assert.False(string.IsNullOrWhiteSpace(observedFault), "Faulted session must carry a non-empty fault description."); + + // The fault classification must come from a known worker-client error code so + // operators get an actionable cause string rather than an opaque exception + // trace. We accept any of the abnormal-exit classifications WorkerClient + // routes through SetFaulted on a killed worker. 
+ Assert.True( + observedFault!.Contains("disconnect", StringComparison.OrdinalIgnoreCase) + || observedFault.Contains("pipe", StringComparison.OrdinalIgnoreCase) + || observedFault.Contains("heartbeat", StringComparison.OrdinalIgnoreCase) + || observedFault.Contains("worker", StringComparison.OrdinalIgnoreCase) + || observedFault.Contains("end of stream", StringComparison.OrdinalIgnoreCase), + $"Fault description '{observedFault}' did not match a known worker-exit classification."); + } + finally + { + streamCancellation.Cancel(); + // sessionId is intentionally null here — the session is already faulted and a + // CloseSession round-trip would just log a cleanup failure. We still wait for + // the worker process exit so the next test starts with a clean state. + await ShutDownAsync(fixture, processFactory, sessionId: null, streamTask).ConfigureAwait(false); + } + } + /// /// Closes the session and drains the event stream / worker processes without letting a /// cleanup timeout mask the original failure from the test body. /// + /// + /// When , a faulted is rethrown so the + /// test fails on a silent stream-task exception (the Write parity test relies on this so + /// stream-side defects in event delivery are visible). When , all + /// cleanup exceptions are logged and swallowed so a real test-body assertion failure is not + /// masked by a shutdown timeout (the original IntegrationTests-004 fix). + /// private async Task ShutDownAsync( GatewayServiceFixture fixture, TestWorkerProcessFactory processFactory, string? sessionId, - Task? streamTask) + Task? streamTask, + bool propagateStreamFaults = false) { + Exception? streamFault = null; + try { if (!string.IsNullOrWhiteSpace(sessionId)) { await CloseSessionAsync(fixture, sessionId).ConfigureAwait(false); } - - if (streamTask is not null) - { - await streamTask.WaitAsync(StreamShutdownTimeout).ConfigureAwait(false); - } } catch (Exception ex) { - // Cleanup runs in a finally block. 
A TimeoutException (or a faulted - // StreamEvents task) here would otherwise replace any assertion - // failure raised in the try block. Log it and let the original - // failure surface. - output.WriteLine($"Cleanup error during session/stream shutdown: {ex}"); + output.WriteLine($"Cleanup error during session close: {ex}"); + } + + if (streamTask is not null) + { + try + { + await streamTask.WaitAsync(StreamShutdownTimeout).ConfigureAwait(false); + } + catch (OperationCanceledException ex) + { + // A linked CancellationToken on the streaming TestServerCallContext is the + // intended way to stop StreamEvents promptly — treat the resulting + // OperationCanceledException as a clean shutdown, not a fault. + output.WriteLine($"Event stream task cancelled during shutdown: {ex.Message}"); + } + catch (Exception ex) + { + // Cleanup runs in a finally block. By default a faulted StreamEvents task is + // logged and swallowed so a test-body assertion failure is not masked. When + // the caller opts into propagateStreamFaults (the Write parity test), we + // rethrow the fault after the worker-process wait so a silent stream-side + // defect actually fails the test. 
+ output.WriteLine($"Event stream task faulted during shutdown: {ex}"); + if (propagateStreamFaults) + { + streamFault = ex; + } + } } try @@ -287,6 +699,11 @@ public sealed class WorkerLiveMxAccessSmokeTests(ITestOutputHelper output) { output.WriteLine($"Cleanup error while waiting for worker processes to exit: {ex}"); } + + if (streamFault is not null) + { + throw streamFault; + } } private static MxCommandRequest CreateRegisterRequest(string sessionId) @@ -373,6 +790,145 @@ public sealed class WorkerLiveMxAccessSmokeTests(ITestOutputHelper output) }; } + private static MxCommandRequest CreateUnAdviseRequest( + string sessionId, + int serverHandle, + int itemHandle) + { + return new MxCommandRequest + { + SessionId = sessionId, + ClientCorrelationId = "live-unadvise", + Command = new MxCommand + { + Kind = MxCommandKind.UnAdvise, + UnAdvise = new UnAdviseCommand + { + ServerHandle = serverHandle, + ItemHandle = itemHandle, + }, + }, + }; + } + + private static MxCommandRequest CreateRemoveItemRequest( + string sessionId, + int serverHandle, + int itemHandle) + { + return new MxCommandRequest + { + SessionId = sessionId, + ClientCorrelationId = "live-remove-item", + Command = new MxCommand + { + Kind = MxCommandKind.RemoveItem, + RemoveItem = new RemoveItemCommand + { + ServerHandle = serverHandle, + ItemHandle = itemHandle, + }, + }, + }; + } + + private static MxCommandRequest CreateUnregisterRequest( + string sessionId, + int serverHandle) + { + return new MxCommandRequest + { + SessionId = sessionId, + ClientCorrelationId = "live-unregister", + Command = new MxCommand + { + Kind = MxCommandKind.Unregister, + Unregister = new UnregisterCommand + { + ServerHandle = serverHandle, + }, + }, + }; + } + + private static MxCommandRequest CreateAuthenticateUserRequest( + string sessionId, + int serverHandle, + string verifyUser, + string verifyPassword) + { + return new MxCommandRequest + { + SessionId = sessionId, + ClientCorrelationId = "live-authenticate-user", + 
Command = new MxCommand + { + Kind = MxCommandKind.AuthenticateUser, + AuthenticateUser = new AuthenticateUserCommand + { + ServerHandle = serverHandle, + VerifyUser = verifyUser, + VerifyUserPassword = verifyPassword, + }, + }, + }; + } + + private static MxCommandRequest CreateWriteSecuredRequest( + string sessionId, + int serverHandle, + int itemHandle, + int currentUserId, + int verifierUserId) + { + return new MxCommandRequest + { + SessionId = sessionId, + ClientCorrelationId = "live-write-secured", + Command = new MxCommand + { + Kind = MxCommandKind.WriteSecured, + WriteSecured = new WriteSecuredCommand + { + ServerHandle = serverHandle, + ItemHandle = itemHandle, + CurrentUserId = currentUserId, + VerifierUserId = verifierUserId, + Value = new MxValue + { + DataType = MxDataType.Integer, + Int32Value = 2, + }, + }, + }, + }; + } + + private static (string VerifyUser, string VerifyPassword) ResolveLiveMxAccessSecuredCredentials() + { + string verifyUser = Environment.GetEnvironmentVariable("MXGATEWAY_LIVE_MXACCESS_WRITE_SECURED_USER") + ?? "admin"; + string verifyPassword = Environment.GetEnvironmentVariable("MXGATEWAY_LIVE_MXACCESS_WRITE_SECURED_PASSWORD") + ?? "admin123"; + return (verifyUser, verifyPassword); + } + + private static int CountMatchingEvents( + RecordingServerStreamWriter writer, + Func predicate) + { + int count = 0; + foreach (MxEvent message in writer.Messages) + { + if (predicate(message)) + { + count++; + } + } + + return count; + } + private async Task CloseSessionAsync( GatewayServiceFixture fixture, string sessionId) @@ -472,6 +1028,17 @@ public sealed class WorkerLiveMxAccessSmokeTests(ITestOutputHelper output) /// public MxAccessGatewayService Service { get; } + /// + /// Looks up a session by id directly against the in-process registry. 
The abnormal + /// worker-exit test needs to observe the session's State / FinalFault as the gateway + /// transitions it to Faulted, which the public gRPC API only exposes indirectly via + /// CloseSession's reply (and not before a graceful close completes). + /// + public bool TryGetSession(string sessionId, out GatewaySession session) + { + return _registry.TryGet(sessionId, out session); + } + /// /// Disposes the fixture resources and closes all sessions. /// @@ -516,7 +1083,7 @@ public sealed class WorkerLiveMxAccessSmokeTests(ITestOutputHelper output) /// /// Gathers messages written to a server stream for test inspection. /// - private sealed class RecordingServerStreamWriter : IServerStreamWriter + private sealed class RecordingServerStreamWriter : IServerStreamWriter, IDisposable { private readonly object syncRoot = new(); private readonly List messages = []; @@ -606,6 +1173,16 @@ public sealed class WorkerLiveMxAccessSmokeTests(ITestOutputHelper output) } } } + + /// + /// Releases the wait handle backing messageArrived. The writer owns an + /// field so it must be disposable itself; the leak + /// is otherwise bounded only by how many opt-in live tests run. + /// + public void Dispose() + { + messageArrived.Dispose(); + } } /// @@ -734,6 +1311,32 @@ public sealed class WorkerLiveMxAccessSmokeTests(ITestOutputHelper output) } } + /// + /// Kills every recorded worker process tree so the abnormal-exit test can simulate a + /// crashed worker without going through the graceful shutdown handshake. Failures to + /// kill an already-dead process are tolerated. 
+ /// + public void KillAllAndDetach() + { + foreach (TestWorkerProcess process in processes) + { + if (process.HasExited) + { + continue; + } + + try + { + process.Kill(entireProcessTree: true); + output.WriteLine($"WorkerProcess killed pid={process.Id} (abnormal-exit simulation)"); + } + catch (InvalidOperationException ex) + { + output.WriteLine($"WorkerProcess kill skipped pid={process.Id}: {ex.Message}"); + } + } + } + private void WriteWorkerOutput( string streamName, string? line) diff --git a/src/MxGateway.Server/Dashboard/Components/Pages/ApiKeysPage.razor b/src/MxGateway.Server/Dashboard/Components/Pages/ApiKeysPage.razor index 9a058a3..545139c 100644 --- a/src/MxGateway.Server/Dashboard/Components/Pages/ApiKeysPage.razor +++ b/src/MxGateway.Server/Dashboard/Components/Pages/ApiKeysPage.razor @@ -1,5 +1,4 @@ @page "/apikeys" -@page "/dashboard/apikeys" @inherits DashboardPageBase @inject AuthenticationStateProvider AuthenticationStateProvider @inject IDashboardApiKeyManagementService ApiKeyManagementService diff --git a/src/MxGateway.Server/Dashboard/Components/Pages/DashboardHome.razor b/src/MxGateway.Server/Dashboard/Components/Pages/DashboardHome.razor index 0197588..919594d 100644 --- a/src/MxGateway.Server/Dashboard/Components/Pages/DashboardHome.razor +++ b/src/MxGateway.Server/Dashboard/Components/Pages/DashboardHome.razor @@ -1,5 +1,4 @@ @page "/" -@page "/dashboard/" @inherits DashboardPageBase MXAccess Gateway Dashboard diff --git a/src/MxGateway.Server/Dashboard/Components/Pages/EventsPage.razor b/src/MxGateway.Server/Dashboard/Components/Pages/EventsPage.razor index e5401ec..0e0b687 100644 --- a/src/MxGateway.Server/Dashboard/Components/Pages/EventsPage.razor +++ b/src/MxGateway.Server/Dashboard/Components/Pages/EventsPage.razor @@ -1,5 +1,4 @@ @page "/events" -@page "/dashboard/events" @inherits DashboardPageBase Dashboard Events diff --git a/src/MxGateway.Server/Dashboard/Components/Pages/GalaxyPage.razor 
b/src/MxGateway.Server/Dashboard/Components/Pages/GalaxyPage.razor index 3c5e4a2..48e97e2 100644 --- a/src/MxGateway.Server/Dashboard/Components/Pages/GalaxyPage.razor +++ b/src/MxGateway.Server/Dashboard/Components/Pages/GalaxyPage.razor @@ -1,5 +1,4 @@ @page "/galaxy" -@page "/dashboard/galaxy" @inherits DashboardPageBase Dashboard Galaxy diff --git a/src/MxGateway.Server/Dashboard/Components/Pages/SessionDetailsPage.razor b/src/MxGateway.Server/Dashboard/Components/Pages/SessionDetailsPage.razor index 569ad72..0eddbbc 100644 --- a/src/MxGateway.Server/Dashboard/Components/Pages/SessionDetailsPage.razor +++ b/src/MxGateway.Server/Dashboard/Components/Pages/SessionDetailsPage.razor @@ -1,5 +1,4 @@ @page "/sessions/{SessionId}" -@page "/dashboard/sessions/{SessionId}" @inherits DashboardPageBase Dashboard Session diff --git a/src/MxGateway.Server/Dashboard/Components/Pages/SessionsPage.razor b/src/MxGateway.Server/Dashboard/Components/Pages/SessionsPage.razor index 5c60e20..e8df4dc 100644 --- a/src/MxGateway.Server/Dashboard/Components/Pages/SessionsPage.razor +++ b/src/MxGateway.Server/Dashboard/Components/Pages/SessionsPage.razor @@ -1,5 +1,4 @@ @page "/sessions" -@page "/dashboard/sessions" @inherits DashboardPageBase Dashboard Sessions diff --git a/src/MxGateway.Server/Dashboard/Components/Pages/SettingsPage.razor b/src/MxGateway.Server/Dashboard/Components/Pages/SettingsPage.razor index ebf9930..120aac0 100644 --- a/src/MxGateway.Server/Dashboard/Components/Pages/SettingsPage.razor +++ b/src/MxGateway.Server/Dashboard/Components/Pages/SettingsPage.razor @@ -1,5 +1,4 @@ @page "/settings" -@page "/dashboard/settings" @inherits DashboardPageBase Dashboard Settings diff --git a/src/MxGateway.Server/Dashboard/Components/Pages/WorkersPage.razor b/src/MxGateway.Server/Dashboard/Components/Pages/WorkersPage.razor index 80f8182..4811852 100644 --- a/src/MxGateway.Server/Dashboard/Components/Pages/WorkersPage.razor +++ 
b/src/MxGateway.Server/Dashboard/Components/Pages/WorkersPage.razor @@ -1,5 +1,4 @@ @page "/workers" -@page "/dashboard/workers" @inherits DashboardPageBase Dashboard Workers diff --git a/src/MxGateway.Server/Galaxy/GalaxyGlobMatcher.cs b/src/MxGateway.Server/Galaxy/GalaxyGlobMatcher.cs index dc4f0b3..a5c6f5c 100644 --- a/src/MxGateway.Server/Galaxy/GalaxyGlobMatcher.cs +++ b/src/MxGateway.Server/Galaxy/GalaxyGlobMatcher.cs @@ -7,13 +7,42 @@ namespace MxGateway.Server.Galaxy; public static class GalaxyGlobMatcher { /// - /// Compiled-regex cache keyed by glob pattern. IsMatch is called once per - /// object per DiscoverHierarchy/WatchDeployEvents evaluation, so the - /// same handful of glob patterns are translated repeatedly; caching avoids - /// rebuilding and recompiling the regex on every call. + /// Maximum number of compiled-regex entries retained in . + /// The cache is keyed by glob pattern and patterns flow in from two sources: + /// admin-controlled API-key constraints (naturally bounded) and the + /// client-supplied DiscoverHierarchyRequest.TagNameGlob (unbounded — a + /// client can iterate through generated names and create millions of distinct + /// globs over the process lifetime). Capping the cache bounds memory while + /// keeping the hot working set hit-cached. + /// + internal const int RegexCacheCapacity = 256; + + /// + /// Bounded compiled-regex cache keyed by glob pattern. IsMatch is called + /// once per object per DiscoverHierarchy/WatchDeployEvents + /// evaluation, so the same handful of glob patterns are translated + /// repeatedly; caching avoids rebuilding and recompiling the regex on every + /// call. Beyond entries the oldest insertion + /// is evicted so a client cannot grow the cache without bound by submitting + /// unique patterns. Eviction is approximate (FIFO over insertion order, not + /// true LRU) because we only need the bound, not exact recency tracking. 
/// </summary>
     private static readonly ConcurrentDictionary<string, Regex> RegexCache = new(StringComparer.Ordinal);
 
+    /// <summary>
+    /// Insertion-order queue used to evict the oldest cache entry when the cache
+    /// exceeds <see cref="RegexCacheCapacity"/>. A separate queue keeps the
+    /// <see cref="RegexCache"/> reads lock-free; the lock below only guards the
+    /// eviction path.
+    /// </summary>
+    private static readonly ConcurrentQueue<string> InsertionOrder = new();
+    private static readonly object EvictionLock = new();
+
+    /// <summary>
+    /// Current cache size, exposed for tests asserting the cap is honoured.
+    /// </summary>
+    internal static int CurrentCacheSize => RegexCache.Count;
+
     public static bool IsMatch(string value, string glob)
     {
         if (string.IsNullOrWhiteSpace(glob))
@@ -26,10 +55,42 @@ public static class GalaxyGlobMatcher
 
     private static Regex GetOrCreateRegex(string glob)
     {
-        return RegexCache.GetOrAdd(glob, static pattern => new Regex(
-            BuildRegex(pattern),
+        if (RegexCache.TryGetValue(glob, out Regex? existing))
+        {
+            return existing;
+        }
+
+        Regex compiled = new(
+            BuildRegex(glob),
             RegexOptions.CultureInvariant | RegexOptions.IgnoreCase | RegexOptions.Compiled,
-            TimeSpan.FromMilliseconds(100)));
+            TimeSpan.FromMilliseconds(100));
+
+        if (RegexCache.TryAdd(glob, compiled))
+        {
+            InsertionOrder.Enqueue(glob);
+            EvictIfOverCapacity();
+            return compiled;
+        }
+
+        // Lost the race: reuse the winner's regex, but if it was already evicted fall back to ours (the indexer would throw KeyNotFoundException).
+        return RegexCache.TryGetValue(glob, out Regex? winner) ? winner : compiled;
+    }
+
+    private static void EvictIfOverCapacity()
+    {
+        if (RegexCache.Count <= RegexCacheCapacity)
+        {
+            return;
+        }
+
+        // Serialize eviction so two threads do not race past the cap together.
+        lock (EvictionLock)
+        {
+            while (RegexCache.Count > RegexCacheCapacity && InsertionOrder.TryDequeue(out string?
oldest)) + { + RegexCache.TryRemove(oldest, out _); + } + } } private static string BuildRegex(string glob) diff --git a/src/MxGateway.Server/Galaxy/GalaxyHierarchyCache.cs b/src/MxGateway.Server/Galaxy/GalaxyHierarchyCache.cs index fe2db2b..d2bfb3f 100644 --- a/src/MxGateway.Server/Galaxy/GalaxyHierarchyCache.cs +++ b/src/MxGateway.Server/Galaxy/GalaxyHierarchyCache.cs @@ -17,7 +17,7 @@ public sealed class GalaxyHierarchyCache : IGalaxyHierarchyCache { private static readonly TimeSpan StaleThreshold = TimeSpan.FromMinutes(5); - private readonly GalaxyRepository _repository; + private readonly IGalaxyRepository _repository; private readonly IGalaxyDeployNotifier _notifier; private readonly TimeProvider _timeProvider; private readonly ILogger? _logger; @@ -31,7 +31,7 @@ public sealed class GalaxyHierarchyCache : IGalaxyHierarchyCache /// Provider for current time; defaults to system time. /// Optional logger for diagnostic output. public GalaxyHierarchyCache( - GalaxyRepository repository, + IGalaxyRepository repository, IGalaxyDeployNotifier notifier, TimeProvider? timeProvider = null, ILogger? logger = null) diff --git a/src/MxGateway.Server/Galaxy/GalaxyRepository.cs b/src/MxGateway.Server/Galaxy/GalaxyRepository.cs index 2e27499..39bc1f6 100644 --- a/src/MxGateway.Server/Galaxy/GalaxyRepository.cs +++ b/src/MxGateway.Server/Galaxy/GalaxyRepository.cs @@ -8,7 +8,7 @@ namespace MxGateway.Server.Galaxy; /// consumers — the same SQL drives the OPC UA server's address space and this gateway's /// gRPC browse surface. /// -public sealed class GalaxyRepository(GalaxyRepositoryOptions options) +public sealed class GalaxyRepository(GalaxyRepositoryOptions options) : IGalaxyRepository { /// Tests the connection to the Galaxy Repository database. /// Token to cancel the asynchronous operation. 
diff --git a/src/MxGateway.Server/Galaxy/GalaxyRepositoryOptions.cs b/src/MxGateway.Server/Galaxy/GalaxyRepositoryOptions.cs index 921b1d7..7007c0c 100644 --- a/src/MxGateway.Server/Galaxy/GalaxyRepositoryOptions.cs +++ b/src/MxGateway.Server/Galaxy/GalaxyRepositoryOptions.cs @@ -8,10 +8,17 @@ public sealed class GalaxyRepositoryOptions { public const string SectionName = "MxGateway:Galaxy"; - /// The SQL Server connection string for the Galaxy Repository database. - public string ConnectionString { get; init; } = + /// + /// Default SQL Server connection string for the Galaxy Repository database. + /// Single source of truth shared with the integration-test fallback so the + /// production default and the live-test default cannot drift. + /// + public const string DefaultConnectionString = "Server=localhost;Database=ZB;Integrated Security=True;TrustServerCertificate=True;Encrypt=False;"; + /// The SQL Server connection string for the Galaxy Repository database. + public string ConnectionString { get; init; } = DefaultConnectionString; + /// The timeout in seconds for SQL commands executed against the Galaxy Repository. 
public int CommandTimeoutSeconds { get; init; } = 60; diff --git a/src/MxGateway.Server/Galaxy/GalaxyRepositoryServiceCollectionExtensions.cs b/src/MxGateway.Server/Galaxy/GalaxyRepositoryServiceCollectionExtensions.cs index 35cf727..070470d 100644 --- a/src/MxGateway.Server/Galaxy/GalaxyRepositoryServiceCollectionExtensions.cs +++ b/src/MxGateway.Server/Galaxy/GalaxyRepositoryServiceCollectionExtensions.cs @@ -16,6 +16,7 @@ public static class GalaxyRepositoryServiceCollectionExtensions services.AddSingleton(sp => new GalaxyRepository(sp.GetRequiredService>().Value)); + services.AddSingleton(sp => sp.GetRequiredService()); services.AddSingleton(); services.AddSingleton(); diff --git a/src/MxGateway.Server/Galaxy/IGalaxyRepository.cs b/src/MxGateway.Server/Galaxy/IGalaxyRepository.cs new file mode 100644 index 0000000..1d6c426 --- /dev/null +++ b/src/MxGateway.Server/Galaxy/IGalaxyRepository.cs @@ -0,0 +1,30 @@ +namespace MxGateway.Server.Galaxy; + +/// +/// Abstraction over consumed by +/// . Exists so the cache can be unit-tested +/// against an in-memory fake that throws a +/// from (the unavailable-backend code +/// path) without standing up a real Microsoft.Data.SqlClient +/// SqlConnection against a bogus host/port. The production gateway +/// wires the concrete ; the SQL surface itself +/// stays covered by MxGateway.IntegrationTests.Galaxy.GalaxyRepositoryLiveTests. +/// +public interface IGalaxyRepository +{ + /// Tests the connection to the Galaxy Repository database. + /// Token to cancel the asynchronous operation. + Task TestConnectionAsync(CancellationToken ct = default); + + /// Retrieves the last deployment time from the Galaxy Repository. + /// Token to cancel the asynchronous operation. + Task GetLastDeployTimeAsync(CancellationToken ct = default); + + /// Retrieves the complete hierarchy of Galaxy objects from the repository. + /// Token to cancel the asynchronous operation. 
+ Task> GetHierarchyAsync(CancellationToken ct = default); + + /// Retrieves all attributes for Galaxy objects from the repository. + /// Token to cancel the asynchronous operation. + Task> GetAttributesAsync(CancellationToken ct = default); +} diff --git a/src/MxGateway.Server/Security/Authorization/GatewayGrpcScopeResolver.cs b/src/MxGateway.Server/Security/Authorization/GatewayGrpcScopeResolver.cs index de24d55..bd9428a 100644 --- a/src/MxGateway.Server/Security/Authorization/GatewayGrpcScopeResolver.cs +++ b/src/MxGateway.Server/Security/Authorization/GatewayGrpcScopeResolver.cs @@ -18,6 +18,8 @@ public sealed class GatewayGrpcScopeResolver CloseSessionRequest => GatewayScopes.SessionClose, StreamEventsRequest => GatewayScopes.EventsRead, MxCommandRequest commandRequest => ResolveCommandScope(commandRequest.Command?.Kind ?? MxCommandKind.Unspecified), + AcknowledgeAlarmRequest => GatewayScopes.InvokeWrite, + QueryActiveAlarmsRequest => GatewayScopes.EventsRead, TestConnectionRequest or GetLastDeployTimeRequest or DiscoverHierarchyRequest or diff --git a/src/MxGateway.Server/Sessions/GatewaySession.cs b/src/MxGateway.Server/Sessions/GatewaySession.cs index 0ee8bfb..9c3b00e 100644 --- a/src/MxGateway.Server/Sessions/GatewaySession.cs +++ b/src/MxGateway.Server/Sessions/GatewaySession.cs @@ -263,6 +263,17 @@ public sealed class GatewaySession /// Transitions the session to a new state with constraints for terminal states. /// /// Next session state to transition to. + /// + /// is terminal. + /// only allows a transition to . + /// only allows a transition to + /// (or ) — once + /// has started, no late lifecycle callback can revive the + /// session by walking it back to or any earlier + /// state. Both close-related writes (Closing and Closed) go through + /// _syncRoot just like every other state read/write, closing the split-lock + /// race called out in Server-015. 
+ /// public void TransitionTo(SessionState nextState) { lock (_syncRoot) @@ -277,6 +288,13 @@ public sealed class GatewaySession return; } + if (_state is SessionState.Closing + && nextState is not SessionState.Closed + && nextState is not SessionState.Faulted) + { + return; + } + _state = nextState; } } @@ -717,6 +735,14 @@ public sealed class GatewaySession /// /// Reason for closing the session. /// Token to cancel the asynchronous operation. + /// + /// Concurrent close attempts are serialized by _closeLock so only one close + /// runs at a time, but every read/write of _state still passes through + /// _syncRoot (via and ) — + /// the close path therefore obeys the same lock discipline as + /// / and a concurrent + /// TransitionTo(Ready) cannot race past a Closing write. + /// public async Task CloseAsync( string reason, CancellationToken cancellationToken) @@ -726,15 +752,11 @@ public sealed class GatewaySession { try { - if (_state is SessionState.Closed) + if (!TryBeginClose(out bool alreadyClosing)) { return new SessionCloseResult(SessionId, SessionState.Closed, AlreadyClosed: true); } - bool alreadyClosing = _closeStarted; - _closeStarted = true; - _state = SessionState.Closing; - if (_workerClient is not null) { try @@ -758,7 +780,7 @@ public sealed class GatewaySession } } - _state = SessionState.Closed; + MarkClosed(); return new SessionCloseResult(SessionId, SessionState.Closed, alreadyClosing); } catch (Exception exception) when (exception is not SessionCloseStartedException) @@ -774,6 +796,40 @@ public sealed class GatewaySession } } + // Returns false when the session is already Closed (caller short-circuits with + // AlreadyClosed: true). Otherwise sets _state = Closing under _syncRoot so a + // concurrent TransitionTo(Ready) — which only refuses to overwrite Closed/Faulted + // — cannot flip the session back to Ready after close started. 
The `alreadyClosing` + // out parameter mirrors the previous `_closeStarted` check so the surface contract + // (a second concurrent close returns AlreadyClosed: alreadyClosing) is preserved. + private bool TryBeginClose(out bool alreadyClosing) + { + lock (_syncRoot) + { + if (_state is SessionState.Closed) + { + alreadyClosing = _closeStarted; + return false; + } + + alreadyClosing = _closeStarted; + _closeStarted = true; + _state = SessionState.Closing; + return true; + } + } + + // Final terminal transition; under _syncRoot to keep _state writes single-lock. + // Closed is unconditionally terminal — TransitionTo refuses to overwrite it — + // so we don't need to re-check the precondition here. + private void MarkClosed() + { + lock (_syncRoot) + { + _state = SessionState.Closed; + } + } + /// /// Terminates the worker process immediately. /// @@ -787,9 +843,47 @@ public sealed class GatewaySession /// /// Disposes the session and frees associated resources. /// + /// + /// Acquires _closeLock once before disposing so an in-flight + /// finishes before the semaphore is released and + /// reclaimed. Without this gate, the in-flight close's _closeLock.Release() + /// would race the dispose and raise . + /// The acquire is best-effort: a non-cancellable wait that swallows + /// so double-dispose still completes. + /// public async ValueTask DisposeAsync() { - _closeLock.Dispose(); + try + { + // CancellationToken.None — disposal must not be cancelled, and a misbehaving + // close path that never releases would have to be torn down by the worker + // shutdown timeout long before we reach here. + await _closeLock.WaitAsync(CancellationToken.None).ConfigureAwait(false); + try + { + // Hand the slot back so the semaphore's internal counter is consistent + // for any contemporaneous waiter, then dispose. 
Once disposed, every + // subsequent WaitAsync / Release will throw — but DisposeAsync's contract + // is "no concurrent close after this point", which SessionManager honors. + _closeLock.Release(); + } + catch (ObjectDisposedException) + { + } + } + catch (ObjectDisposedException) + { + // Already disposed (e.g. double-dispose); nothing to gate on. + } + + try + { + _closeLock.Dispose(); + } + catch (ObjectDisposedException) + { + } + if (_workerClient is not null) { await _workerClient.DisposeAsync().ConfigureAwait(false); diff --git a/src/MxGateway.Server/Sessions/IAlarmRpcDispatcher.cs b/src/MxGateway.Server/Sessions/IAlarmRpcDispatcher.cs index 328b774..41170f3 100644 --- a/src/MxGateway.Server/Sessions/IAlarmRpcDispatcher.cs +++ b/src/MxGateway.Server/Sessions/IAlarmRpcDispatcher.cs @@ -6,18 +6,18 @@ using MxGateway.Contracts.Proto; namespace MxGateway.Server.Sessions; /// -/// PR A.6 / A.7 — gateway-side dispatcher for the alarm-RPC surface. -/// Bridges the public AcknowledgeAlarm + QueryActiveAlarms -/// gRPC handlers to the worker process that hosts -/// IMxAccessAlarmConsumer. +/// Gateway-side dispatcher seam for the alarm-RPC surface. Bridges the +/// public AcknowledgeAlarm + QueryActiveAlarms gRPC handlers +/// to the worker process that hosts IMxAccessAlarmConsumer. /// /// /// -/// Production implementations live in WorkerAlarmRpcDispatcher -/// (this PR ships a not-yet-wired default that returns a clear -/// worker-pending diagnostic) and route through the existing -/// worker-pipe IPC. Tests inject a fake to exercise the gateway -/// handler shape without spinning up a worker process. +/// DI binds the production by +/// default; it routes calls through the existing worker-pipe IPC. +/// NotWiredAlarmRpcDispatcher is only the null fallback used +/// when no dispatcher is registered (DI omission / standalone tests). +/// Other tests inject a fake to exercise the gateway handler shape +/// without spinning up a worker process. 
/// /// /// The dispatcher is session-scoped: every call resolves the diff --git a/src/MxGateway.Server/Sessions/WorkerAlarmRpcDispatcher.cs b/src/MxGateway.Server/Sessions/WorkerAlarmRpcDispatcher.cs index e071bb9..40aa622 100644 --- a/src/MxGateway.Server/Sessions/WorkerAlarmRpcDispatcher.cs +++ b/src/MxGateway.Server/Sessions/WorkerAlarmRpcDispatcher.cs @@ -188,7 +188,14 @@ public sealed class WorkerAlarmRpcDispatcher( if (!sessionRegistry.TryGet(request.SessionId, out GatewaySession session)) { - yield break; + // Server-019: align with AcknowledgeAsync's missing-session handling and + // surface a SessionNotFound error rather than yielding an empty stream. + // QueryActiveAlarms is server-streaming, so a thrown exception is the + // cleaner fit than an in-band ProtocolStatus; MxAccessGatewayService maps + // SessionManagerException(SessionNotFound) to gRPC NotFound. + throw new SessionManagerException( + SessionManagerErrorCode.SessionNotFound, + $"Session '{request.SessionId}' not found."); } WorkerCommand workerCommand = new WorkerCommand diff --git a/src/MxGateway.Tests/Contracts/GatewayContractInfoTests.cs b/src/MxGateway.Tests/Contracts/GatewayContractInfoTests.cs index df98abe..f8b2118 100644 --- a/src/MxGateway.Tests/Contracts/GatewayContractInfoTests.cs +++ b/src/MxGateway.Tests/Contracts/GatewayContractInfoTests.cs @@ -11,7 +11,18 @@ public sealed class GatewayContractInfoTests Assert.Equal("mxaccess-worker", GatewayContractInfo.DefaultBackendName); } - /// Verifies that the gateway protocol version is bumped to three after the alarm proto extension. + /// + /// Pins the current + /// constant at 3. 
Both the alarm proto extension (`AcknowledgeAlarm` / + /// `QueryActiveAlarms` RPCs, the `OnAlarmTransitionEvent` body, and the + /// alarm command/reply payload cases) and the bulk write/read command + /// family extension (`WriteBulk` / `Write2Bulk` / `WriteSecuredBulk` / + /// `WriteSecured2Bulk` / `ReadBulk` plus their `BulkWriteReply` and + /// `BulkReadReply` payloads) shipped under version 3 — both were strictly + /// additive contract changes, so neither required a further bump. A + /// future breaking contract change should bump this constant and update + /// this test in lock-step. + /// [Fact] public void GatewayProtocolVersion_IsVersionThree() { diff --git a/src/MxGateway.Tests/Contracts/ProtobufContractRoundTripTests.cs b/src/MxGateway.Tests/Contracts/ProtobufContractRoundTripTests.cs index 94928c8..0ad0edd 100644 --- a/src/MxGateway.Tests/Contracts/ProtobufContractRoundTripTests.cs +++ b/src/MxGateway.Tests/Contracts/ProtobufContractRoundTripTests.cs @@ -770,4 +770,463 @@ public sealed class ProtobufContractRoundTripTests Assert.Equal(lastDeploy, GetLastDeployTimeReply.Parser.ParseFrom(lastDeploy.ToByteArray())); Assert.Equal(testConnection, TestConnectionReply.Parser.ParseFrom(testConnection.ToByteArray())); } + + /// + /// Verifies that a carrying multiple + /// items round-trips, including the + /// per-entry value and user_id fields. 
+ /// + [Fact] + public void WriteBulkCommand_RoundTripsEntries() + { + var original = new MxCommand + { + Kind = MxCommandKind.WriteBulk, + WriteBulk = new WriteBulkCommand + { + ServerHandle = 10, + Entries = + { + new WriteBulkEntry + { + ItemHandle = 21, + UserId = 7, + Value = new MxValue + { + DataType = MxDataType.Float, + FloatValue = 1.25f, + VariantType = "VT_R4", + }, + }, + new WriteBulkEntry + { + ItemHandle = 22, + Value = new MxValue + { + DataType = MxDataType.Integer, + Int32Value = 42, + VariantType = "VT_I4", + }, + }, + }, + }, + }; + + var parsed = MxCommand.Parser.ParseFrom(original.ToByteArray()); + + Assert.Equal(original, parsed); + Assert.Equal(MxCommand.PayloadOneofCase.WriteBulk, parsed.PayloadCase); + Assert.Equal(2, parsed.WriteBulk.Entries.Count); + Assert.Equal(7, parsed.WriteBulk.Entries[0].UserId); + } + + /// + /// Verifies that a round-trips, including + /// the per-entry timestamp_value field that distinguishes Write2 + /// from Write. + /// + [Fact] + public void Write2BulkCommand_RoundTripsEntriesWithTimestampValue() + { + var timestamp = Timestamp.FromDateTime(new DateTime(2026, 5, 19, 11, 0, 0, DateTimeKind.Utc)); + var original = new MxCommand + { + Kind = MxCommandKind.Write2Bulk, + Write2Bulk = new Write2BulkCommand + { + ServerHandle = 10, + Entries = + { + new Write2BulkEntry + { + ItemHandle = 21, + UserId = 7, + Value = new MxValue + { + DataType = MxDataType.Float, + FloatValue = 99.9f, + VariantType = "VT_R4", + }, + TimestampValue = new MxValue + { + DataType = MxDataType.Time, + TimestampValue = timestamp, + VariantType = "VT_DATE", + }, + }, + }, + }, + }; + + var parsed = MxCommand.Parser.ParseFrom(original.ToByteArray()); + + Assert.Equal(original, parsed); + Assert.Equal(MxCommand.PayloadOneofCase.Write2Bulk, parsed.PayloadCase); + Assert.NotNull(parsed.Write2Bulk.Entries[0].TimestampValue); + } + + /// + /// Verifies that a round-trips, + /// pinning the credential-bearing entry shape + /// (current_user_id, 
verifier_user_id, value). + /// See Contracts-011 for the credential-sensitivity comment on + /// WriteSecuredBulkEntry.value. + /// + [Fact] + public void WriteSecuredBulkCommand_RoundTripsCredentialBearingEntries() + { + var original = new MxCommand + { + Kind = MxCommandKind.WriteSecuredBulk, + WriteSecuredBulk = new WriteSecuredBulkCommand + { + ServerHandle = 10, + Entries = + { + new WriteSecuredBulkEntry + { + ItemHandle = 21, + CurrentUserId = 100, + VerifierUserId = 200, + Value = new MxValue + { + DataType = MxDataType.Float, + FloatValue = 75.0f, + VariantType = "VT_R4", + }, + }, + }, + }, + }; + + var parsed = MxCommand.Parser.ParseFrom(original.ToByteArray()); + + Assert.Equal(original, parsed); + Assert.Equal(MxCommand.PayloadOneofCase.WriteSecuredBulk, parsed.PayloadCase); + Assert.Equal(100, parsed.WriteSecuredBulk.Entries[0].CurrentUserId); + Assert.Equal(200, parsed.WriteSecuredBulk.Entries[0].VerifierUserId); + } + + /// + /// Verifies that a round-trips, + /// including both the credential-sensitive value and the + /// timestamp_value per entry. 
+ /// + [Fact] + public void WriteSecured2BulkCommand_RoundTripsCredentialBearingEntriesWithTimestamp() + { + var timestamp = Timestamp.FromDateTime(new DateTime(2026, 5, 19, 11, 30, 0, DateTimeKind.Utc)); + var original = new MxCommand + { + Kind = MxCommandKind.WriteSecured2Bulk, + WriteSecured2Bulk = new WriteSecured2BulkCommand + { + ServerHandle = 10, + Entries = + { + new WriteSecured2BulkEntry + { + ItemHandle = 21, + CurrentUserId = 100, + VerifierUserId = 200, + Value = new MxValue + { + DataType = MxDataType.Float, + FloatValue = 50.0f, + VariantType = "VT_R4", + }, + TimestampValue = new MxValue + { + DataType = MxDataType.Time, + TimestampValue = timestamp, + VariantType = "VT_DATE", + }, + }, + }, + }, + }; + + var parsed = MxCommand.Parser.ParseFrom(original.ToByteArray()); + + Assert.Equal(original, parsed); + Assert.Equal(MxCommand.PayloadOneofCase.WriteSecured2Bulk, parsed.PayloadCase); + Assert.NotNull(parsed.WriteSecured2Bulk.Entries[0].TimestampValue); + } + + /// + /// Verifies that a round-trips, including + /// the tag_addresses list and the timeout_ms field that + /// distinguishes the cached-vs-snapshot lifecycle. + /// + [Fact] + public void ReadBulkCommand_RoundTripsTagAddressesAndTimeout() + { + var original = new MxCommand + { + Kind = MxCommandKind.ReadBulk, + ReadBulk = new ReadBulkCommand + { + ServerHandle = 10, + TagAddresses = { "Provider!Tank01.Level", "Provider!Tank02.Level" }, + TimeoutMs = 2500, + }, + }; + + var parsed = MxCommand.Parser.ParseFrom(original.ToByteArray()); + + Assert.Equal(original, parsed); + Assert.Equal(MxCommand.PayloadOneofCase.ReadBulk, parsed.PayloadCase); + Assert.Equal(2, parsed.ReadBulk.TagAddresses.Count); + Assert.Equal(2500u, parsed.ReadBulk.TimeoutMs); + } + + /// + /// Verifies that a carrying mixed-outcome + /// entries round-trips and that the + /// proto3 optional int32 hresult presence flag survives both the + /// "hresult set" and "hresult unset" cases. 
+ /// + [Fact] + public void BulkWriteReply_RoundTripsResultsWithOptionalHresultPresence() + { + var original = new BulkWriteReply + { + Results = + { + new BulkWriteResult + { + ServerHandle = 10, + ItemHandle = 21, + WasSuccessful = true, + Hresult = 0, + Statuses = + { + new MxStatusProxy + { + Success = 1, + Category = MxStatusCategory.Ok, + DetectedBy = MxStatusSource.RespondingLmx, + }, + }, + }, + new BulkWriteResult + { + ServerHandle = 10, + ItemHandle = 22, + WasSuccessful = false, + Hresult = unchecked((int)0x80004005), + ErrorMessage = "item not advised", + }, + new BulkWriteResult + { + ServerHandle = 10, + ItemHandle = 23, + WasSuccessful = false, + // Hresult deliberately UNSET — exercises the proto3 + // `optional int32` HasField() = false arm. + ErrorMessage = "tag rejected by allowlist", + }, + }, + }; + + var parsed = BulkWriteReply.Parser.ParseFrom(original.ToByteArray()); + + Assert.Equal(original, parsed); + Assert.Equal(3, parsed.Results.Count); + Assert.True(parsed.Results[0].HasHresult); + Assert.True(parsed.Results[1].HasHresult); + Assert.False(parsed.Results[2].HasHresult); + Assert.True(parsed.Results[0].WasSuccessful); + Assert.False(parsed.Results[2].WasSuccessful); + Assert.Single(parsed.Results[0].Statuses); + } + + /// + /// Verifies that a carrying both cached + /// (was_cached = true) and uncached (was_cached = false) + /// entries round-trips. Pins the + /// deliberate absence of hresult on + /// — failures are carried as was_successful = false plus + /// error_message only. 
+ /// + [Fact] + public void BulkReadReply_RoundTripsCachedAndSnapshotResults() + { + var sourceTimestamp = Timestamp.FromDateTime(new DateTime(2026, 5, 19, 12, 0, 0, DateTimeKind.Utc)); + var original = new BulkReadReply + { + Results = + { + new BulkReadResult + { + ServerHandle = 10, + TagAddress = "Provider!Tank01.Level", + ItemHandle = 21, + WasSuccessful = true, + WasCached = true, + Value = new MxValue + { + DataType = MxDataType.Float, + FloatValue = 42.5f, + VariantType = "VT_R4", + }, + Quality = 192, + SourceTimestamp = sourceTimestamp, + Statuses = + { + new MxStatusProxy + { + Success = 1, + Category = MxStatusCategory.Ok, + DetectedBy = MxStatusSource.RespondingNmx, + }, + }, + }, + new BulkReadResult + { + ServerHandle = 10, + TagAddress = "Provider!Tank02.Level", + ItemHandle = 22, + WasSuccessful = true, + WasCached = false, + Value = new MxValue + { + DataType = MxDataType.Integer, + Int32Value = 0, + VariantType = "VT_I4", + }, + SourceTimestamp = sourceTimestamp, + }, + new BulkReadResult + { + ServerHandle = 10, + TagAddress = "Provider!Bad.Tag", + WasSuccessful = false, + WasCached = false, + ErrorMessage = "snapshot timed out before first OnDataChange", + }, + }, + }; + + var parsed = BulkReadReply.Parser.ParseFrom(original.ToByteArray()); + + Assert.Equal(original, parsed); + Assert.Equal(3, parsed.Results.Count); + Assert.True(parsed.Results[0].WasCached); + Assert.False(parsed.Results[1].WasCached); + Assert.False(parsed.Results[2].WasSuccessful); + Assert.Equal("snapshot timed out before first OnDataChange", parsed.Results[2].ErrorMessage); + // BulkReadResult has no `hresult` field — pin that contract. + Assert.DoesNotContain( + BulkReadResult.Descriptor.Fields.InDeclarationOrder(), + field => field.Name == "hresult"); + } + + /// + /// Verifies that an with each of the new + /// bulk write/read payload oneof cases round-trips and that + /// resolves to the + /// expected value. 
Pins every new oneof case added by the bulk + /// write/read extension. + /// + [Theory] + [InlineData(MxCommandKind.WriteBulk, MxCommandReply.PayloadOneofCase.WriteBulk)] + [InlineData(MxCommandKind.Write2Bulk, MxCommandReply.PayloadOneofCase.Write2Bulk)] + [InlineData(MxCommandKind.WriteSecuredBulk, MxCommandReply.PayloadOneofCase.WriteSecuredBulk)] + [InlineData(MxCommandKind.WriteSecured2Bulk, MxCommandReply.PayloadOneofCase.WriteSecured2Bulk)] + public void MxCommandReply_RoundTripsBulkWritePayloadCases( + MxCommandKind kind, + MxCommandReply.PayloadOneofCase expectedPayloadCase) + { + var reply = new BulkWriteReply + { + Results = + { + new BulkWriteResult + { + ServerHandle = 5, + ItemHandle = 7, + WasSuccessful = true, + Hresult = 0, + }, + }, + }; + var original = new MxCommandReply + { + SessionId = "session-1", + CorrelationId = "gateway-correlation-bulk-write", + Kind = kind, + ProtocolStatus = new ProtocolStatus { Code = ProtocolStatusCode.Ok }, + Hresult = 0, + }; + switch (expectedPayloadCase) + { + case MxCommandReply.PayloadOneofCase.WriteBulk: + original.WriteBulk = reply; + break; + case MxCommandReply.PayloadOneofCase.Write2Bulk: + original.Write2Bulk = reply; + break; + case MxCommandReply.PayloadOneofCase.WriteSecuredBulk: + original.WriteSecuredBulk = reply; + break; + case MxCommandReply.PayloadOneofCase.WriteSecured2Bulk: + original.WriteSecured2Bulk = reply; + break; + default: + throw new System.ArgumentOutOfRangeException(nameof(expectedPayloadCase)); + } + + var parsed = MxCommandReply.Parser.ParseFrom(original.ToByteArray()); + + Assert.Equal(original, parsed); + Assert.Equal(expectedPayloadCase, parsed.PayloadCase); + Assert.Equal(kind, parsed.Kind); + } + + /// + /// Verifies that an with kind + /// and a populated + /// payload round-trips and resolves to + /// . 
+ /// + [Fact] + public void MxCommandReply_RoundTripsReadBulkPayload() + { + var original = new MxCommandReply + { + SessionId = "session-1", + CorrelationId = "gateway-correlation-read-bulk", + Kind = MxCommandKind.ReadBulk, + ProtocolStatus = new ProtocolStatus { Code = ProtocolStatusCode.Ok }, + Hresult = 0, + ReadBulk = new BulkReadReply + { + Results = + { + new BulkReadResult + { + ServerHandle = 5, + TagAddress = "Provider!Tank01.Level", + ItemHandle = 7, + WasSuccessful = true, + WasCached = true, + Value = new MxValue + { + DataType = MxDataType.Float, + FloatValue = 12.5f, + VariantType = "VT_R4", + }, + }, + }, + }, + }; + + var parsed = MxCommandReply.Parser.ParseFrom(original.ToByteArray()); + + Assert.Equal(original, parsed); + Assert.Equal(MxCommandReply.PayloadOneofCase.ReadBulk, parsed.PayloadCase); + Assert.Single(parsed.ReadBulk.Results); + Assert.True(parsed.ReadBulk.Results[0].WasCached); + } } diff --git a/src/MxGateway.Tests/Galaxy/GalaxyFilterInputSafetyTests.cs b/src/MxGateway.Tests/Galaxy/GalaxyFilterInputSafetyTests.cs index 769b1eb..29c534a 100644 --- a/src/MxGateway.Tests/Galaxy/GalaxyFilterInputSafetyTests.cs +++ b/src/MxGateway.Tests/Galaxy/GalaxyFilterInputSafetyTests.cs @@ -110,6 +110,29 @@ public sealed class GalaxyFilterInputSafetyTests } } + /// + /// Regression guard for finding Server-018: 's + /// internal compiled-regex cache must stay bounded so a client cannot grow it + /// without limit by submitting unique TagNameGlob values over the + /// process lifetime. Feeding the matcher far more distinct globs than the cap + /// must leave CurrentCacheSize at or below RegexCacheCapacity. + /// + [Fact] + public void GlobMatcher_WithManyDistinctPatterns_CacheStaysBounded() + { + // Submit well past the cap from a single thread to exercise the eviction path + // deterministically. The cap is internal; assert on it directly so the test + // tracks the source of truth. 
+ int submissions = GalaxyGlobMatcher.RegexCacheCapacity * 4; + for (int i = 0; i < submissions; i++) + { + string uniqueGlob = $"client_supplied_{i}_*"; + GalaxyGlobMatcher.IsMatch($"client_supplied_{i}_thing", uniqueGlob); + } + + Assert.InRange(GalaxyGlobMatcher.CurrentCacheSize, 0, GalaxyGlobMatcher.RegexCacheCapacity); + } + /// /// Verifies a pathological glob does not cause catastrophic regex backtracking — /// escapes every literal character and applies a diff --git a/src/MxGateway.Tests/Galaxy/GalaxyHierarchyCacheTests.cs b/src/MxGateway.Tests/Galaxy/GalaxyHierarchyCacheTests.cs index ccd5ef8..dabc0b0 100644 --- a/src/MxGateway.Tests/Galaxy/GalaxyHierarchyCacheTests.cs +++ b/src/MxGateway.Tests/Galaxy/GalaxyHierarchyCacheTests.cs @@ -12,7 +12,8 @@ public sealed class GalaxyHierarchyCacheTests public void Current_BeforeAnyRefresh_ReturnsEmpty() { GalaxyDeployNotifier notifier = new(); - GalaxyHierarchyCache cache = CreateCache(notifier, new ManualTimeProvider()); + ThrowingGalaxyRepository repository = new(new InvalidOperationException("not invoked")); + GalaxyHierarchyCache cache = new(repository, notifier, new ManualTimeProvider()); GalaxyHierarchyCacheEntry entry = cache.Current; @@ -23,21 +24,28 @@ public sealed class GalaxyHierarchyCacheTests } /// - /// Verifies cache marks unavailable and does not publish when SQL is unreachable. + /// Verifies cache marks unavailable and does not publish when the repository + /// surface throws — the production trigger for this code path is a SQL + /// connection failure, but it is fully covered by the cache's exception + /// branch and does not require a real TCP probe from a unit test. 
/// [Fact] - public async Task RefreshAsync_WhenSqlIsUnreachable_MarksUnavailableAndDoesNotPublish() + public async Task RefreshAsync_WhenRepositoryThrows_MarksUnavailableAndDoesNotPublish() { GalaxyDeployNotifier notifier = new(); - ManualTimeProvider clock = new(DateTimeOffset.Parse("2026-04-28T12:00:00Z")); - GalaxyHierarchyCache cache = CreateCache(notifier, clock); + ManualTimeProvider clock = new(DateTimeOffset.Parse("2026-04-28T12:00:00Z", System.Globalization.CultureInfo.InvariantCulture)); + ThrowingGalaxyRepository repository = new(new InvalidOperationException("Galaxy repository unreachable")); + GalaxyHierarchyCache cache = new(repository, notifier, clock); await cache.RefreshAsync(CancellationToken.None); Assert.Equal(GalaxyCacheStatus.Unavailable, cache.Current.Status); - Assert.False(string.IsNullOrWhiteSpace(cache.Current.LastError)); + Assert.Equal("Galaxy repository unreachable", cache.Current.LastError); Assert.Null(notifier.Latest); Assert.True(cache.WaitForFirstLoadAsync(CancellationToken.None).IsCompletedSuccessfully); + Assert.Equal(1, repository.GetLastDeployTimeCount); + Assert.Equal(0, repository.GetHierarchyCount); + Assert.Equal(0, repository.GetAttributesCount); } /// @@ -112,15 +120,40 @@ public sealed class GalaxyHierarchyCacheTests Assert.Same(root, index.ObjectViewsById[1].Object); } - private static GalaxyHierarchyCache CreateCache(GalaxyDeployNotifier notifier, TimeProvider clock) + private sealed class ThrowingGalaxyRepository(Exception toThrow) : IGalaxyRepository { - GalaxyRepositoryOptions options = new() + /// Gets the number of times GetLastDeployTimeAsync was called. + public int GetLastDeployTimeCount { get; private set; } + + /// Gets the number of times GetHierarchyAsync was called. + public int GetHierarchyCount { get; private set; } + + /// Gets the number of times GetAttributesAsync was called. 
+ public int GetAttributesCount { get; private set; } + + /// + public Task TestConnectionAsync(CancellationToken ct = default) => Task.FromResult(false); + + /// + public Task GetLastDeployTimeAsync(CancellationToken ct = default) { - ConnectionString = "Server=127.0.0.1,65500;Database=ZB;Connection Timeout=1;Encrypt=False;", - CommandTimeoutSeconds = 1, - }; - MxGateway.Server.Galaxy.GalaxyRepository repository = new(options); - return new GalaxyHierarchyCache(repository, notifier, clock); + GetLastDeployTimeCount++; + throw toThrow; + } + + /// + public Task> GetHierarchyAsync(CancellationToken ct = default) + { + GetHierarchyCount++; + throw toThrow; + } + + /// + public Task> GetAttributesAsync(CancellationToken ct = default) + { + GetAttributesCount++; + throw toThrow; + } } private sealed class ManualTimeProvider(DateTimeOffset start = default) : TimeProvider diff --git a/src/MxGateway.Tests/Gateway/Dashboard/DashboardCookieOptionsTests.cs b/src/MxGateway.Tests/Gateway/Dashboard/DashboardCookieOptionsTests.cs index df78898..a6ab9df 100644 --- a/src/MxGateway.Tests/Gateway/Dashboard/DashboardCookieOptionsTests.cs +++ b/src/MxGateway.Tests/Gateway/Dashboard/DashboardCookieOptionsTests.cs @@ -12,9 +12,9 @@ public sealed class DashboardCookieOptionsTests { /// Verifies that the application configures secure dashboard authentication cookies. 
[Fact] - public void Build_ConfiguresSecureDashboardCookie() + public async Task Build_ConfiguresSecureDashboardCookie() { - WebApplication app = GatewayApplication.Build([]); + await using WebApplication app = GatewayApplication.Build([]); IOptionsMonitor optionsMonitor = app.Services .GetRequiredService>(); diff --git a/src/MxGateway.Tests/Gateway/Dashboard/DashboardSnapshotServiceTests.cs b/src/MxGateway.Tests/Gateway/Dashboard/DashboardSnapshotServiceTests.cs index c5b1617..5972101 100644 --- a/src/MxGateway.Tests/Gateway/Dashboard/DashboardSnapshotServiceTests.cs +++ b/src/MxGateway.Tests/Gateway/Dashboard/DashboardSnapshotServiceTests.cs @@ -1,3 +1,4 @@ +using System.Globalization; using Microsoft.Extensions.Options; using MxGateway.Contracts.Proto; using MxGateway.Server.Configuration; @@ -42,19 +43,19 @@ public sealed class DashboardSnapshotServiceTests GatewaySession activeSession = CreateSession( "session-active", "client-one", - DateTimeOffset.Parse("2026-04-26T10:00:00Z")); + DateTimeOffset.Parse("2026-04-26T10:00:00Z", CultureInfo.InvariantCulture)); activeSession.AttachWorkerClient(new FakeWorkerClient("session-active", 1201, WorkerClientState.Ready)); activeSession.MarkReady(); GatewaySession faultedSession = CreateSession( "session-faulted", "client-two", - DateTimeOffset.Parse("2026-04-26T10:01:00Z")); + DateTimeOffset.Parse("2026-04-26T10:01:00Z", CultureInfo.InvariantCulture)); faultedSession.AttachWorkerClient(new FakeWorkerClient("session-faulted", 1202, WorkerClientState.Faulted)); faultedSession.MarkFaulted("worker pipe disconnected"); GatewaySession closedSession = CreateSession( "session-closed", "client-three", - DateTimeOffset.Parse("2026-04-26T09:59:00Z")); + DateTimeOffset.Parse("2026-04-26T09:59:00Z", CultureInfo.InvariantCulture)); closedSession.AttachWorkerClient(new FakeWorkerClient("session-closed", 1203, WorkerClientState.Closed)); closedSession.TransitionTo(SessionState.Closed); registry.TryAdd(activeSession); @@ -102,7 +103,7 @@ 
public sealed class DashboardSnapshotServiceTests GatewaySession session = CreateSession( "session-redacted", "Bearer mxgw_admin_super-secret", - DateTimeOffset.Parse("2026-04-26T10:00:00Z"), + DateTimeOffset.Parse("2026-04-26T10:00:00Z", CultureInfo.InvariantCulture), clientSessionName: "password=hunter2", clientCorrelationId: "token=abc123"); session.MarkFaulted("secret=credential-value"); @@ -131,7 +132,7 @@ public sealed class DashboardSnapshotServiceTests GatewaySession session = CreateSession( "session-active", "client-one", - DateTimeOffset.Parse("2026-04-26T10:00:00Z")); + DateTimeOffset.Parse("2026-04-26T10:00:00Z", CultureInfo.InvariantCulture)); FakeWorkerClient workerClient = new("session-active", 1201, WorkerClientState.Ready); session.AttachWorkerClient(workerClient); session.MarkReady(); @@ -160,11 +161,11 @@ public sealed class DashboardSnapshotServiceTests GatewaySession olderSession = CreateSession( "session-older", "client-one", - DateTimeOffset.Parse("2026-04-26T10:00:00Z")); + DateTimeOffset.Parse("2026-04-26T10:00:00Z", CultureInfo.InvariantCulture)); GatewaySession newerSession = CreateSession( "session-newer", "client-two", - DateTimeOffset.Parse("2026-04-26T10:01:00Z")); + DateTimeOffset.Parse("2026-04-26T10:01:00Z", CultureInfo.InvariantCulture)); olderSession.MarkFaulted("older fault"); newerSession.MarkFaulted("newer fault"); registry.TryAdd(olderSession); @@ -199,14 +200,14 @@ public sealed class DashboardSnapshotServiceTests { Status = GalaxyCacheStatus.Healthy, Sequence = 7, - LastQueriedAt = DateTimeOffset.Parse("2026-04-28T11:30:00Z"), - LastSuccessAt = DateTimeOffset.Parse("2026-04-28T11:30:00Z"), - LastDeployTime = DateTimeOffset.Parse("2026-04-28T09:00:00Z"), + LastQueriedAt = DateTimeOffset.Parse("2026-04-28T11:30:00Z", CultureInfo.InvariantCulture), + LastSuccessAt = DateTimeOffset.Parse("2026-04-28T11:30:00Z", CultureInfo.InvariantCulture), + LastDeployTime = DateTimeOffset.Parse("2026-04-28T09:00:00Z", 
CultureInfo.InvariantCulture), DashboardSummary = new DashboardGalaxySummary( DashboardGalaxyStatus.Healthy, - LastQueriedAt: DateTimeOffset.Parse("2026-04-28T11:30:00Z"), - LastSuccessAt: DateTimeOffset.Parse("2026-04-28T11:30:00Z"), - LastDeployTime: DateTimeOffset.Parse("2026-04-28T09:00:00Z"), + LastQueriedAt: DateTimeOffset.Parse("2026-04-28T11:30:00Z", CultureInfo.InvariantCulture), + LastSuccessAt: DateTimeOffset.Parse("2026-04-28T11:30:00Z", CultureInfo.InvariantCulture), + LastDeployTime: DateTimeOffset.Parse("2026-04-28T09:00:00Z", CultureInfo.InvariantCulture), LastError: null, ObjectCount: 3, AreaCount: 1, @@ -281,7 +282,7 @@ public sealed class DashboardSnapshotServiceTests { BrowseSubtrees = ["Area1/*"], }, - CreatedUtc: DateTimeOffset.Parse("2026-04-28T12:00:00Z"), + CreatedUtc: DateTimeOffset.Parse("2026-04-28T12:00:00Z", CultureInfo.InvariantCulture), LastUsedUtc: null, RevokedUtc: null)); DashboardSnapshotService service = CreateService( @@ -314,7 +315,7 @@ public sealed class DashboardSnapshotServiceTests DisplayName: "Operator", Scopes: new HashSet([GatewayScopes.MetadataRead], StringComparer.Ordinal), Constraints: ApiKeyConstraints.Empty, - CreatedUtc: DateTimeOffset.Parse("2026-04-28T12:00:00Z"), + CreatedUtc: DateTimeOffset.Parse("2026-04-28T12:00:00Z", CultureInfo.InvariantCulture), LastUsedUtc: null, RevokedUtc: null)); DashboardSnapshotService service = CreateService( @@ -520,7 +521,7 @@ public sealed class DashboardSnapshotServiceTests /// /// Gets the timestamp of the last heartbeat. /// - public DateTimeOffset LastHeartbeatAt { get; } = DateTimeOffset.Parse("2026-04-26T10:02:00Z"); + public DateTimeOffset LastHeartbeatAt { get; } = DateTimeOffset.Parse("2026-04-26T10:02:00Z", CultureInfo.InvariantCulture); /// /// Gets the count of start invocations. 
diff --git a/src/MxGateway.Tests/Gateway/GatewayApplicationTests.cs b/src/MxGateway.Tests/Gateway/GatewayApplicationTests.cs index d1601ce..f1b6dfe 100644 --- a/src/MxGateway.Tests/Gateway/GatewayApplicationTests.cs +++ b/src/MxGateway.Tests/Gateway/GatewayApplicationTests.cs @@ -13,9 +13,9 @@ public sealed class GatewayApplicationTests { /// Verifies that Build maps the live health check endpoint. [Fact] - public void Build_MapsLiveHealthEndpoint() + public async Task Build_MapsLiveHealthEndpoint() { - WebApplication app = GatewayApplication.Build([]); + await using WebApplication app = GatewayApplication.Build([]); RouteEndpoint endpoint = Assert.Single( ((IEndpointRouteBuilder)app).DataSources @@ -28,9 +28,9 @@ public sealed class GatewayApplicationTests /// Verifies that Build registers the gateway metrics service. [Fact] - public void Build_RegistersGatewayMetrics() + public async Task Build_RegistersGatewayMetrics() { - WebApplication app = GatewayApplication.Build([]); + await using WebApplication app = GatewayApplication.Build([]); GatewayMetrics metrics = app.Services.GetRequiredService(); @@ -39,9 +39,9 @@ public sealed class GatewayApplicationTests /// Verifies that Build maps dashboard and authentication endpoints when the dashboard is enabled. [Fact] - public void Build_WhenDashboardEnabled_MapsBlazorDashboardAndAuthEndpoints() + public async Task Build_WhenDashboardEnabled_MapsBlazorDashboardAndAuthEndpoints() { - WebApplication app = GatewayApplication.Build([]); + await using WebApplication app = GatewayApplication.Build([]); IReadOnlyList endpoints = GetRouteEndpoints(app); Assert.Contains(endpoints, endpoint => endpoint.RoutePattern.RawText == "/dashboard/"); @@ -57,9 +57,9 @@ public sealed class GatewayApplicationTests /// Verifies that the dashboard login, logout, and denied endpoints allow anonymous access. 
[Fact] - public void Build_WhenDashboardEnabled_AuthEndpointsAllowAnonymousAccess() + public async Task Build_WhenDashboardEnabled_AuthEndpointsAllowAnonymousAccess() { - WebApplication app = GatewayApplication.Build([]); + await using WebApplication app = GatewayApplication.Build([]); IReadOnlyList endpoints = GetRouteEndpoints(app); string[] anonymousEndpointNames = @@ -76,9 +76,9 @@ public sealed class GatewayApplicationTests /// Verifies that dashboard Razor component routes require the dashboard authorization policy. [Fact] - public void Build_WhenDashboardEnabled_ComponentRoutesRequireAuthorization() + public async Task Build_WhenDashboardEnabled_ComponentRoutesRequireAuthorization() { - WebApplication app = GatewayApplication.Build([]); + await using WebApplication app = GatewayApplication.Build([]); IReadOnlyList endpoints = GetRouteEndpoints(app); string[] componentRoutes = @@ -99,10 +99,42 @@ public sealed class GatewayApplicationTests } } + /// + /// Server-020 regression: every dashboard Razor page used to declare both + /// @page "/X" and @page "/dashboard/X", which — once + /// MapGroup("/dashboard") prepended the path base — produced both + /// /dashboard/X AND /dashboard/dashboard/X routes. The second + /// shape was almost certainly accidental and is no longer registered. The + /// check covers every dashboard page so a future page that brings back the + /// duplicated @page directive fails CI. 
+ /// [Fact] - public void Build_WhenDashboardDisabled_DoesNotMapDashboardRoutes() + public async Task Build_WhenDashboardEnabled_DoesNotRegisterDoubledDashboardPrefixRoutes() { - WebApplication app = GatewayApplication.Build(["--MxGateway:Dashboard:Enabled=false"]); + await using WebApplication app = GatewayApplication.Build([]); + IReadOnlyList endpoints = GetRouteEndpoints(app); + + string[] doubledRoutes = + [ + "/dashboard/dashboard/", + "/dashboard/dashboard/sessions", + "/dashboard/dashboard/workers", + "/dashboard/dashboard/events", + "/dashboard/dashboard/settings", + "/dashboard/dashboard/galaxy", + "/dashboard/dashboard/apikeys", + "/dashboard/dashboard/sessions/{SessionId}", + ]; + foreach (string doubled in doubledRoutes) + { + Assert.DoesNotContain(endpoints, endpoint => endpoint.RoutePattern.RawText == doubled); + } + } + + [Fact] + public async Task Build_WhenDashboardDisabled_DoesNotMapDashboardRoutes() + { + await using WebApplication app = GatewayApplication.Build(["--MxGateway:Dashboard:Enabled=false"]); IReadOnlyList endpoints = GetRouteEndpoints(app); Assert.DoesNotContain(endpoints, endpoint => diff --git a/src/MxGateway.Tests/Gateway/GatewayEndToEndFakeWorkerSmokeTests.cs b/src/MxGateway.Tests/Gateway/GatewayEndToEndFakeWorkerSmokeTests.cs index 8871764..29ae84d 100644 --- a/src/MxGateway.Tests/Gateway/GatewayEndToEndFakeWorkerSmokeTests.cs +++ b/src/MxGateway.Tests/Gateway/GatewayEndToEndFakeWorkerSmokeTests.cs @@ -85,6 +85,10 @@ public sealed class GatewayEndToEndFakeWorkerSmokeTests Assert.Equal(ProtocolStatusCode.Ok, closeReply.ProtocolStatus.Code); Assert.Equal(SessionState.Closed, closeReply.FinalState); Assert.True(launcher.Process.HasExited); + // MarkExited(0) is reached only after the scripted worker observed a WorkerShutdown + // envelope and emitted its WorkerShutdownAck — anything else (a kill, a fault) would + // have produced a non-zero exit code, so this pins the shutdown-ack handshake. 
+ Assert.Equal(0, launcher.Process.ExitCode); Assert.Equal( [MxCommandKind.Register, MxCommandKind.AddItem, MxCommandKind.Advise], launcher.CommandKinds); @@ -351,6 +355,8 @@ public sealed class GatewayEndToEndFakeWorkerSmokeTests private sealed class FakeWorkerProcess(int processId) : IWorkerProcess { + private readonly TaskCompletionSource _exited = new(TaskCreationOptions.RunContinuationsAsynchronously); + /// /// Gets the process identifier. /// @@ -367,15 +373,15 @@ public sealed class GatewayEndToEndFakeWorkerSmokeTests public int? ExitCode { get; private set; } /// - /// Waits for the process to exit asynchronously. + /// Waits for the process to exit asynchronously. Completes only when + /// or has been called, so callers that observe completion can + /// trust that exit actually happened (e.g., via the worker shutdown-ack path). /// /// Cancellation token. - /// Completed task. + /// A task that completes when the process has actually exited. public ValueTask WaitForExitAsync(CancellationToken cancellationToken) { - HasExited = true; - ExitCode ??= 0; - return ValueTask.CompletedTask; + return new ValueTask(_exited.Task.WaitAsync(cancellationToken)); } /// @@ -402,6 +408,7 @@ public sealed class GatewayEndToEndFakeWorkerSmokeTests { HasExited = true; ExitCode = exitCode; + _exited.TrySetResult(); } } diff --git a/src/MxGateway.Tests/Gateway/Grpc/MxAccessGatewayServiceConstraintTests.cs b/src/MxGateway.Tests/Gateway/Grpc/MxAccessGatewayServiceConstraintTests.cs new file mode 100644 index 0000000..8eca572 --- /dev/null +++ b/src/MxGateway.Tests/Gateway/Grpc/MxAccessGatewayServiceConstraintTests.cs @@ -0,0 +1,757 @@ +using Grpc.Core; +using Microsoft.Extensions.Logging.Abstractions; +using MxGateway.Contracts; +using MxGateway.Contracts.Proto; +using MxGateway.Server.Grpc; +using MxGateway.Server.Metrics; +using MxGateway.Server.Security.Authentication; +using MxGateway.Server.Security.Authorization; +using MxGateway.Server.Sessions; +using 
MxGateway.Server.Workers; +using MxGateway.Tests.TestSupport; + +namespace MxGateway.Tests.Gateway.Grpc; + +/// +/// Tests for Server-021. MxAccessGatewayService.ApplyConstraintsAsync and +/// the BulkConstraintPlan / ReadBulkConstraintPlan / +/// WriteBulkConstraintPlan / SubscribeBulkConstraintPlan reply-merge +/// logic were previously exercised only with an allow-all enforcer, so denial +/// filtering, the no-allowed-items short-circuit, and the index-ordered +/// denied/allowed interleave were dead code at test time. The fixtures below +/// inject a PredicateConstraintEnforcer that denies a subset of +/// tags or handles, and assert the post-merge reply contents and that the +/// session manager is (or is not) invoked. +/// +public sealed class MxAccessGatewayServiceConstraintTests +{ + private const string SessionId = "session-constraint"; + + // === SubscribeBulk family: AddItemBulk / SubscribeBulk / AdviseItemBulk === + + /// + /// AddItemBulk with a mix of allowed and denied tags must invoke the + /// worker once with only the allowed tags, then splice the denied entries + /// back into the reply at their original indices. + /// + [Fact] + public async Task Invoke_AddItemBulk_WithMixedDenials_InterleavesDeniedAndAllowedInOriginalIndexOrder() + { + PredicateConstraintEnforcer enforcer = new() + { + DenyTag = tag => tag == "Tank01.Locked" || tag == "Tank03.Secret", + }; + FakeSessionManager sessionManager = CreateSessionManagerWithSeed(); + sessionManager.InvokeReply = new WorkerCommandReply + { + Reply = new MxCommandReply + { + SessionId = SessionId, + Kind = MxCommandKind.AddItemBulk, + ProtocolStatus = MxAccessGrpcMapper.Ok(), + AddItemBulk = new BulkSubscribeReply + { + Results = + { + // Worker only sees the two allowed tags — Tank02.Open at original + // index 1 and Tank04.Public at original index 3. 
+ new SubscribeResult { ServerHandle = 7, TagAddress = "Tank02.Open", ItemHandle = 102, WasSuccessful = true }, + new SubscribeResult { ServerHandle = 7, TagAddress = "Tank04.Public", ItemHandle = 104, WasSuccessful = true }, + }, + }, + }, + }; + MxAccessGatewayService service = CreateService(sessionManager, enforcer); + + MxCommandReply reply = await service.Invoke( + CreateAddItemBulkRequest(7, ["Tank01.Locked", "Tank02.Open", "Tank03.Secret", "Tank04.Public"]), + new TestServerCallContext()); + + Assert.Equal(1, sessionManager.InvokeCount); + // Worker saw only the allowed subset, in original order, with denied entries dropped. + AddItemBulkCommand forwardedCommand = sessionManager.LastWorkerCommand!.Command.AddItemBulk; + Assert.Equal(["Tank02.Open", "Tank04.Public"], forwardedCommand.TagAddresses); + // Final reply preserves the original 4-entry index order, with denied entries + // at index 0 and 2 and worker-allowed entries at index 1 and 3. + BulkSubscribeReply merged = reply.AddItemBulk; + Assert.Equal(4, merged.Results.Count); + Assert.False(merged.Results[0].WasSuccessful); + Assert.Equal("Tank01.Locked", merged.Results[0].TagAddress); + Assert.Contains("Tank01.Locked", merged.Results[0].ErrorMessage, StringComparison.Ordinal); + Assert.True(merged.Results[1].WasSuccessful); + Assert.Equal("Tank02.Open", merged.Results[1].TagAddress); + Assert.Equal(102, merged.Results[1].ItemHandle); + Assert.False(merged.Results[2].WasSuccessful); + Assert.Equal("Tank03.Secret", merged.Results[2].TagAddress); + Assert.True(merged.Results[3].WasSuccessful); + Assert.Equal("Tank04.Public", merged.Results[3].TagAddress); + Assert.Equal(104, merged.Results[3].ItemHandle); + // Both denied tags recorded. + Assert.Equal(2, enforcer.RecordedDenials.Count); + } + + /// + /// SubscribeBulk when every tag is denied must short-circuit + /// false, return the + /// denied-only reply, and never call the session manager. 
+ /// + [Fact] + public async Task Invoke_SubscribeBulk_WhenAllTagsDenied_DoesNotCallWorkerAndReturnsDeniedReply() + { + PredicateConstraintEnforcer enforcer = new() { DenyTag = _ => true }; + FakeSessionManager sessionManager = CreateSessionManagerWithSeed(); + MxAccessGatewayService service = CreateService(sessionManager, enforcer); + + MxCommandReply reply = await service.Invoke( + CreateSubscribeBulkRequest(7, ["A", "B", "C"]), + new TestServerCallContext()); + + Assert.Equal(0, sessionManager.InvokeCount); + Assert.Equal(3, reply.SubscribeBulk.Results.Count); + Assert.All(reply.SubscribeBulk.Results, r => Assert.False(r.WasSuccessful)); + Assert.Equal(["A", "B", "C"], reply.SubscribeBulk.Results.Select(r => r.TagAddress)); + Assert.Equal(ProtocolStatusCode.Ok, reply.ProtocolStatus.Code); + } + + /// + /// AdviseItemBulk takes handle inputs (not tags) and routes through + /// FilterHandleBulkAsync against CheckReadHandleAsync. Partial + /// denial must still produce a merged-by-index BulkSubscribeReply. 
+ /// + [Fact] + public async Task Invoke_AdviseItemBulk_WithMixedHandleDenials_MergesDeniedIntoReply() + { + PredicateConstraintEnforcer enforcer = new() + { + DenyReadHandle = (_, itemHandle) => itemHandle == 502, + }; + FakeSessionManager sessionManager = CreateSessionManagerWithSeed(); + sessionManager.InvokeReply = new WorkerCommandReply + { + Reply = new MxCommandReply + { + SessionId = SessionId, + Kind = MxCommandKind.AdviseItemBulk, + ProtocolStatus = MxAccessGrpcMapper.Ok(), + AdviseItemBulk = new BulkSubscribeReply + { + Results = + { + new SubscribeResult { ServerHandle = 7, ItemHandle = 501, WasSuccessful = true }, + new SubscribeResult { ServerHandle = 7, ItemHandle = 503, WasSuccessful = true }, + }, + }, + }, + }; + MxAccessGatewayService service = CreateService(sessionManager, enforcer); + + MxCommandReply reply = await service.Invoke( + CreateAdviseItemBulkRequest(7, [501, 502, 503]), + new TestServerCallContext()); + + Assert.Equal(1, sessionManager.InvokeCount); + Assert.Equal([501, 503], sessionManager.LastWorkerCommand!.Command.AdviseItemBulk.ItemHandles); + BulkSubscribeReply merged = reply.AdviseItemBulk; + Assert.Equal(3, merged.Results.Count); + Assert.True(merged.Results[0].WasSuccessful); + Assert.Equal(501, merged.Results[0].ItemHandle); + Assert.False(merged.Results[1].WasSuccessful); + Assert.Equal(502, merged.Results[1].ItemHandle); + Assert.True(merged.Results[2].WasSuccessful); + Assert.Equal(503, merged.Results[2].ItemHandle); + } + + /// + /// SubscribeBulk with an allow-all enforcer must leave the worker reply + /// unchanged — the constraint plan is null and no merge occurs. Regression + /// guard against accidentally engaging the merge path for the common case. 
+ /// + [Fact] + public async Task Invoke_SubscribeBulk_WithAllowAllEnforcer_PassesThroughUnchanged() + { + FakeSessionManager sessionManager = CreateSessionManagerWithSeed(); + sessionManager.InvokeReply = new WorkerCommandReply + { + Reply = new MxCommandReply + { + SessionId = SessionId, + Kind = MxCommandKind.SubscribeBulk, + ProtocolStatus = MxAccessGrpcMapper.Ok(), + SubscribeBulk = new BulkSubscribeReply + { + Results = + { + new SubscribeResult { ServerHandle = 7, TagAddress = "A", ItemHandle = 1, WasSuccessful = true }, + new SubscribeResult { ServerHandle = 7, TagAddress = "B", ItemHandle = 2, WasSuccessful = true }, + }, + }, + }, + }; + MxAccessGatewayService service = CreateService(sessionManager); + + MxCommandReply reply = await service.Invoke( + CreateSubscribeBulkRequest(7, ["A", "B"]), + new TestServerCallContext()); + + Assert.Equal(1, sessionManager.InvokeCount); + Assert.Equal(["A", "B"], sessionManager.LastWorkerCommand!.Command.SubscribeBulk.TagAddresses); + // Reply identical to worker reply — no synthetic denial rows added. + Assert.Equal(2, reply.SubscribeBulk.Results.Count); + Assert.All(reply.SubscribeBulk.Results, r => Assert.True(r.WasSuccessful)); + } + + // === ReadBulk family === + + /// + /// ReadBulk with a mix of allowed and denied tags merges denied entries + /// into the BulkReadReply in original-index order, distinguishable from + /// the SubscribeBulk family because the reply slot is + /// BulkReadReply, not BulkSubscribeReply. 
+ /// + [Fact] + public async Task Invoke_ReadBulk_WithMixedDenials_MergesDeniedBulkReadResults() + { + PredicateConstraintEnforcer enforcer = new() + { + DenyTag = tag => tag == "Secret.Tag", + }; + FakeSessionManager sessionManager = CreateSessionManagerWithSeed(); + sessionManager.InvokeReply = new WorkerCommandReply + { + Reply = new MxCommandReply + { + SessionId = SessionId, + Kind = MxCommandKind.ReadBulk, + ProtocolStatus = MxAccessGrpcMapper.Ok(), + ReadBulk = new BulkReadReply + { + Results = + { + new BulkReadResult { ServerHandle = 7, TagAddress = "Public.A", WasSuccessful = true }, + new BulkReadResult { ServerHandle = 7, TagAddress = "Public.B", WasSuccessful = true }, + }, + }, + }, + }; + MxAccessGatewayService service = CreateService(sessionManager, enforcer); + + MxCommandReply reply = await service.Invoke( + CreateReadBulkRequest(7, ["Public.A", "Secret.Tag", "Public.B"]), + new TestServerCallContext()); + + Assert.Equal(1, sessionManager.InvokeCount); + Assert.Equal(["Public.A", "Public.B"], sessionManager.LastWorkerCommand!.Command.ReadBulk.TagAddresses); + BulkReadReply merged = reply.ReadBulk; + Assert.Equal(3, merged.Results.Count); + Assert.True(merged.Results[0].WasSuccessful); + Assert.False(merged.Results[1].WasSuccessful); + Assert.Equal("Secret.Tag", merged.Results[1].TagAddress); + Assert.True(merged.Results[2].WasSuccessful); + } + + /// + /// ReadBulk with all tags denied must short-circuit and produce a + /// denied-only BulkReadReply — verifying + /// 's ReadBulkConstraintPlan + /// CreateDeniedReply path. 
+ /// + [Fact] + public async Task Invoke_ReadBulk_WhenAllTagsDenied_ShortCircuitsWithDeniedOnlyReply() + { + PredicateConstraintEnforcer enforcer = new() { DenyTag = _ => true }; + FakeSessionManager sessionManager = CreateSessionManagerWithSeed(); + MxAccessGatewayService service = CreateService(sessionManager, enforcer); + + MxCommandReply reply = await service.Invoke( + CreateReadBulkRequest(7, ["X", "Y"]), + new TestServerCallContext()); + + Assert.Equal(0, sessionManager.InvokeCount); + Assert.Equal(2, reply.ReadBulk.Results.Count); + Assert.All(reply.ReadBulk.Results, r => Assert.False(r.WasSuccessful)); + Assert.Equal(MxCommandKind.ReadBulk, reply.Kind); + } + + // === WriteBulk family: WriteBulk / Write2Bulk / WriteSecuredBulk / WriteSecured2Bulk === + + /// + /// WriteBulk with one denied handle must drop that entry from the + /// forwarded command and splice a denied BulkWriteResult back in at + /// the original index. + /// + [Fact] + public async Task Invoke_WriteBulk_WithDeniedHandle_DropsEntryFromWorkerCallAndMergesDenialIntoReply() + { + PredicateConstraintEnforcer enforcer = new() + { + DenyWriteHandle = (_, itemHandle) => itemHandle == 902, + }; + FakeSessionManager sessionManager = CreateSessionManagerWithSeed(); + sessionManager.InvokeReply = new WorkerCommandReply + { + Reply = new MxCommandReply + { + SessionId = SessionId, + Kind = MxCommandKind.WriteBulk, + ProtocolStatus = MxAccessGrpcMapper.Ok(), + WriteBulk = new BulkWriteReply + { + Results = + { + new BulkWriteResult { ServerHandle = 7, ItemHandle = 901, WasSuccessful = true }, + new BulkWriteResult { ServerHandle = 7, ItemHandle = 903, WasSuccessful = true }, + }, + }, + }, + }; + MxAccessGatewayService service = CreateService(sessionManager, enforcer); + + MxCommandReply reply = await service.Invoke( + CreateWriteBulkRequest(7, [901, 902, 903]), + new TestServerCallContext()); + + Assert.Equal(1, sessionManager.InvokeCount); + // 902 dropped from forwarded entries; only 901 and 903 
reach the worker. + WriteBulkCommand forwarded = sessionManager.LastWorkerCommand!.Command.WriteBulk; + Assert.Equal([901, 903], forwarded.Entries.Select(e => e.ItemHandle)); + BulkWriteReply merged = reply.WriteBulk; + Assert.Equal(3, merged.Results.Count); + Assert.True(merged.Results[0].WasSuccessful); + Assert.Equal(901, merged.Results[0].ItemHandle); + Assert.False(merged.Results[1].WasSuccessful); + Assert.Equal(902, merged.Results[1].ItemHandle); + Assert.True(merged.Results[2].WasSuccessful); + Assert.Equal(903, merged.Results[2].ItemHandle); + } + + /// + /// WriteSecuredBulk exercises a different ReplaceWriteBulkEntries + /// switch arm than plain WriteBulk. The merge logic is shared, so a + /// full denial here is enough to prove the secured-bulk routing. + /// + [Fact] + public async Task Invoke_WriteSecuredBulk_WhenAllHandlesDenied_ShortCircuitsWithDeniedOnlyReply() + { + PredicateConstraintEnforcer enforcer = new() { DenyWriteHandle = (_, _) => true }; + FakeSessionManager sessionManager = CreateSessionManagerWithSeed(); + MxAccessGatewayService service = CreateService(sessionManager, enforcer); + + MxCommandReply reply = await service.Invoke( + CreateWriteSecuredBulkRequest(7, [10, 11]), + new TestServerCallContext()); + + Assert.Equal(0, sessionManager.InvokeCount); + Assert.Equal(MxCommandKind.WriteSecuredBulk, reply.Kind); + Assert.Equal(2, reply.WriteSecuredBulk.Results.Count); + Assert.All(reply.WriteSecuredBulk.Results, r => Assert.False(r.WasSuccessful)); + } + + // === Unary write-handle enforcement (EnforceWriteHandleAsync) === + + /// + /// Unary Write against a denied (server, item) handle must surface + /// StatusCode.PermissionDenied via EnforceWriteHandleAsync + /// and never reach the session manager. 
+ /// + [Fact] + public async Task Invoke_Write_WithDeniedHandle_ThrowsPermissionDeniedAndDoesNotCallWorker() + { + PredicateConstraintEnforcer enforcer = new() + { + DenyWriteHandle = (serverHandle, itemHandle) => serverHandle == 7 && itemHandle == 42, + }; + FakeSessionManager sessionManager = CreateSessionManagerWithSeed(); + MxAccessGatewayService service = CreateService(sessionManager, enforcer); + + RpcException exception = await Assert.ThrowsAsync( + async () => await service.Invoke( + CreateWriteRequest(serverHandle: 7, itemHandle: 42), + new TestServerCallContext())); + + Assert.Equal(StatusCode.PermissionDenied, exception.StatusCode); + Assert.Equal(0, sessionManager.InvokeCount); + Assert.Single(enforcer.RecordedDenials); + Assert.Equal("42", enforcer.RecordedDenials[0].Target); + } + + /// + /// Unary WriteSecured against a denied handle takes the same enforce path + /// and rejects identically — proving the four-arm switch in + /// ApplyConstraintsAsync (Write/Write2/WriteSecured/WriteSecured2) is + /// reachable for at least one of the secured kinds. + /// + [Fact] + public async Task Invoke_WriteSecured_WithDeniedHandle_ThrowsPermissionDenied() + { + PredicateConstraintEnforcer enforcer = new() { DenyWriteHandle = (_, _) => true }; + FakeSessionManager sessionManager = CreateSessionManagerWithSeed(); + MxAccessGatewayService service = CreateService(sessionManager, enforcer); + + RpcException exception = await Assert.ThrowsAsync( + async () => await service.Invoke( + CreateWriteSecuredRequest(serverHandle: 7, itemHandle: 42), + new TestServerCallContext())); + + Assert.Equal(StatusCode.PermissionDenied, exception.StatusCode); + Assert.Equal(0, sessionManager.InvokeCount); + } + + // === Unary read-tag enforcement (EnforceReadTagAsync via AddItem) === + + /// + /// Unary AddItem against a denied tag must surface + /// StatusCode.PermissionDenied via EnforceReadTagAsync + /// and never reach the session manager. 
+ /// + [Fact] + public async Task Invoke_AddItem_WithDeniedTag_ThrowsPermissionDeniedAndDoesNotCallWorker() + { + PredicateConstraintEnforcer enforcer = new() + { + DenyTag = tag => tag == "Secret.Tag", + }; + FakeSessionManager sessionManager = CreateSessionManagerWithSeed(); + MxAccessGatewayService service = CreateService(sessionManager, enforcer); + + RpcException exception = await Assert.ThrowsAsync( + async () => await service.Invoke( + CreateAddItemRequest(serverHandle: 7, tagAddress: "Secret.Tag"), + new TestServerCallContext())); + + Assert.Equal(StatusCode.PermissionDenied, exception.StatusCode); + Assert.Equal(0, sessionManager.InvokeCount); + Assert.Single(enforcer.RecordedDenials); + Assert.Equal("Secret.Tag", enforcer.RecordedDenials[0].Target); + } + + // === Helpers === + + private static MxAccessGatewayService CreateService( + FakeSessionManager sessionManager, + IConstraintEnforcer? constraintEnforcer = null) + { + return new MxAccessGatewayService( + sessionManager, + new GatewayRequestIdentityAccessor(), + constraintEnforcer ?? 
new AllowAllConstraintEnforcer(), + new MxAccessGrpcRequestValidator(), + new MxAccessGrpcMapper(), + new FakeEventStreamService(sessionManager), + new GatewayMetrics(), + NullLogger.Instance); + } + + private static FakeSessionManager CreateSessionManagerWithSeed() + { + FakeSessionManager sessionManager = new() { ResolveOnlySeededSessions = true }; + sessionManager.SeedSession(CreateSession(SessionId)); + return sessionManager; + } + + private static GatewaySession CreateSession(string sessionId) + { + GatewaySession session = new( + sessionId, + GatewayContractInfo.DefaultBackendName, + "pipe", + "nonce", + "Operator Key", + "operator-session", + "client-correlation", + TimeSpan.FromSeconds(7), + TimeSpan.FromSeconds(30), + TimeSpan.FromSeconds(10), + DateTimeOffset.UtcNow); + session.AttachWorkerClient(new FakeWorkerClient()); + session.MarkReady(); + return session; + } + + private static MxCommandRequest CreateAddItemBulkRequest(int serverHandle, IReadOnlyList tags) + { + AddItemBulkCommand cmd = new() { ServerHandle = serverHandle }; + cmd.TagAddresses.Add(tags); + return new MxCommandRequest + { + SessionId = SessionId, + Command = new MxCommand { Kind = MxCommandKind.AddItemBulk, AddItemBulk = cmd }, + }; + } + + private static MxCommandRequest CreateSubscribeBulkRequest(int serverHandle, IReadOnlyList tags) + { + SubscribeBulkCommand cmd = new() { ServerHandle = serverHandle }; + cmd.TagAddresses.Add(tags); + return new MxCommandRequest + { + SessionId = SessionId, + Command = new MxCommand { Kind = MxCommandKind.SubscribeBulk, SubscribeBulk = cmd }, + }; + } + + private static MxCommandRequest CreateAdviseItemBulkRequest(int serverHandle, IReadOnlyList itemHandles) + { + AdviseItemBulkCommand cmd = new() { ServerHandle = serverHandle }; + cmd.ItemHandles.Add(itemHandles); + return new MxCommandRequest + { + SessionId = SessionId, + Command = new MxCommand { Kind = MxCommandKind.AdviseItemBulk, AdviseItemBulk = cmd }, + }; + } + + private static 
MxCommandRequest CreateReadBulkRequest(int serverHandle, IReadOnlyList tags) + { + ReadBulkCommand cmd = new() { ServerHandle = serverHandle, TimeoutMs = 1000 }; + cmd.TagAddresses.Add(tags); + return new MxCommandRequest + { + SessionId = SessionId, + Command = new MxCommand { Kind = MxCommandKind.ReadBulk, ReadBulk = cmd }, + }; + } + + private static MxCommandRequest CreateWriteBulkRequest(int serverHandle, IReadOnlyList itemHandles) + { + WriteBulkCommand cmd = new() { ServerHandle = serverHandle }; + foreach (int handle in itemHandles) + { + cmd.Entries.Add(new WriteBulkEntry { ItemHandle = handle, Value = new MxValue { StringValue = "v" } }); + } + + return new MxCommandRequest + { + SessionId = SessionId, + Command = new MxCommand { Kind = MxCommandKind.WriteBulk, WriteBulk = cmd }, + }; + } + + private static MxCommandRequest CreateWriteSecuredBulkRequest(int serverHandle, IReadOnlyList itemHandles) + { + WriteSecuredBulkCommand cmd = new() { ServerHandle = serverHandle }; + foreach (int handle in itemHandles) + { + cmd.Entries.Add(new WriteSecuredBulkEntry + { + ItemHandle = handle, + CurrentUserId = 1, + VerifierUserId = 2, + Value = new MxValue { StringValue = "v" }, + }); + } + + return new MxCommandRequest + { + SessionId = SessionId, + Command = new MxCommand { Kind = MxCommandKind.WriteSecuredBulk, WriteSecuredBulk = cmd }, + }; + } + + private static MxCommandRequest CreateWriteRequest(int serverHandle, int itemHandle) + { + return new MxCommandRequest + { + SessionId = SessionId, + Command = new MxCommand + { + Kind = MxCommandKind.Write, + Write = new WriteCommand + { + ServerHandle = serverHandle, + ItemHandle = itemHandle, + Value = new MxValue { StringValue = "v" }, + }, + }, + }; + } + + private static MxCommandRequest CreateWriteSecuredRequest(int serverHandle, int itemHandle) + { + return new MxCommandRequest + { + SessionId = SessionId, + Command = new MxCommand + { + Kind = MxCommandKind.WriteSecured, + WriteSecured = new 
WriteSecuredCommand + { + ServerHandle = serverHandle, + ItemHandle = itemHandle, + CurrentUserId = 1, + VerifierUserId = 2, + Value = new MxValue { StringValue = "v" }, + }, + }, + }; + } + + private static MxCommandRequest CreateAddItemRequest(int serverHandle, string tagAddress) + { + return new MxCommandRequest + { + SessionId = SessionId, + Command = new MxCommand + { + Kind = MxCommandKind.AddItem, + AddItem = new AddItemCommand + { + ServerHandle = serverHandle, + ItemDefinition = tagAddress, + }, + }, + }; + } + + // FakeSessionManager / FakeEventStreamService / FakeWorkerClient mirror the + // implementations in MxAccessGatewayServiceTests; the duplication is intentional + // so the constraint tests are self-contained and changes to the existing fakes + // don't accidentally couple the two suites. + private sealed class FakeSessionManager : ISessionManager + { + private readonly Dictionary seededSessions = new(StringComparer.Ordinal); + + public bool ResolveOnlySeededSessions { get; init; } + + public WorkerCommand? LastWorkerCommand { get; private set; } + + public int InvokeCount { get; private set; } + + public WorkerCommandReply InvokeReply { get; set; } = new() + { + Reply = new MxCommandReply + { + SessionId = SessionId, + Kind = MxCommandKind.Ping, + ProtocolStatus = MxAccessGrpcMapper.Ok(), + }, + }; + + public List Events { get; } = []; + + public void SeedSession(GatewaySession session) => seededSessions[session.SessionId] = session; + + public Task OpenSessionAsync( + SessionOpenRequest request, + string? clientIdentity, + CancellationToken cancellationToken) => + Task.FromResult(seededSessions.Values.First()); + + public bool TryGetSession(string sessionId, out GatewaySession session) + { + if (seededSessions.TryGetValue(sessionId, out GatewaySession? 
seeded)) + { + session = seeded; + return true; + } + + if (ResolveOnlySeededSessions) + { + session = null!; + return false; + } + + session = CreateFallbackSession(sessionId); + return true; + } + + public Task InvokeAsync( + string sessionId, + WorkerCommand command, + CancellationToken cancellationToken) + { + InvokeCount++; + LastWorkerCommand = command; + return Task.FromResult(InvokeReply); + } + + public async IAsyncEnumerable ReadEventsAsync( + string sessionId, + [System.Runtime.CompilerServices.EnumeratorCancellation] CancellationToken cancellationToken) + { + foreach (WorkerEvent ev in Events) + { + cancellationToken.ThrowIfCancellationRequested(); + await Task.Yield(); + yield return ev; + } + } + + public Task CloseSessionAsync( + string sessionId, + CancellationToken cancellationToken) => + Task.FromResult(new SessionCloseResult(sessionId, SessionState.Closed, AlreadyClosed: false)); + + public Task CloseExpiredLeasesAsync( + DateTimeOffset now, + CancellationToken cancellationToken) => Task.FromResult(0); + + public Task ShutdownAsync(CancellationToken cancellationToken) => Task.CompletedTask; + + private static GatewaySession CreateFallbackSession(string sessionId) + { + GatewaySession session = new( + sessionId, + GatewayContractInfo.DefaultBackendName, + "pipe", + "nonce", + "Operator Key", + "operator-session", + "client-correlation", + TimeSpan.FromSeconds(7), + TimeSpan.FromSeconds(30), + TimeSpan.FromSeconds(10), + DateTimeOffset.UtcNow); + session.AttachWorkerClient(new FakeWorkerClient()); + session.MarkReady(); + return session; + } + } + + private sealed class FakeEventStreamService(FakeSessionManager sessionManager) : IEventStreamService + { + public async IAsyncEnumerable StreamEventsAsync( + StreamEventsRequest request, + [System.Runtime.CompilerServices.EnumeratorCancellation] CancellationToken cancellationToken) + { + foreach (WorkerEvent ev in sessionManager.Events) + { + cancellationToken.ThrowIfCancellationRequested(); + await 
Task.Yield(); + yield return ev.Event; + } + } + } + + private sealed class FakeWorkerClient : IWorkerClient + { + public string SessionId { get; } = MxAccessGatewayServiceConstraintTests.SessionId; + + public int? ProcessId { get; } = 1234; + + public WorkerClientState State { get; } = WorkerClientState.Ready; + + public DateTimeOffset LastHeartbeatAt { get; } = DateTimeOffset.UtcNow; + + public Task StartAsync(CancellationToken cancellationToken) => Task.CompletedTask; + + public Task InvokeAsync( + WorkerCommand command, + TimeSpan timeout, + CancellationToken cancellationToken) => Task.FromResult(new WorkerCommandReply()); + + public async IAsyncEnumerable ReadEventsAsync( + [System.Runtime.CompilerServices.EnumeratorCancellation] CancellationToken cancellationToken) + { + await Task.CompletedTask; + yield break; + } + + public Task ShutdownAsync(TimeSpan timeout, CancellationToken cancellationToken) => Task.CompletedTask; + + public void Kill(string reason) + { + } + + public ValueTask DisposeAsync() => ValueTask.CompletedTask; + } +} diff --git a/src/MxGateway.Tests/Gateway/Grpc/MxAccessGatewayServiceTests.cs b/src/MxGateway.Tests/Gateway/Grpc/MxAccessGatewayServiceTests.cs index 85caa76..7b71097 100644 --- a/src/MxGateway.Tests/Gateway/Grpc/MxAccessGatewayServiceTests.cs +++ b/src/MxGateway.Tests/Gateway/Grpc/MxAccessGatewayServiceTests.cs @@ -422,12 +422,13 @@ public sealed class MxAccessGatewayServiceTests private static MxAccessGatewayService CreateService( FakeSessionManager sessionManager, IGatewayRequestIdentityAccessor? identityAccessor = null, - GatewayMetrics? metrics = null) + GatewayMetrics? metrics = null, + IConstraintEnforcer? constraintEnforcer = null) { return new MxAccessGatewayService( sessionManager, identityAccessor ?? new GatewayRequestIdentityAccessor(), - new AllowAllConstraintEnforcer(), + constraintEnforcer ?? 
new AllowAllConstraintEnforcer(), new MxAccessGrpcRequestValidator(), new MxAccessGrpcMapper(), new FakeEventStreamService(sessionManager), diff --git a/src/MxGateway.Tests/Gateway/Sessions/GatewaySessionTests.cs b/src/MxGateway.Tests/Gateway/Sessions/GatewaySessionTests.cs new file mode 100644 index 0000000..de37c1c --- /dev/null +++ b/src/MxGateway.Tests/Gateway/Sessions/GatewaySessionTests.cs @@ -0,0 +1,247 @@
using System.Runtime.CompilerServices;
using MxGateway.Contracts.Proto;
using MxGateway.Server.Sessions;
using MxGateway.Server.Workers;

namespace MxGateway.Tests.Gateway.Sessions;

/// <summary>
/// Concurrency and disposal regression tests for <see cref="GatewaySession"/>.
/// Server-015 and Server-016 audited the split lock discipline between
/// <c>_syncRoot</c> (state transitions) and <c>_closeLock</c> (close serialization)
/// and the un-gated <c>DisposeAsync</c>; these tests pin the post-fix behavior.
/// </summary>
public sealed class GatewaySessionTests
{
    /// <summary>
    /// Upper bound for every await that depends on the session making progress.
    /// These tests pin race/deadlock fixes, so a regression must fail the test
    /// quickly instead of hanging the whole run.
    /// </summary>
    private static readonly TimeSpan TestTimeout = TimeSpan.FromSeconds(5);

    /// <summary>
    /// Server-015 regression. A <c>TransitionTo(Ready)</c> issued after
    /// <c>CloseAsync</c> has set <see cref="SessionState.Closing"/>
    /// must not flip the session back to <see cref="SessionState.Ready"/>. The
    /// blocking worker shutdown keeps <c>CloseAsync</c> parked between the
    /// <c>Closing</c> write and the <c>Closed</c> write, which is precisely the
    /// window the audit identified.
    /// </summary>
    [Fact]
    public async Task TransitionTo_AfterCloseStarted_DoesNotOverwriteClosing()
    {
        BlockingShutdownWorkerClient workerClient = new();
        GatewaySession session = CreateReadySession(workerClient);

        Task<SessionCloseResult> closeTask = session.CloseAsync("test-close", CancellationToken.None);
        await workerClient.WaitForShutdownStartAsync();

        try
        {
            // Close has set _state = Closing under _syncRoot and is parked inside
            // worker.ShutdownAsync. A concurrent transition (e.g. a late
            // SessionWorkerClientFactory lifecycle callback) must not revive the session.
            Assert.Equal(SessionState.Closing, session.State);
            session.TransitionTo(SessionState.Ready);
            Assert.Equal(SessionState.Closing, session.State);
        }
        finally
        {
            // Always unpark the worker shutdown, even when an assertion above fails,
            // so the in-flight close is not left suspended for the rest of the run.
            workerClient.ReleaseShutdown();
        }

        SessionCloseResult result = await closeTask.WaitAsync(TestTimeout);

        Assert.Equal(SessionState.Closed, result.FinalState);
        Assert.Equal(SessionState.Closed, session.State);

        await session.DisposeAsync();
    }

    /// <summary>
    /// Server-015 regression. Once <c>CloseAsync</c> finishes,
    /// <c>MarkFaulted</c> must not be able to move the
    /// session out of <see cref="SessionState.Closed"/> either — the close path's
    /// terminal write goes through the same <c>_syncRoot</c> the rest of the state
    /// machine uses, so the existing "Closed is terminal" invariant holds.
    /// </summary>
    [Fact]
    public async Task MarkFaulted_AfterCloseCompletes_DoesNotResurrectSession()
    {
        FakeWorkerClient workerClient = new();
        GatewaySession session = CreateReadySession(workerClient);

        await session.CloseAsync("test-close", CancellationToken.None);
        Assert.Equal(SessionState.Closed, session.State);

        session.MarkFaulted("late-fault");
        Assert.Equal(SessionState.Closed, session.State);

        await session.DisposeAsync();
    }

    /// <summary>
    /// Server-016 regression. <c>DisposeAsync</c> must wait
    /// for an in-flight <c>CloseAsync</c> before disposing
    /// its semaphore. Without the fix, the close's <c>_closeLock.Release()</c>
    /// would race the dispose and raise <see cref="ObjectDisposedException"/>.
    /// </summary>
    [Fact]
    public async Task DisposeAsync_WhileCloseInFlight_WaitsForCloseAndDoesNotThrow()
    {
        BlockingShutdownWorkerClient workerClient = new();
        GatewaySession session = CreateReadySession(workerClient);

        Task<SessionCloseResult> closeTask = session.CloseAsync("test-close", CancellationToken.None);
        await workerClient.WaitForShutdownStartAsync();

        // Start disposing while close is still parked inside worker.ShutdownAsync.
        // The ValueTask is converted to a Task once, because it is awaited only
        // after other awaits have run.
        Task disposeTask = session.DisposeAsync().AsTask();

        // Now release the worker shutdown so close can complete.
        workerClient.ReleaseShutdown();

        // Both must complete cleanly — the close's Release() must run before the
        // dispose actually tears the semaphore down. The waits are bounded so a
        // deadlock between close and dispose fails the test instead of hanging it.
        SessionCloseResult result = await closeTask.WaitAsync(TestTimeout);
        await disposeTask.WaitAsync(TestTimeout);

        Assert.Equal(SessionState.Closed, result.FinalState);
        Assert.Equal(1, workerClient.ShutdownCount);
        // Worker dispose ran exactly once even with the close/dispose interleave.
        Assert.Equal(1, workerClient.DisposeCount);
    }

    /// <summary>
    /// Double-dispose is tolerated: the second call must swallow
    /// <see cref="ObjectDisposedException"/> from the already-disposed semaphore
    /// rather than propagating it.
    /// </summary>
    [Fact]
    public async Task DisposeAsync_CalledTwice_DoesNotThrow()
    {
        FakeWorkerClient workerClient = new();
        GatewaySession session = CreateReadySession(workerClient);
        await session.CloseAsync("test-close", CancellationToken.None);

        await session.DisposeAsync();
        // No second exception — the dispose's defensive ObjectDisposedException catch
        // covers the doubled call path that SessionManager.ShutdownAsync could trigger
        // if it re-removed a session.
        await session.DisposeAsync();
    }

    /// <summary>
    /// Builds a session with fixed test identifiers, attaches
    /// <paramref name="workerClient"/> and marks the session ready.
    /// </summary>
    /// <param name="workerClient">Worker client double the session should drive.</param>
    /// <returns>The session after <c>AttachWorkerClient</c> and <c>MarkReady</c> have been called.</returns>
    private static GatewaySession CreateReadySession(IWorkerClient workerClient)
    {
        GatewaySession session = new(
            sessionId: "session-test",
            backendName: "mxaccess",
            pipeName: "mxaccess-gateway-1-session-test",
            nonce: "nonce",
            clientIdentity: "client-1",
            clientSessionName: "test-session",
            clientCorrelationId: "client-correlation-1",
            commandTimeout: TimeSpan.FromSeconds(5),
            startupTimeout: TimeSpan.FromSeconds(5),
            shutdownTimeout: TimeSpan.FromSeconds(5),
            leaseDuration: TimeSpan.FromMinutes(30),
            openedAt: DateTimeOffset.UtcNow);
        session.AttachWorkerClient(workerClient);
        session.MarkReady();
        return session;
    }

    /// <summary>
    /// Minimal worker client that parks <c>ShutdownAsync</c> until the test
    /// explicitly releases it. Used to keep <c>GatewaySession.CloseAsync</c>
    /// stuck between its <c>Closing</c> and <c>Closed</c> writes so the test can
    /// observe and act on the intermediate state.
    /// </summary>
    private sealed class BlockingShutdownWorkerClient : IWorkerClient
    {
        // Completed when ShutdownAsync has been entered.
        private readonly TaskCompletionSource _shutdownStarted = new(TaskCreationOptions.RunContinuationsAsynchronously);

        // Completed by the test to let ShutdownAsync return.
        private readonly TaskCompletionSource _shutdownReleased = new(TaskCreationOptions.RunContinuationsAsynchronously);

        public string SessionId { get; } = "session-test";

        public int? ProcessId { get; } = 1234;

        public WorkerClientState State { get; private set; } = WorkerClientState.Ready;

        public DateTimeOffset LastHeartbeatAt { get; } = DateTimeOffset.UtcNow;

        /// <summary>Number of times <see cref="ShutdownAsync"/> has been entered.</summary>
        public int ShutdownCount { get; private set; }

        /// <summary>Number of times <see cref="DisposeAsync"/> has been called.</summary>
        public int DisposeCount { get; private set; }

        /// <summary>
        /// Completes once <see cref="ShutdownAsync"/> has been entered; times out
        /// after <see cref="TestTimeout"/> if it never is.
        /// </summary>
        public Task WaitForShutdownStartAsync() => _shutdownStarted.Task.WaitAsync(TestTimeout);

        /// <summary>Lets a parked <see cref="ShutdownAsync"/> call continue. Safe to call more than once.</summary>
        public void ReleaseShutdown() => _shutdownReleased.TrySetResult();

        public Task StartAsync(CancellationToken cancellationToken) => Task.CompletedTask;

        public Task<WorkerCommandReply> InvokeAsync(
            WorkerCommand command,
            TimeSpan timeout,
            CancellationToken cancellationToken) => Task.FromResult(new WorkerCommandReply());

        public async IAsyncEnumerable<WorkerEvent> ReadEventsAsync(
            [EnumeratorCancellation] CancellationToken cancellationToken)
        {
            await Task.CompletedTask.ConfigureAwait(false);
            yield break;
        }

        public async Task ShutdownAsync(TimeSpan timeout, CancellationToken cancellationToken)
        {
            ShutdownCount++;

            // Signal the test first, then park until it calls ReleaseShutdown
            // (or the supplied token is cancelled).
            _shutdownStarted.TrySetResult();
            await _shutdownReleased.Task.WaitAsync(cancellationToken).ConfigureAwait(false);
            State = WorkerClientState.Closed;
        }

        public void Kill(string reason)
        {
            State = WorkerClientState.Faulted;
        }

        public ValueTask DisposeAsync()
        {
            DisposeCount++;
            return ValueTask.CompletedTask;
        }
    }

    /// <summary>
    /// Worker client double whose operations all complete immediately; it only
    /// counts <see cref="DisposeAsync"/> calls.
    /// </summary>
    private sealed class FakeWorkerClient : IWorkerClient
    {
        public string SessionId { get; } = "session-test";

        public int? ProcessId { get; } = 1234;

        public WorkerClientState State { get; } = WorkerClientState.Ready;

        public DateTimeOffset LastHeartbeatAt { get; } = DateTimeOffset.UtcNow;

        /// <summary>Number of times <see cref="DisposeAsync"/> has been called.</summary>
        public int DisposeCount { get; private set; }

        public Task StartAsync(CancellationToken cancellationToken) => Task.CompletedTask;

        public Task<WorkerCommandReply> InvokeAsync(
            WorkerCommand command,
            TimeSpan timeout,
            CancellationToken cancellationToken) => Task.FromResult(new WorkerCommandReply());

        public async IAsyncEnumerable<WorkerEvent> ReadEventsAsync(
            [EnumeratorCancellation] CancellationToken cancellationToken)
        {
            await Task.CompletedTask.ConfigureAwait(false);
            yield break;
        }

        public Task ShutdownAsync(TimeSpan timeout, CancellationToken cancellationToken) => Task.CompletedTask;

        public void Kill(string reason)
        {
        }

        public ValueTask DisposeAsync()
        {
            DisposeCount++;
            return ValueTask.CompletedTask;
        }
    }
}
diff --git a/src/MxGateway.Tests/Gateway/Sessions/SessionManagerBulkTests.cs b/src/MxGateway.Tests/Gateway/Sessions/SessionManagerBulkTests.cs new file mode 100644 index 0000000..ccf8b1e --- /dev/null +++ b/src/MxGateway.Tests/Gateway/Sessions/SessionManagerBulkTests.cs @@ -0,0 +1,711 @@ +using Google.Protobuf.WellKnownTypes; +using Microsoft.Extensions.Options; +using MxGateway.Contracts.Proto; +using MxGateway.Server.Configuration; +using MxGateway.Server.Metrics; +using MxGateway.Server.Sessions; +using MxGateway.Server.Workers; + +namespace MxGateway.Tests.Gateway.Sessions; + +/// +/// Tests-013: per-method gateway-side coverage for every +/// GatewaySession.*BulkAsync entry point. Each method gets a +/// round-trip test that pins the sent to the +/// worker, the per-entry payload shape, a failure-mode (per-entry failure +/// surfaced or protocol-status failure) check, and a cancellation-propagation +/// check. 
The secured-write variants additionally pin that the credential +/// payload (current_user_id, verifier_user_id) is preserved +/// end-to-end and not flattened/redacted by the gateway's command shape. +/// +public sealed class SessionManagerBulkTests +{ + [Fact] + public async Task AddItemBulkAsync_ForwardsOneAddItemBulkCommandAndReturnsResults() + { + FakeBulkWorkerClient workerClient = WithReply(reply => reply.AddItemBulk = new BulkSubscribeReply + { + Results = + { + new SubscribeResult { ServerHandle = 12, TagAddress = "Galaxy.Tag.Ok", ItemHandle = 511, WasSuccessful = true }, + new SubscribeResult { ServerHandle = 12, TagAddress = "Galaxy.Tag.Bad", ItemHandle = 0, WasSuccessful = false, ErrorMessage = "invalid tag" }, + }, + }, MxCommandKind.AddItemBulk); + GatewaySession session = await OpenSessionAsync(workerClient); + + IReadOnlyList results = await session.AddItemBulkAsync( + 12, + ["Galaxy.Tag.Ok", "Galaxy.Tag.Bad"], + CancellationToken.None); + + Assert.Equal(1, workerClient.InvokeCount); + Assert.Equal(MxCommandKind.AddItemBulk, workerClient.LastCommand?.Command.Kind); + Assert.Equal(12, workerClient.LastCommand?.Command.AddItemBulk.ServerHandle); + Assert.Equal(["Galaxy.Tag.Ok", "Galaxy.Tag.Bad"], workerClient.LastCommand?.Command.AddItemBulk.TagAddresses); + Assert.Equal(2, results.Count); + Assert.True(results[0].WasSuccessful); + Assert.False(results[1].WasSuccessful); + Assert.Equal("invalid tag", results[1].ErrorMessage); + } + + [Fact] + public async Task AddItemBulkAsync_PropagatesCancellation() + { + FakeBulkWorkerClient workerClient = new(); + GatewaySession session = await OpenSessionAsync(workerClient); + using CancellationTokenSource cts = new(); + await cts.CancelAsync(); + + await Assert.ThrowsAnyAsync( + async () => await session.AddItemBulkAsync(12, ["Tag.A"], cts.Token)); + } + + [Fact] + public async Task AdviseItemBulkAsync_ForwardsOneAdviseItemBulkCommandAndReturnsResults() + { + FakeBulkWorkerClient workerClient = WithReply(reply 
=> reply.AdviseItemBulk = new BulkSubscribeReply + { + Results = + { + new SubscribeResult { ServerHandle = 12, ItemHandle = 901, WasSuccessful = true }, + new SubscribeResult { ServerHandle = 12, ItemHandle = 902, WasSuccessful = false, ErrorMessage = "invalid item handle" }, + }, + }, MxCommandKind.AdviseItemBulk); + GatewaySession session = await OpenSessionAsync(workerClient); + + IReadOnlyList results = await session.AdviseItemBulkAsync( + 12, + [901, 902], + CancellationToken.None); + + Assert.Equal(MxCommandKind.AdviseItemBulk, workerClient.LastCommand?.Command.Kind); + Assert.Equal(12, workerClient.LastCommand?.Command.AdviseItemBulk.ServerHandle); + Assert.Equal([901, 902], workerClient.LastCommand?.Command.AdviseItemBulk.ItemHandles); + Assert.Equal(2, results.Count); + Assert.True(results[0].WasSuccessful); + Assert.False(results[1].WasSuccessful); + } + + [Fact] + public async Task AdviseItemBulkAsync_PropagatesCancellation() + { + FakeBulkWorkerClient workerClient = new(); + GatewaySession session = await OpenSessionAsync(workerClient); + using CancellationTokenSource cts = new(); + await cts.CancelAsync(); + + await Assert.ThrowsAnyAsync( + async () => await session.AdviseItemBulkAsync(12, [101], cts.Token)); + } + + [Fact] + public async Task RemoveItemBulkAsync_ForwardsOneRemoveItemBulkCommandAndReturnsResults() + { + FakeBulkWorkerClient workerClient = WithReply(reply => reply.RemoveItemBulk = new BulkSubscribeReply + { + Results = + { + new SubscribeResult { ServerHandle = 12, ItemHandle = 11, WasSuccessful = true }, + new SubscribeResult { ServerHandle = 12, ItemHandle = 12, WasSuccessful = false, ErrorMessage = "unknown handle" }, + }, + }, MxCommandKind.RemoveItemBulk); + GatewaySession session = await OpenSessionAsync(workerClient); + + IReadOnlyList results = await session.RemoveItemBulkAsync( + 12, + [11, 12], + CancellationToken.None); + + Assert.Equal(MxCommandKind.RemoveItemBulk, workerClient.LastCommand?.Command.Kind); + 
Assert.Equal([11, 12], workerClient.LastCommand?.Command.RemoveItemBulk.ItemHandles); + Assert.Equal(2, results.Count); + Assert.False(results[1].WasSuccessful); + } + + [Fact] + public async Task RemoveItemBulkAsync_PropagatesCancellation() + { + FakeBulkWorkerClient workerClient = new(); + GatewaySession session = await OpenSessionAsync(workerClient); + using CancellationTokenSource cts = new(); + await cts.CancelAsync(); + + await Assert.ThrowsAnyAsync( + async () => await session.RemoveItemBulkAsync(12, [11], cts.Token)); + } + + [Fact] + public async Task UnAdviseItemBulkAsync_ForwardsOneUnAdviseItemBulkCommandAndReturnsResults() + { + FakeBulkWorkerClient workerClient = WithReply(reply => reply.UnAdviseItemBulk = new BulkSubscribeReply + { + Results = + { + new SubscribeResult { ServerHandle = 12, ItemHandle = 21, WasSuccessful = true }, + new SubscribeResult { ServerHandle = 12, ItemHandle = 22, WasSuccessful = false, ErrorMessage = "not advised" }, + }, + }, MxCommandKind.UnAdviseItemBulk); + GatewaySession session = await OpenSessionAsync(workerClient); + + IReadOnlyList results = await session.UnAdviseItemBulkAsync( + 12, + [21, 22], + CancellationToken.None); + + Assert.Equal(MxCommandKind.UnAdviseItemBulk, workerClient.LastCommand?.Command.Kind); + Assert.Equal([21, 22], workerClient.LastCommand?.Command.UnAdviseItemBulk.ItemHandles); + Assert.Equal(2, results.Count); + Assert.False(results[1].WasSuccessful); + Assert.Equal("not advised", results[1].ErrorMessage); + } + + [Fact] + public async Task UnAdviseItemBulkAsync_PropagatesCancellation() + { + FakeBulkWorkerClient workerClient = new(); + GatewaySession session = await OpenSessionAsync(workerClient); + using CancellationTokenSource cts = new(); + await cts.CancelAsync(); + + await Assert.ThrowsAnyAsync( + async () => await session.UnAdviseItemBulkAsync(12, [21], cts.Token)); + } + + [Fact] + public async Task SubscribeBulkAsync_SurfacesPerEntryFailures() + { + // SubscribeBulkAsync already has a 
happy-path test in SessionManagerTests + // (GatewaySessionSubscribeBulkAsync_ForwardsOneBulkCommandAndReturnsResults); + // this complementary test pins the per-entry failure-surface behaviour. + FakeBulkWorkerClient workerClient = WithReply(reply => reply.SubscribeBulk = new BulkSubscribeReply + { + Results = + { + new SubscribeResult { ServerHandle = 12, TagAddress = "Galaxy.Good", ItemHandle = 501, WasSuccessful = true }, + new SubscribeResult { ServerHandle = 12, TagAddress = "Galaxy.Bad", ItemHandle = 0, WasSuccessful = false, ErrorMessage = "MXAccess subscribe failed" }, + }, + }, MxCommandKind.SubscribeBulk); + GatewaySession session = await OpenSessionAsync(workerClient); + + IReadOnlyList results = await session.SubscribeBulkAsync( + 12, + ["Galaxy.Good", "Galaxy.Bad"], + CancellationToken.None); + + Assert.Equal(2, results.Count); + Assert.True(results[0].WasSuccessful); + Assert.False(results[1].WasSuccessful); + Assert.Equal("MXAccess subscribe failed", results[1].ErrorMessage); + } + + [Fact] + public async Task SubscribeBulkAsync_PropagatesCancellation() + { + FakeBulkWorkerClient workerClient = new(); + GatewaySession session = await OpenSessionAsync(workerClient); + using CancellationTokenSource cts = new(); + await cts.CancelAsync(); + + await Assert.ThrowsAnyAsync( + async () => await session.SubscribeBulkAsync(12, ["Tag"], cts.Token)); + } + + [Fact] + public async Task UnsubscribeBulkAsync_ForwardsOneUnsubscribeBulkCommandAndReturnsResults() + { + FakeBulkWorkerClient workerClient = WithReply(reply => reply.UnsubscribeBulk = new BulkSubscribeReply + { + Results = + { + new SubscribeResult { ServerHandle = 12, ItemHandle = 31, WasSuccessful = true }, + new SubscribeResult { ServerHandle = 12, ItemHandle = 32, WasSuccessful = false, ErrorMessage = "unknown handle" }, + }, + }, MxCommandKind.UnsubscribeBulk); + GatewaySession session = await OpenSessionAsync(workerClient); + + IReadOnlyList results = await session.UnsubscribeBulkAsync( + 12, + 
[31, 32], + CancellationToken.None); + + Assert.Equal(MxCommandKind.UnsubscribeBulk, workerClient.LastCommand?.Command.Kind); + Assert.Equal([31, 32], workerClient.LastCommand?.Command.UnsubscribeBulk.ItemHandles); + Assert.Equal(2, results.Count); + Assert.False(results[1].WasSuccessful); + } + + [Fact] + public async Task UnsubscribeBulkAsync_PropagatesCancellation() + { + FakeBulkWorkerClient workerClient = new(); + GatewaySession session = await OpenSessionAsync(workerClient); + using CancellationTokenSource cts = new(); + await cts.CancelAsync(); + + await Assert.ThrowsAnyAsync( + async () => await session.UnsubscribeBulkAsync(12, [31], cts.Token)); + } + + [Fact] + public async Task WriteBulkAsync_SurfacesPerEntryFailures() + { + // Complement the existing happy-path WriteBulk test in SessionManagerTests + // with an explicit per-entry failure assertion plus payload-shape pinning. + FakeBulkWorkerClient workerClient = WithReply(reply => reply.WriteBulk = new BulkWriteReply + { + Results = + { + new BulkWriteResult { ServerHandle = 12, ItemHandle = 901, WasSuccessful = true }, + new BulkWriteResult { ServerHandle = 12, ItemHandle = 902, WasSuccessful = false, ErrorMessage = "MXAccess invalid handle" }, + }, + }, MxCommandKind.WriteBulk); + GatewaySession session = await OpenSessionAsync(workerClient); + + IReadOnlyList results = await session.WriteBulkAsync( + 12, + new[] + { + new WriteBulkEntry { ItemHandle = 901, UserId = 5, Value = new MxValue { DataType = MxDataType.Integer, Int32Value = 11 } }, + new WriteBulkEntry { ItemHandle = 902, UserId = 5, Value = new MxValue { DataType = MxDataType.Integer, Int32Value = 22 } }, + }, + CancellationToken.None); + + Assert.Equal(MxCommandKind.WriteBulk, workerClient.LastCommand?.Command.Kind); + Assert.Equal(2, workerClient.LastCommand?.Command.WriteBulk.Entries.Count); + Assert.Equal(901, workerClient.LastCommand?.Command.WriteBulk.Entries[0].ItemHandle); + Assert.Equal(11, 
workerClient.LastCommand?.Command.WriteBulk.Entries[0].Value.Int32Value); + Assert.False(results[1].WasSuccessful); + Assert.Equal("MXAccess invalid handle", results[1].ErrorMessage); + } + + [Fact] + public async Task WriteBulkAsync_PropagatesCancellation() + { + FakeBulkWorkerClient workerClient = new(); + GatewaySession session = await OpenSessionAsync(workerClient); + using CancellationTokenSource cts = new(); + await cts.CancelAsync(); + + await Assert.ThrowsAnyAsync( + async () => await session.WriteBulkAsync( + 12, + new[] { new WriteBulkEntry { ItemHandle = 1, UserId = 1, Value = new MxValue { DataType = MxDataType.Integer, Int32Value = 0 } } }, + cts.Token)); + } + + [Fact] + public async Task Write2BulkAsync_ForwardsOneWrite2BulkCommandAndPreservesTimestampPayload() + { + FakeBulkWorkerClient workerClient = WithReply(reply => reply.Write2Bulk = new BulkWriteReply + { + Results = + { + new BulkWriteResult { ServerHandle = 12, ItemHandle = 701, WasSuccessful = true }, + new BulkWriteResult { ServerHandle = 12, ItemHandle = 702, WasSuccessful = false, ErrorMessage = "MXAccess Write2 failed" }, + }, + }, MxCommandKind.Write2Bulk); + GatewaySession session = await OpenSessionAsync(workerClient); + + IReadOnlyList results = await session.Write2BulkAsync( + 12, + new[] + { + new Write2BulkEntry + { + ItemHandle = 701, + UserId = 5, + Value = new MxValue { DataType = MxDataType.Integer, Int32Value = 11 }, + TimestampValue = new MxValue { DataType = MxDataType.Time, Int64Value = 1234567890L }, + }, + new Write2BulkEntry + { + ItemHandle = 702, + UserId = 5, + Value = new MxValue { DataType = MxDataType.Integer, Int32Value = 22 }, + TimestampValue = new MxValue { DataType = MxDataType.Time, Int64Value = 1234567891L }, + }, + }, + CancellationToken.None); + + Assert.Equal(MxCommandKind.Write2Bulk, workerClient.LastCommand?.Command.Kind); + Assert.Equal(12, workerClient.LastCommand?.Command.Write2Bulk.ServerHandle); + Assert.Equal(2, 
workerClient.LastCommand?.Command.Write2Bulk.Entries.Count); + Assert.Equal(701, workerClient.LastCommand?.Command.Write2Bulk.Entries[0].ItemHandle); + Assert.Equal(1234567890L, workerClient.LastCommand?.Command.Write2Bulk.Entries[0].TimestampValue.Int64Value); + Assert.False(results[1].WasSuccessful); + } + + [Fact] + public async Task Write2BulkAsync_PropagatesCancellation() + { + FakeBulkWorkerClient workerClient = new(); + GatewaySession session = await OpenSessionAsync(workerClient); + using CancellationTokenSource cts = new(); + await cts.CancelAsync(); + + await Assert.ThrowsAnyAsync( + async () => await session.Write2BulkAsync( + 12, + new[] + { + new Write2BulkEntry + { + ItemHandle = 1, + UserId = 1, + Value = new MxValue { DataType = MxDataType.Integer, Int32Value = 0 }, + TimestampValue = new MxValue { DataType = MxDataType.Time, Int64Value = 0L }, + }, + }, + cts.Token)); + } + + [Fact] + public async Task WriteSecuredBulkAsync_ForwardsOneWriteSecuredBulkCommandAndPreservesCredentialPayload() + { + // The secured variants carry caller credential identifiers (CurrentUserId / + // VerifierUserId). Pin that those survive the gateway round-trip end-to-end — + // the over-the-wire command shape must NOT redact or flatten them, only the + // *log surface* (see GatewaySession's redaction rules) is allowed to drop them. 
+ FakeBulkWorkerClient workerClient = WithReply(reply => reply.WriteSecuredBulk = new BulkWriteReply + { + Results = + { + new BulkWriteResult { ServerHandle = 12, ItemHandle = 601, WasSuccessful = true }, + new BulkWriteResult { ServerHandle = 12, ItemHandle = 602, WasSuccessful = false, ErrorMessage = "MXAccess secured-write rejected" }, + }, + }, MxCommandKind.WriteSecuredBulk); + GatewaySession session = await OpenSessionAsync(workerClient); + + IReadOnlyList results = await session.WriteSecuredBulkAsync( + 12, + new[] + { + new WriteSecuredBulkEntry + { + ItemHandle = 601, + CurrentUserId = 7, + VerifierUserId = 8, + Value = new MxValue { DataType = MxDataType.Integer, Int32Value = 1 }, + }, + new WriteSecuredBulkEntry + { + ItemHandle = 602, + CurrentUserId = 7, + VerifierUserId = 8, + Value = new MxValue { DataType = MxDataType.Integer, Int32Value = 2 }, + }, + }, + CancellationToken.None); + + Assert.Equal(MxCommandKind.WriteSecuredBulk, workerClient.LastCommand?.Command.Kind); + Assert.Equal(12, workerClient.LastCommand?.Command.WriteSecuredBulk.ServerHandle); + Assert.Equal(2, workerClient.LastCommand?.Command.WriteSecuredBulk.Entries.Count); + WriteSecuredBulkEntry firstEntry = workerClient.LastCommand!.Command.WriteSecuredBulk.Entries[0]; + Assert.Equal(601, firstEntry.ItemHandle); + Assert.Equal(7, firstEntry.CurrentUserId); + Assert.Equal(8, firstEntry.VerifierUserId); + Assert.Equal(1, firstEntry.Value.Int32Value); + Assert.False(results[1].WasSuccessful); + Assert.Equal("MXAccess secured-write rejected", results[1].ErrorMessage); + } + + [Fact] + public async Task WriteSecuredBulkAsync_PropagatesCancellation() + { + FakeBulkWorkerClient workerClient = new(); + GatewaySession session = await OpenSessionAsync(workerClient); + using CancellationTokenSource cts = new(); + await cts.CancelAsync(); + + await Assert.ThrowsAnyAsync( + async () => await session.WriteSecuredBulkAsync( + 12, + new[] + { + new WriteSecuredBulkEntry + { + ItemHandle = 1, + 
CurrentUserId = 7, + VerifierUserId = 8, + Value = new MxValue { DataType = MxDataType.Integer, Int32Value = 0 }, + }, + }, + cts.Token)); + } + + [Fact] + public async Task WriteSecured2BulkAsync_ForwardsOneWriteSecured2BulkCommandAndPreservesCredentialAndTimestampPayload() + { + FakeBulkWorkerClient workerClient = WithReply(reply => reply.WriteSecured2Bulk = new BulkWriteReply + { + Results = + { + new BulkWriteResult { ServerHandle = 12, ItemHandle = 801, WasSuccessful = true }, + new BulkWriteResult { ServerHandle = 12, ItemHandle = 802, WasSuccessful = false, ErrorMessage = "MXAccess secured2-write rejected" }, + }, + }, MxCommandKind.WriteSecured2Bulk); + GatewaySession session = await OpenSessionAsync(workerClient); + + IReadOnlyList results = await session.WriteSecured2BulkAsync( + 12, + new[] + { + new WriteSecured2BulkEntry + { + ItemHandle = 801, + CurrentUserId = 7, + VerifierUserId = 8, + Value = new MxValue { DataType = MxDataType.Integer, Int32Value = 1 }, + TimestampValue = new MxValue { DataType = MxDataType.Time, Int64Value = 1700000000L }, + }, + new WriteSecured2BulkEntry + { + ItemHandle = 802, + CurrentUserId = 7, + VerifierUserId = 8, + Value = new MxValue { DataType = MxDataType.Integer, Int32Value = 2 }, + TimestampValue = new MxValue { DataType = MxDataType.Time, Int64Value = 1700000001L }, + }, + }, + CancellationToken.None); + + Assert.Equal(MxCommandKind.WriteSecured2Bulk, workerClient.LastCommand?.Command.Kind); + Assert.Equal(2, workerClient.LastCommand?.Command.WriteSecured2Bulk.Entries.Count); + WriteSecured2BulkEntry firstEntry = workerClient.LastCommand!.Command.WriteSecured2Bulk.Entries[0]; + Assert.Equal(801, firstEntry.ItemHandle); + Assert.Equal(7, firstEntry.CurrentUserId); + Assert.Equal(8, firstEntry.VerifierUserId); + Assert.Equal(1, firstEntry.Value.Int32Value); + Assert.Equal(1700000000L, firstEntry.TimestampValue.Int64Value); + Assert.False(results[1].WasSuccessful); + } + + [Fact] + public async Task 
WriteSecured2BulkAsync_PropagatesCancellation() + { + FakeBulkWorkerClient workerClient = new(); + GatewaySession session = await OpenSessionAsync(workerClient); + using CancellationTokenSource cts = new(); + await cts.CancelAsync(); + + await Assert.ThrowsAnyAsync( + async () => await session.WriteSecured2BulkAsync( + 12, + new[] + { + new WriteSecured2BulkEntry + { + ItemHandle = 1, + CurrentUserId = 7, + VerifierUserId = 8, + Value = new MxValue { DataType = MxDataType.Integer, Int32Value = 0 }, + TimestampValue = new MxValue { DataType = MxDataType.Time, Int64Value = 0L }, + }, + }, + cts.Token)); + } + + [Fact] + public async Task ReadBulkAsync_SurfacesPerEntryFailures() + { + // Complement the existing happy-path ReadBulk test in SessionManagerTests + // with the failure-mode case where one tag failed to read. + FakeBulkWorkerClient workerClient = WithReply(reply => reply.ReadBulk = new BulkReadReply + { + Results = + { + new BulkReadResult + { + ServerHandle = 12, + TagAddress = "Galaxy.Good", + ItemHandle = 511, + WasSuccessful = true, + WasCached = false, + Value = new MxValue { DataType = MxDataType.Integer, Int32Value = 42 }, + }, + new BulkReadResult + { + ServerHandle = 12, + TagAddress = "Galaxy.Bad", + ItemHandle = 0, + WasSuccessful = false, + ErrorMessage = "MXAccess read timed out", + }, + }, + }, MxCommandKind.ReadBulk); + GatewaySession session = await OpenSessionAsync(workerClient); + + IReadOnlyList results = await session.ReadBulkAsync( + 12, + ["Galaxy.Good", "Galaxy.Bad"], + TimeSpan.FromMilliseconds(750), + CancellationToken.None); + + Assert.Equal(MxCommandKind.ReadBulk, workerClient.LastCommand?.Command.Kind); + Assert.Equal(750u, workerClient.LastCommand?.Command.ReadBulk.TimeoutMs); + Assert.Equal(["Galaxy.Good", "Galaxy.Bad"], workerClient.LastCommand?.Command.ReadBulk.TagAddresses); + Assert.Equal(2, results.Count); + Assert.True(results[0].WasSuccessful); + Assert.False(results[1].WasSuccessful); + Assert.Equal("MXAccess read timed 
out", results[1].ErrorMessage); + } + + [Fact] + public async Task ReadBulkAsync_PropagatesCancellation() + { + FakeBulkWorkerClient workerClient = new(); + GatewaySession session = await OpenSessionAsync(workerClient); + using CancellationTokenSource cts = new(); + await cts.CancelAsync(); + + await Assert.ThrowsAnyAsync( + async () => await session.ReadBulkAsync( + 12, + ["Galaxy.Tag"], + TimeSpan.FromMilliseconds(500), + cts.Token)); + } + + // ----------------------------------------------------------------- + // Helpers + // ----------------------------------------------------------------- + + private static FakeBulkWorkerClient WithReply(Action populate, MxCommandKind kind) + { + MxCommandReply reply = new() + { + SessionId = "session-1", + CorrelationId = "correlation-1", + Kind = kind, + ProtocolStatus = new ProtocolStatus { Code = ProtocolStatusCode.Ok }, + }; + populate(reply); + return new FakeBulkWorkerClient + { + InvokeReply = new WorkerCommandReply { Reply = reply }, + }; + } + + private static async Task OpenSessionAsync(FakeBulkWorkerClient workerClient) + { + SessionManager manager = CreateManager(workerClient); + return await manager.OpenSessionAsync(CreateOpenRequest(), "client-1", CancellationToken.None); + } + + private static SessionManager CreateManager(FakeBulkWorkerClient workerClient) + { + return new SessionManager( + new SessionRegistry(), + new FakeBulkSessionWorkerClientFactory(workerClient), + Options.Create(new GatewayOptions + { + Sessions = new SessionOptions + { + DefaultCommandTimeoutSeconds = 30, + MaxSessions = 16, + DefaultLeaseSeconds = 1800, + }, + Worker = new WorkerOptions + { + StartupTimeoutSeconds = 30, + ShutdownTimeoutSeconds = 10, + }, + }), + new GatewayMetrics()); + } + + private static SessionOpenRequest CreateOpenRequest() + { + return new SessionOpenRequest( + RequestedBackend: null, + ClientSessionName: "test-session", + ClientCorrelationId: "client-correlation-1", + CommandTimeout: 
Duration.FromTimeSpan(TimeSpan.FromSeconds(5))); + } + + private sealed class FakeBulkSessionWorkerClientFactory(IWorkerClient workerClient) : ISessionWorkerClientFactory + { + /// + public Task CreateAsync( + GatewaySession session, + CancellationToken cancellationToken) + { + return Task.FromResult(workerClient); + } + } + + private sealed class FakeBulkWorkerClient : IWorkerClient + { + /// + public string SessionId { get; init; } = "session-1"; + + /// + public int? ProcessId { get; init; } = 1234; + + /// + public WorkerClientState State { get; set; } = WorkerClientState.Ready; + + /// + public DateTimeOffset LastHeartbeatAt { get; init; } = DateTimeOffset.UtcNow; + + /// Gets the number of times Invoke was called on the fake worker client. + public int InvokeCount { get; private set; } + + /// Gets the last command invoked on the fake worker client. + public WorkerCommand? LastCommand { get; private set; } + + /// Gets the reply to return for invoke calls on the fake worker client. + public WorkerCommandReply? InvokeReply { get; init; } + + /// + public Task StartAsync(CancellationToken cancellationToken) => Task.CompletedTask; + + /// + public Task InvokeAsync( + WorkerCommand command, + TimeSpan timeout, + CancellationToken cancellationToken) + { + cancellationToken.ThrowIfCancellationRequested(); + InvokeCount++; + LastCommand = command; + if (InvokeReply is not null) + { + return Task.FromResult(InvokeReply); + } + + MxCommandKind kind = command.Command?.Kind ?? 
MxCommandKind.Unspecified; + return Task.FromResult(new WorkerCommandReply + { + Reply = new MxCommandReply + { + SessionId = SessionId, + CorrelationId = "correlation-1", + Kind = kind, + ProtocolStatus = new ProtocolStatus { Code = ProtocolStatusCode.Ok }, + }, + }); + } + + /// + public async IAsyncEnumerable ReadEventsAsync( + [System.Runtime.CompilerServices.EnumeratorCancellation] CancellationToken cancellationToken) + { + await Task.CompletedTask; + yield break; + } + + /// + public Task ShutdownAsync(TimeSpan timeout, CancellationToken cancellationToken) + { + State = WorkerClientState.Closed; + return Task.CompletedTask; + } + + /// + public void Kill(string reason) => State = WorkerClientState.Faulted; + + /// + public ValueTask DisposeAsync() => ValueTask.CompletedTask; + } +} diff --git a/src/MxGateway.Tests/Gateway/Sessions/SessionManagerTests.cs b/src/MxGateway.Tests/Gateway/Sessions/SessionManagerTests.cs index 9eb3fd5..58c6faf 100644 --- a/src/MxGateway.Tests/Gateway/Sessions/SessionManagerTests.cs +++ b/src/MxGateway.Tests/Gateway/Sessions/SessionManagerTests.cs @@ -37,7 +37,7 @@ public sealed class SessionManagerTests [Fact] public async Task OpenSessionAsync_SetsInitialDefaultLease() { - ManualTimeProvider clock = new(DateTimeOffset.Parse("2026-04-29T10:00:00Z")); + ManualTimeProvider clock = new(DateTimeOffset.Parse("2026-04-29T10:00:00Z", System.Globalization.CultureInfo.InvariantCulture)); GatewayOptions options = CreateOptions(defaultLeaseSeconds: 1800); SessionManager manager = CreateManager( new FakeSessionWorkerClientFactory(new FakeWorkerClient()), diff --git a/src/MxGateway.Tests/Gateway/Sessions/WorkerAlarmRpcDispatcherTests.cs b/src/MxGateway.Tests/Gateway/Sessions/WorkerAlarmRpcDispatcherTests.cs index 8297a2f..d6c7367 100644 --- a/src/MxGateway.Tests/Gateway/Sessions/WorkerAlarmRpcDispatcherTests.cs +++ b/src/MxGateway.Tests/Gateway/Sessions/WorkerAlarmRpcDispatcherTests.cs @@ -267,21 +267,45 @@ public sealed class 
WorkerAlarmRpcDispatcherTests Assert.Equal("Galaxy!A.T2", collected[1].AlarmFullReference); } + /// + /// Server-019 regression: QueryActiveAlarmsAsync used to silently + /// yield break when the session id was not in the registry, while the + /// peer AcknowledgeAsync returned SessionNotFound. Both methods + /// now signal a missing session — QueryActiveAlarms throws a + /// <see cref="SessionManagerException"/> with + /// <see cref="SessionManagerErrorCode.SessionNotFound"/> (the gateway gRPC + /// layer maps it to gRPC NotFound), aligning the dispatcher's + /// missing-session contract across the two RPCs. + /// [Fact] - public async Task QueryActiveAlarmsAsync_WhenSessionMissing_YieldsEmpty() + public async Task QueryActiveAlarmsAsync_WhenSessionMissing_ThrowsSessionNotFound() { SessionRegistry registry = new(); WorkerAlarmRpcDispatcher dispatcher = new(registry); - List collected = new(); - await foreach (ActiveAlarmSnapshot snap in dispatcher.QueryActiveAlarmsAsync( - new QueryActiveAlarmsRequest { SessionId = "missing" }, - CancellationToken.None)) + SessionManagerException exception = await Assert.ThrowsAsync(async () => { - collected.Add(snap); - } + await foreach (ActiveAlarmSnapshot _ in dispatcher.QueryActiveAlarmsAsync( + new QueryActiveAlarmsRequest { SessionId = "missing" }, + CancellationToken.None)) + { + // No yield expected — the throw happens before the first iteration. + } + }); - Assert.Empty(collected); + Assert.Equal(SessionManagerErrorCode.SessionNotFound, exception.ErrorCode); + + // Peer-method parity: AcknowledgeAsync still signals SessionNotFound (as an + // in-band ProtocolStatus, since it's a unary RPC). The two methods now agree + // that a missing session is an error, not an empty success.
+ AcknowledgeAlarmReply ackReply = await dispatcher.AcknowledgeAsync( + new AcknowledgeAlarmRequest + { + SessionId = "missing", + AlarmFullReference = Guid.NewGuid().ToString(), + }, + CancellationToken.None); + Assert.Equal(ProtocolStatusCode.SessionNotFound, ackReply.ProtocolStatus.Code); } [Fact] diff --git a/src/MxGateway.Tests/Gateway/Workers/WorkerClientTests.cs b/src/MxGateway.Tests/Gateway/Workers/WorkerClientTests.cs index 2974c48..8faa038 100644 --- a/src/MxGateway.Tests/Gateway/Workers/WorkerClientTests.cs +++ b/src/MxGateway.Tests/Gateway/Workers/WorkerClientTests.cs @@ -341,10 +341,17 @@ public sealed class WorkerClientTests Assert.Equal(previousHeartbeat + TimeSpan.FromSeconds(1), client.LastHeartbeatAt); } - /// Verifies that the heartbeat monitor faults the client when the heartbeat expires. + /// + /// Verifies that the heartbeat monitor faults the client when the heartbeat expires. + /// Uses an injected so the grace comparison is deterministic + /// instead of depending on real wall-clock advance; the monitor's + /// timer stays on the real clock and + /// observes the manually-advanced grace on its next tick. 
+ /// [Fact] public async Task HeartbeatMonitor_WhenHeartbeatExpires_FaultsClient() { + ManualTimeProvider clock = new(DateTimeOffset.Parse("2026-05-20T12:00:00Z", System.Globalization.CultureInfo.InvariantCulture)); await using PipePair pipePair = await PipePair.CreateAsync(); await using WorkerClient client = CreateClient( pipePair, @@ -353,9 +360,12 @@ public sealed class WorkerClientTests HeartbeatGrace = TimeSpan.FromMilliseconds(80), HeartbeatCheckInterval = TimeSpan.FromMilliseconds(20), EventChannelCapacity = 8, - }); + }, + timeProvider: clock); await CompleteHandshakeAsync(client, pipePair); + clock.Advance(TimeSpan.FromSeconds(2)); + await WaitUntilAsync( () => client.State == WorkerClientState.Faulted, TestTimeout); diff --git a/src/MxGateway.Tests/Security/Authorization/GatewayGrpcAuthorizationInterceptorTests.cs b/src/MxGateway.Tests/Security/Authorization/GatewayGrpcAuthorizationInterceptorTests.cs index a9031a4..b741803 100644 --- a/src/MxGateway.Tests/Security/Authorization/GatewayGrpcAuthorizationInterceptorTests.cs +++ b/src/MxGateway.Tests/Security/Authorization/GatewayGrpcAuthorizationInterceptorTests.cs @@ -252,6 +252,94 @@ public sealed class GatewayGrpcAuthorizationInterceptorTests Assert.Equal(0, sessionManager.InvokeCount); } + /// + /// Verifies the interceptor denies AcknowledgeAlarm calls that lack + /// <see cref="GatewayScopes.InvokeWrite"/>. Ack is a write-shaped mutation against + /// alarm state, so it carries the same scope as MxCommandKind.Write.
+ /// + [Fact] + public async Task UnaryServerHandler_AcknowledgeAlarmMissingScope_ReturnsPermissionDenied() + { + GatewayGrpcAuthorizationInterceptor interceptor = CreateInterceptor( + new FakeApiKeyVerifier(SuccessWithScopes(GatewayScopes.InvokeRead)), + new GatewayRequestIdentityAccessor()); + + RpcException exception = await Assert.ThrowsAsync( + () => interceptor.UnaryServerHandler( + new AcknowledgeAlarmRequest { SessionId = "session-1", AlarmFullReference = "ref" }, + ContextWithAuthorization("Bearer mxgw_operator01_secret"), + (_, _) => Task.FromResult(new AcknowledgeAlarmReply()))); + + Assert.Equal(StatusCode.PermissionDenied, exception.StatusCode); + Assert.Contains(GatewayScopes.InvokeWrite, exception.Status.Detail, StringComparison.Ordinal); + } + + /// Verifies that an API key holding invoke:write may call AcknowledgeAlarm. + [Fact] + public async Task UnaryServerHandler_AcknowledgeAlarmWithScope_RunsHandler() + { + GatewayGrpcAuthorizationInterceptor interceptor = CreateInterceptor( + new FakeApiKeyVerifier(SuccessWithScopes(GatewayScopes.InvokeWrite)), + new GatewayRequestIdentityAccessor()); + bool handlerRan = false; + + AcknowledgeAlarmReply reply = await interceptor.UnaryServerHandler( + new AcknowledgeAlarmRequest { SessionId = "session-1", AlarmFullReference = "ref" }, + ContextWithAuthorization("Bearer mxgw_operator01_secret"), + (_, _) => + { + handlerRan = true; + return Task.FromResult(new AcknowledgeAlarmReply()); + }); + + Assert.NotNull(reply); + Assert.True(handlerRan); + } + + /// + /// Verifies the interceptor denies QueryActiveAlarms server-streaming calls that + /// lack <see cref="GatewayScopes.EventsRead"/>. Active-alarm snapshots are part of the + /// alarm/event surface and share the same scope as StreamEvents.
+ /// + [Fact] + public async Task ServerStreamingServerHandler_QueryActiveAlarmsMissingScope_ReturnsPermissionDenied() + { + GatewayGrpcAuthorizationInterceptor interceptor = CreateInterceptor( + new FakeApiKeyVerifier(SuccessWithScopes(GatewayScopes.InvokeRead)), + new GatewayRequestIdentityAccessor()); + + RpcException exception = await Assert.ThrowsAsync( + () => interceptor.ServerStreamingServerHandler( + new QueryActiveAlarmsRequest { SessionId = "session-1" }, + new RecordingServerStreamWriter(), + ContextWithAuthorization("Bearer mxgw_operator01_secret"), + (_, _, _) => Task.CompletedTask)); + + Assert.Equal(StatusCode.PermissionDenied, exception.StatusCode); + Assert.Contains(GatewayScopes.EventsRead, exception.Status.Detail, StringComparison.Ordinal); + } + + /// Verifies that an API key holding events:read may call QueryActiveAlarms. + [Fact] + public async Task ServerStreamingServerHandler_QueryActiveAlarmsWithScope_RunsHandler() + { + GatewayGrpcAuthorizationInterceptor interceptor = CreateInterceptor( + new FakeApiKeyVerifier(SuccessWithScopes(GatewayScopes.EventsRead)), + new GatewayRequestIdentityAccessor()); + RecordingServerStreamWriter streamWriter = new(); + + await interceptor.ServerStreamingServerHandler( + new QueryActiveAlarmsRequest { SessionId = "session-1" }, + streamWriter, + ContextWithAuthorization("Bearer mxgw_operator01_secret"), + async (_, writer, _) => + { + await writer.WriteAsync(new ActiveAlarmSnapshot()); + }); + + Assert.Single(streamWriter.Messages); + } + private static MxAccessGatewayService CreateService( ISessionManager sessionManager, IGatewayRequestIdentityAccessor identityAccessor) diff --git a/src/MxGateway.Tests/Security/Authorization/GatewayGrpcScopeResolverTests.cs b/src/MxGateway.Tests/Security/Authorization/GatewayGrpcScopeResolverTests.cs index ec8419e..60d9dfc 100644 --- a/src/MxGateway.Tests/Security/Authorization/GatewayGrpcScopeResolverTests.cs +++ 
b/src/MxGateway.Tests/Security/Authorization/GatewayGrpcScopeResolverTests.cs @@ -13,6 +13,8 @@ public sealed class GatewayGrpcScopeResolverTests [InlineData(typeof(OpenSessionRequest), GatewayScopes.SessionOpen)] [InlineData(typeof(CloseSessionRequest), GatewayScopes.SessionClose)] [InlineData(typeof(StreamEventsRequest), GatewayScopes.EventsRead)] + [InlineData(typeof(AcknowledgeAlarmRequest), GatewayScopes.InvokeWrite)] + [InlineData(typeof(QueryActiveAlarmsRequest), GatewayScopes.EventsRead)] [InlineData(typeof(TestConnectionRequest), GatewayScopes.MetadataRead)] [InlineData(typeof(GetLastDeployTimeRequest), GatewayScopes.MetadataRead)] [InlineData(typeof(DiscoverHierarchyRequest), GatewayScopes.MetadataRead)] diff --git a/src/MxGateway.Tests/TestSupport/PredicateConstraintEnforcer.cs b/src/MxGateway.Tests/TestSupport/PredicateConstraintEnforcer.cs new file mode 100644 index 0000000..147003f --- /dev/null +++ b/src/MxGateway.Tests/TestSupport/PredicateConstraintEnforcer.cs @@ -0,0 +1,89 @@ +using MxGateway.Server.Security.Authentication; +using MxGateway.Server.Security.Authorization; +using MxGateway.Server.Sessions; + +namespace MxGateway.Tests.TestSupport; + +/// +/// for tests that exercise the constraint +/// filtering and reply-merging code paths in +/// MxAccessGatewayService.ApplyConstraintsAsync and the +/// BulkConstraintPlan family. Callers supply predicates that decide +/// whether a given tag address or (server, item) handle is denied; recorded +/// denials are exposed for assertions. +/// +public sealed class PredicateConstraintEnforcer : IConstraintEnforcer +{ + /// Deny predicate keyed on tag address (returns true to deny). + public Func DenyTag { get; init; } = _ => false; + + /// Deny predicate keyed on (serverHandle, itemHandle) (returns true to deny). + public Func DenyReadHandle { get; init; } = (_, _) => false; + + /// Deny predicate keyed on (serverHandle, itemHandle) (returns true to deny). 
+ public Func DenyWriteHandle { get; init; } = (_, _) => false; + + /// Recorded denial messages — (commandKind, target) tuples. + public List<(string CommandKind, string Target)> RecordedDenials { get; } = []; + + /// + public Task CheckReadTagAsync( + ApiKeyIdentity? identity, + string tagAddress, + CancellationToken cancellationToken) + { + if (DenyTag(tagAddress)) + { + return Task.FromResult( + new ConstraintFailure("read-tag", $"Read denied for tag '{tagAddress}'.")); + } + + return Task.FromResult(null); + } + + /// + public Task CheckReadHandleAsync( + ApiKeyIdentity? identity, + GatewaySession session, + int serverHandle, + int itemHandle, + CancellationToken cancellationToken) + { + if (DenyReadHandle(serverHandle, itemHandle)) + { + return Task.FromResult( + new ConstraintFailure("read-handle", $"Read denied for handle {itemHandle}.")); + } + + return Task.FromResult(null); + } + + /// + public Task CheckWriteHandleAsync( + ApiKeyIdentity? identity, + GatewaySession session, + int serverHandle, + int itemHandle, + CancellationToken cancellationToken) + { + if (DenyWriteHandle(serverHandle, itemHandle)) + { + return Task.FromResult( + new ConstraintFailure("write-handle", $"Write denied for handle {itemHandle}.")); + } + + return Task.FromResult(null); + } + + /// + public Task RecordDenialAsync( + ApiKeyIdentity? 
identity, + string commandKind, + string target, + ConstraintFailure failure, + CancellationToken cancellationToken) + { + RecordedDenials.Add((commandKind, target)); + return Task.CompletedTask; + } +} diff --git a/src/MxGateway.Worker.Tests/Ipc/WorkerFrameProtocolTests.cs b/src/MxGateway.Worker.Tests/Ipc/WorkerFrameProtocolTests.cs index 46d889f..32e4ec5 100644 --- a/src/MxGateway.Worker.Tests/Ipc/WorkerFrameProtocolTests.cs +++ b/src/MxGateway.Worker.Tests/Ipc/WorkerFrameProtocolTests.cs @@ -129,6 +129,109 @@ public sealed class WorkerFrameProtocolTests Assert.Equal(WorkerFrameProtocolErrorCode.InvalidEnvelope, exception.ErrorCode); } + /// + /// Worker.Tests-021 (a): pins the EndOfStream branch of + /// WorkerFrameReader.ReadExactlyOrThrowAsync. The gateway + /// closing its end of the pipe during a partial-frame read is the + /// most common production transport failure; the reader must + /// surface this as WorkerFrameProtocolErrorCode.EndOfStream + /// so the worker session can fault deterministically rather than + /// spinning on a partial buffer. The stream here declares a 100-byte + /// payload but only supplies 50 bytes, so the inner read loop sees + /// bytesRead == 0 mid-frame. + /// + [Fact] + public async Task ReadAsync_WhenStreamEndsMidFrame_ThrowsEndOfStream() + { + WorkerFrameProtocolOptions options = CreateOptions(); + byte[] frame = new byte[sizeof(uint) + 50]; + WorkerFrameTestHelpers.WriteUInt32LittleEndian(frame, 100); + using MemoryStream stream = new(frame); + + WorkerFrameReader reader = new(stream, options); + WorkerFrameProtocolException exception = + await Assert.ThrowsAsync( + async () => await reader.ReadAsync()); + + Assert.Equal(WorkerFrameProtocolErrorCode.EndOfStream, exception.ErrorCode); + } + + /// + /// Worker.Tests-021 (b): pins the writer-side + /// MessageTooLarge branch. 
A session that constructs an + /// envelope whose serialised size exceeds MaxMessageBytes + /// must be rejected by the writer before any bytes are sent down + /// the pipe, so a misbehaving producer cannot push the receiver + /// past its bounds. A small MaxMessageBytes is configured + /// so a modest GatewayHello payload — with its GatewayVersion + /// padded out to 1024 characters — exceeds the limit + /// without allocating anything large. + /// + [Fact] + public async Task WriteAsync_WithEnvelopeAboveConfiguredMaximum_ThrowsMessageTooLarge() + { + const int maxMessageBytes = 64; + WorkerFrameProtocolOptions options = new( + SessionId, + GatewayContractInfo.WorkerProtocolVersion, + Nonce, + maxMessageBytes); + using MemoryStream stream = new(); + WorkerFrameWriter writer = new(stream, options); + + WorkerEnvelope envelope = CreateGatewayHelloEnvelope(); + envelope.GatewayHello.GatewayVersion = new string('x', 1024); + + WorkerFrameProtocolException exception = + await Assert.ThrowsAsync( + async () => await writer.WriteAsync(envelope)); + + Assert.Equal(WorkerFrameProtocolErrorCode.MessageTooLarge, exception.ErrorCode); + Assert.Equal(0, stream.Length); + } + + /// + /// Worker.Tests-021 (c): documents that the writer-side + /// InvalidEnvelope branch (raised when + /// WorkerEnvelope.CalculateSize() returns 0) is unreachable + /// through public API. WorkerEnvelopeValidator.Validate (run + /// before the size check in WorkerFrameWriter.WriteAsync) + /// rejects any envelope whose BodyCase is None with + /// InvalidEnvelope; a body-less envelope is therefore + /// intercepted before the empty-payload branch can fire. Any + /// envelope carrying a typed body serialises at least the field + /// tag bytes, so CalculateSize() is strictly positive. This + /// test exercises the body-less path and asserts the same + /// InvalidEnvelope error code reaches the caller, pinning + /// the contract that "no body" is rejected before any size check.
+ /// The defensive zero-length branch in WriteAsync is left + /// in place because the cost is one comparison and removing it + /// would weaken the writer against future serialisation + /// regressions; this test makes its rationale visible. + /// + [Fact] + public async Task WriteAsync_WithEmptyEnvelope_ThrowsInvalidEnvelopeFromValidator() + { + WorkerFrameProtocolOptions options = CreateOptions(); + using MemoryStream stream = new(); + WorkerFrameWriter writer = new(stream, options); + + WorkerEnvelope envelope = new() + { + ProtocolVersion = GatewayContractInfo.WorkerProtocolVersion, + SessionId = SessionId, + Sequence = 1, + // No body — BodyCase == None, validator rejects. + }; + + WorkerFrameProtocolException exception = + await Assert.ThrowsAsync( + async () => await writer.WriteAsync(envelope)); + + Assert.Equal(WorkerFrameProtocolErrorCode.InvalidEnvelope, exception.ErrorCode); + Assert.Equal(0, stream.Length); + } + /// Verifies that concurrent writes produce complete serialized frames. [Fact] public async Task WriteAsync_WithConcurrentCalls_SerializesCompleteFrames() diff --git a/src/MxGateway.Worker.Tests/Ipc/WorkerPipeSessionTests.cs b/src/MxGateway.Worker.Tests/Ipc/WorkerPipeSessionTests.cs index 7cf1960..762c13a 100644 --- a/src/MxGateway.Worker.Tests/Ipc/WorkerPipeSessionTests.cs +++ b/src/MxGateway.Worker.Tests/Ipc/WorkerPipeSessionTests.cs @@ -10,7 +10,6 @@ using MxGateway.Contracts; using MxGateway.Contracts.Proto; using MxGateway.Worker.Ipc; using MxGateway.Worker.MxAccess; -using MxGateway.Worker.Sta; using MxGateway.Worker.Tests.TestSupport; namespace MxGateway.Worker.Tests.Ipc; @@ -290,7 +289,14 @@ public sealed class WorkerPipeSessionTests } - /// Verifies that stale STA activity triggers watchdog fault. + /// + /// Verifies that stale STA activity with no command in flight triggers + /// the watchdog StaHung fault. 
Worker-017 changed the watchdog to skip + /// the fault while a command is in flight (the worker is busy + /// executing it, not hung), so this test deliberately leaves the + /// current-command correlation id empty to assert the genuine-hung + /// path still fires. + /// [Fact] public async Task RunAsync_WhenStaActivityIsStale_WritesWatchdogFault() { @@ -302,7 +308,7 @@ public sealed class WorkerPipeSessionTests pendingCommandCount: 0, outboundEventQueueDepth: 0, lastEventSequence: 0, - currentCommandCorrelationId: "stuck-command")); + currentCommandCorrelationId: string.Empty)); WorkerPipeSession session = CreatePipeSession( pipePair.WorkerStream, runtime, @@ -320,12 +326,75 @@ public sealed class WorkerPipeSessionTests cancellation.Token); Assert.Equal(WorkerFaultCategory.StaHung, fault.WorkerFault.Category); - Assert.Equal("stuck-command", fault.WorkerFault.CommandMethod); Assert.Contains("STA activity is stale", fault.WorkerFault.DiagnosticMessage); await SendShutdownAndWaitAsync(pipePair, runTask, cancellation.Token); } + /// + /// Worker-017 regression: while a command is in flight (snapshot's + /// current command correlation id is non-empty), stale STA activity + /// must NOT trigger the watchdog StaHung fault. The STA is busy + /// executing the command, not hung; StaRuntime.ProcessQueuedCommands + /// only calls MarkActivity() before and after each work item, + /// so a synchronously long-running command (e.g. ReadBulk + /// waiting timeout_ms for OnDataChange) legitimately freezes + /// LastActivityUtc. The heartbeat already advertises the + /// in-flight correlation id so the gateway can apply its own per-command + /// timeout. 
+ /// + [Fact] + public async Task RunAsync_WhenStaActivityIsStaleWithCommandInFlight_DoesNotWriteWatchdogFault() + { + using CancellationTokenSource cancellation = new(TimeSpan.FromSeconds(10)); + using PipePair pipePair = await PipePair.CreateAsync(cancellation.Token); + FakeRuntimeSession runtime = new(); + runtime.SetSnapshot(new WorkerRuntimeHeartbeatSnapshot( + DateTimeOffset.UtcNow - TimeSpan.FromSeconds(5), + pendingCommandCount: 0, + outboundEventQueueDepth: 0, + lastEventSequence: 0, + currentCommandCorrelationId: "slow-bulk-read")); + WorkerPipeSession session = CreatePipeSession( + pipePair.WorkerStream, + runtime, + new WorkerPipeSessionOptions + { + HeartbeatInterval = TimeSpan.FromMilliseconds(20), + HeartbeatGrace = TimeSpan.FromMilliseconds(50), + }); + Task runTask = session.RunAsync(cancellation.Token); + await CompleteGatewayHandshakeAsync(pipePair, cancellation.Token); + + // Read several frames over a window much larger than HeartbeatGrace. + // None must be a WorkerFault; multiple heartbeats must all carry the + // in-flight correlation id. Reading a bounded count of frames keeps + // the pipe frame-aligned for the subsequent shutdown handshake. 
+ const int framesToInspect = 6; + int heartbeatsObserved = 0; + for (int index = 0; index < framesToInspect; index++) + { + WorkerEnvelope envelope = await pipePair.GatewayReader + .ReadAsync(cancellation.Token); + Assert.NotEqual( + WorkerEnvelope.BodyOneofCase.WorkerFault, + envelope.BodyCase); + if (envelope.BodyCase == WorkerEnvelope.BodyOneofCase.WorkerHeartbeat) + { + Assert.Equal( + "slow-bulk-read", + envelope.WorkerHeartbeat.CurrentCommandCorrelationId); + heartbeatsObserved++; + } + } + + Assert.True( + heartbeatsObserved >= 2, + $"Expected multiple heartbeats during in-flight command window; observed {heartbeatsObserved}."); + + await SendShutdownAndWaitAsync(pipePair, runTask, cancellation.Token); + } + /// /// Worker-004 regression: once the watchdog reports an StaHung fault, /// subsequent heartbeats must report @@ -531,6 +600,89 @@ public sealed class WorkerPipeSessionTests await runTask; } + /// + /// Worker.Tests-017 regression: the WorkerCancel branch of + /// must + /// forward the envelope's correlation id to the runtime session via + /// and keep the + /// message loop running (no fault, no exit). The handler dispatch + /// returns true (keep reading), so a subsequent + /// WorkerShutdown still produces the normal shutdown ack. 
+ /// + [Fact] + public async Task RunAsync_WhenGatewaySendsWorkerCancel_ForwardsCorrelationIdToRuntimeSession() + { + using CancellationTokenSource cancellation = new(TimeSpan.FromSeconds(5)); + using PipePair pipePair = await PipePair.CreateAsync(cancellation.Token); + FakeRuntimeSession runtime = new(); + WorkerPipeSession session = CreatePipeSession( + pipePair.WorkerStream, + runtime, + new WorkerPipeSessionOptions + { + HeartbeatInterval = TimeSpan.FromSeconds(1), + HeartbeatGrace = TimeSpan.FromSeconds(5), + }); + Task runTask = session.RunAsync(cancellation.Token); + await CompleteGatewayHandshakeAsync(pipePair, cancellation.Token); + + await pipePair.GatewayWriter + .WriteAsync(CreateCancelEnvelope("cancel-correlation-1"), cancellation.Token); + + // The session must remain in its message loop: send a follow-up + // shutdown and observe the normal ack. If WorkerCancel had faulted + // the pipe or exited the loop, the ack would never arrive. + await SendShutdownAndWaitAsync(pipePair, runTask, cancellation.Token); + + Assert.Contains("cancel-correlation-1", runtime.CancelledCorrelationIds); + } + + /// + /// Worker.Tests-017 regression: the default: arm of + /// must + /// throw with + /// + /// when the gateway sends an envelope body that is invalid + /// post-handshake (here a second GatewayHello) and must exit + /// the message loop — + /// surfaces the exception to the caller. The message loop does not + /// emit a fault frame on this path (the handshake catch in + /// CompleteStartupHandshakeAsync is what writes faults for + /// pre-handshake protocol violations); the contract this test pins + /// is the exception type/error-code and message-loop exit. 
+ /// + [Fact] + public async Task RunAsync_WhenGatewaySendsUnexpectedEnvelopeBodyAfterHandshake_ThrowsAndExitsMessageLoop() + { + using CancellationTokenSource cancellation = new(TimeSpan.FromSeconds(10)); + using PipePair pipePair = await PipePair.CreateAsync(cancellation.Token); + FakeRuntimeSession runtime = new(); + // Use a long heartbeat interval so no heartbeat frame fires during + // the test window. With no heartbeats and no fault frame written on + // the unexpected-body path, the gateway pipe receives nothing after + // the handshake — no drain task is needed. + WorkerPipeSession session = CreatePipeSession( + pipePair.WorkerStream, + runtime, + new WorkerPipeSessionOptions + { + HeartbeatInterval = TimeSpan.FromSeconds(30), + HeartbeatGrace = TimeSpan.FromSeconds(60), + }); + Task runTask = session.RunAsync(cancellation.Token); + await CompleteGatewayHandshakeAsync(pipePair, cancellation.Token); + + // Send a second GatewayHello — valid envelope, invalid for the + // post-handshake state, so DispatchGatewayEnvelopeAsync falls to + // the default arm. 
+ await pipePair.GatewayWriter + .WriteAsync(CreateGatewayHelloEnvelope(), cancellation.Token); + + WorkerFrameProtocolException exception = + await Assert.ThrowsAsync(async () => await runTask); + Assert.Equal(WorkerFrameProtocolErrorCode.UnexpectedEnvelopeBody, exception.ErrorCode); + } + /// /// Worker-002 regression: the first heartbeat must be emitted /// immediately on entering the heartbeat loop, not after a full @@ -707,6 +859,21 @@ public sealed class WorkerPipeSessionTests }; } + private static WorkerEnvelope CreateCancelEnvelope(string correlationId) + { + return new WorkerEnvelope + { + ProtocolVersion = GatewayContractInfo.WorkerProtocolVersion, + SessionId = SessionId, + Sequence = 4, + CorrelationId = correlationId, + WorkerCancel = new WorkerCancel + { + Reason = "test-cancel", + }, + }; + } + private static WorkerEnvelope CreateShutdownEnvelope() { return new WorkerEnvelope diff --git a/src/MxGateway.Worker.Tests/MxAccess/AlarmCommandExecutorTests.cs b/src/MxGateway.Worker.Tests/MxAccess/AlarmCommandExecutorTests.cs index 4e13b9c..d2d9f4b 100644 --- a/src/MxGateway.Worker.Tests/MxAccess/AlarmCommandExecutorTests.cs +++ b/src/MxGateway.Worker.Tests/MxAccess/AlarmCommandExecutorTests.cs @@ -317,81 +317,18 @@ public sealed class AlarmCommandExecutorTests private static MxAccessCommandExecutor NewExecutor(IAlarmCommandHandler? alarmHandler) { // Construct an executor with a no-op data session — we only exercise - // the alarm switch arms, which never touch the data session. + // the alarm switch arms, which never touch the data session. The + // session is built through the internal MxAccessSession.CreateForTesting + // factory (exposed via [assembly: InternalsVisibleTo("MxGateway.Worker.Tests")] + // on MxGateway.Worker), so no reflection is needed. 
return new MxAccessCommandExecutor( - session: NoopMxAccessSession.Create(), + session: MxAccessSession.CreateForTesting( + mxAccessServer: new NoopMxAccessServer(), + eventSink: new NoopEventSink()), variantConverter: new MxGateway.Worker.Conversion.VariantConverter(), alarmCommandHandler: alarmHandler); } - /// - /// Reflection-based helper to construct an MxAccessSession without - /// a real COM object. Only the alarm-side code paths are exercised - /// in this test class, so the session reference is never - /// dereferenced. - /// - private static class NoopMxAccessSession - { - public static MxAccessSession Create() - { - // Walk to the private constructor via reflection — the public - // factory MxAccessSession.Create(...) requires a real COM object. - // Signature mirrors MxAccessSession's private ctor; the - // MxAccessValueCache slot was added when ReadBulk gained the - // cached-vs-snapshot fork. - System.Reflection.ConstructorInfo? ctor = typeof(MxAccessSession) - .GetConstructor( - System.Reflection.BindingFlags.NonPublic | System.Reflection.BindingFlags.Instance, - binder: null, - types: new[] - { - typeof(object), - typeof(IMxAccessServer), - typeof(IMxAccessEventSink), - typeof(MxAccessHandleRegistry), - typeof(MxAccessValueCache), - typeof(int), - }, - modifiers: null); - if (ctor is null) - { - throw new InvalidOperationException( - "MxAccessSession private ctor signature changed; update the test seam."); - } - return (MxAccessSession)ctor.Invoke(new object[] - { - new object(), - new NullMxAccessServer(), - new NoopEventSink(), - new MxAccessHandleRegistry(), - new MxAccessValueCache(), - System.Environment.CurrentManagedThreadId, - }); - } - } - - private sealed class NullMxAccessServer : IMxAccessServer - { - public int Register(string clientName) => 0; - public void Unregister(int serverHandle) { } - public int AddItem(int serverHandle, string itemDefinition) => 0; - public int AddItem2(int serverHandle, string itemDefinition, string 
itemContext) => 0; - public void RemoveItem(int serverHandle, int itemHandle) { } - public void Advise(int serverHandle, int itemHandle) { } - public void UnAdvise(int serverHandle, int itemHandle) { } - public void AdviseSupervisory(int serverHandle, int itemHandle) { } - public int AddBufferedItem(int serverHandle, string itemDefinition, string itemContext) => 0; - public void SetBufferedUpdateInterval(int serverHandle, int updateIntervalMilliseconds) { } - public void Suspend(int serverHandle, int itemHandle) { } - public void Activate(int serverHandle, int itemHandle) { } - public void Write(int serverHandle, int itemHandle, object? value, int userId) { } - public void Write2(int serverHandle, int itemHandle, object? value, object? timestampValue, int userId) { } - public void WriteSecured(int serverHandle, int itemHandle, int currentUserId, int verifierUserId, object? value) { } - public void WriteSecured2(int serverHandle, int itemHandle, int currentUserId, int verifierUserId, object? value, object? timestampValue) { } - public int AuthenticateUser(string userName, string password) => 0; - public int ArchestrAUserToId(string userName) => 0; - } - private sealed class FakeAlarmHandler : IAlarmCommandHandler { public string? LastSubscription { get; private set; } diff --git a/src/MxGateway.Worker.Tests/MxAccess/AlarmCommandHandlerTests.cs b/src/MxGateway.Worker.Tests/MxAccess/AlarmCommandHandlerTests.cs index 1e41b5e..597a0b4 100644 --- a/src/MxGateway.Worker.Tests/MxAccess/AlarmCommandHandlerTests.cs +++ b/src/MxGateway.Worker.Tests/MxAccess/AlarmCommandHandlerTests.cs @@ -39,6 +39,19 @@ public sealed class AlarmCommandHandlerTests () => handler.Subscribe(@"\\HOST\Galaxy!B", "s1")); } + /// + /// Worker.Tests-024: pins both the disposal contract and the + /// origin of the propagated exception. 
The fake throws + /// InvalidOperationException("simulated wnwrap subscribe failure") + /// from Subscribe; the handler must propagate that exact + /// exception (not swallow it and rethrow its own) and dispose the + /// just-constructed consumer so a retry can build a fresh one. + /// Pinning the message guards against a regression where the + /// handler throws a different + /// (for example its own "already subscribed" guard) and the + /// disposal assertion alone would still pass while hiding the + /// real swallow. + /// [Fact] public void Subscribe_WhenUnderlyingSubscribeThrows_DisposesConsumer() { @@ -47,8 +60,9 @@ public sealed class AlarmCommandHandlerTests new MxAccessEventQueue(), () => consumer); - Assert.Throws( + InvalidOperationException exception = Assert.Throws( () => handler.Subscribe(@"\\HOST\Galaxy!A", "s1")); + Assert.Contains("simulated wnwrap subscribe failure", exception.Message); Assert.False(handler.IsSubscribed); Assert.True(consumer.Disposed); } diff --git a/src/MxGateway.Worker.Tests/MxAccess/MxAccessLiveComCreationTests.cs b/src/MxGateway.Worker.Tests/MxAccess/MxAccessLiveComCreationTests.cs index 0a6d56b..ba95b3d 100644 --- a/src/MxGateway.Worker.Tests/MxAccess/MxAccessLiveComCreationTests.cs +++ b/src/MxGateway.Worker.Tests/MxAccess/MxAccessLiveComCreationTests.cs @@ -3,6 +3,7 @@ using System.Threading.Tasks; using MxGateway.Contracts.Proto; using MxGateway.Worker.MxAccess; using MxGateway.Worker.Sta; +using MxGateway.Worker.Tests.TestSupport; namespace MxGateway.Worker.Tests.MxAccess; @@ -14,31 +15,18 @@ public sealed class MxAccessLiveComCreationTests private const string DefaultLiveAddItem2Context = "TestChildObject"; /// Verifies that StartAsync creates the installed MXAccess COM object on the STA thread when opted in. 
- [Fact] + [LiveMxAccessFact] public async Task StartAsync_WhenOptedIn_CreatesInstalledMxAccessComObjectOnSta() { - if (!string.Equals( - Environment.GetEnvironmentVariable("MXGATEWAY_RUN_LIVE_MXACCESS_TESTS"), - "1", - StringComparison.Ordinal)) - { - return; - } - using MxAccessStaSession session = new(); await session.StartAsync(workerProcessId: 1234); } /// Verifies that Register and Unregister round-trip server handles with installed MXAccess. - [Fact] + [LiveMxAccessFact] public async Task RegisterAndUnregister_WhenOptedIn_RoundTripsInstalledMxAccessServerHandle() { - if (!RunLiveMxAccessTests()) - { - return; - } - using MxAccessStaSession session = new(); await session.StartAsync(workerProcessId: 1234); @@ -73,14 +61,9 @@ public sealed class MxAccessLiveComCreationTests } /// Verifies that AddItem and RemoveItem round-trip item handles with installed MXAccess. - [Fact] + [LiveMxAccessFact] public async Task AddItemAndRemoveItem_WhenOptedIn_RoundTripsInstalledMxAccessItemHandle() { - if (!RunLiveMxAccessTests()) - { - return; - } - using MxAccessStaSession session = new(); await session.StartAsync(workerProcessId: 1234); @@ -146,14 +129,9 @@ public sealed class MxAccessLiveComCreationTests } /// Verifies that AddItem2 and RemoveItem preserve item context with installed MXAccess. - [Fact] + [LiveMxAccessFact] public async Task AddItem2AndRemoveItem_WhenOptedIn_PreservesContextForInstalledMxAccess() { - if (!RunLiveMxAccessTests()) - { - return; - } - using MxAccessStaSession session = new(); await session.StartAsync(workerProcessId: 1234); @@ -220,14 +198,9 @@ public sealed class MxAccessLiveComCreationTests } /// Verifies that Advise and UnAdvise round-trip subscriptions with installed MXAccess. 
- [Fact] + [LiveMxAccessFact] public async Task AdviseAndUnAdvise_WhenOptedIn_RoundTripsInstalledMxAccessSubscription() { - if (!RunLiveMxAccessTests()) - { - return; - } - using MxAccessStaSession session = new(); await session.StartAsync(workerProcessId: 1234); @@ -341,14 +314,6 @@ public sealed class MxAccessLiveComCreationTests } } - private static bool RunLiveMxAccessTests() - { - return string.Equals( - Environment.GetEnvironmentVariable("MXGATEWAY_RUN_LIVE_MXACCESS_TESTS"), - "1", - StringComparison.Ordinal); - } - private static string GetLiveAddItemReference() { string itemReference = Environment.GetEnvironmentVariable("MXGATEWAY_LIVE_MXACCESS_ITEM"); diff --git a/src/MxGateway.Worker.Tests/MxAccess/MxAccessStaSessionTests.cs b/src/MxGateway.Worker.Tests/MxAccess/MxAccessStaSessionTests.cs index 887d836..0d4ea6f 100644 --- a/src/MxGateway.Worker.Tests/MxAccess/MxAccessStaSessionTests.cs +++ b/src/MxGateway.Worker.Tests/MxAccess/MxAccessStaSessionTests.cs @@ -388,6 +388,55 @@ public sealed class MxAccessStaSessionTests Assert.Equal(typeof(System.Runtime.InteropServices.COMException).FullName, fault.ExceptionType); } + /// + /// Worker-016 regression: the alarm poll loop's catch for the graceful + /// STA-runtime-shutdown signal must NOT also swallow a vanilla + /// raised from inside the marshalled + /// poll lambda — for example the STA-affinity assertion thrown by + /// EnsureOnAlarmConsumerThread if a regression ever caused the poll + /// to run off the alarm-consumer thread. The runtime-shutdown signal is now + /// the dedicated ; a plain + /// from PollOnce must reach + /// the fault-recording arm and become observable on the event queue. 
+ /// + [Fact] + public async Task RunAlarmPollLoop_WhenPollOnceThrowsInvalidOperation_RecordsFaultOnEventQueue() + { + FakeAlarmCommandHandler handler = new() + { + PollException = new InvalidOperationException( + "Alarm consumer accessed off its owning STA thread."), + }; + FakeMxAccessComObjectFactory factory = new(); + FakeMxAccessEventSink eventSink = new(); + using StaRuntime runtime = CreateRuntime(); + MxAccessEventQueue eventQueue = new(); + using MxAccessStaSession session = new( + runtime, + factory, + eventSink, + eventQueue, + _eq => handler); + + await session.StartAsync("session-1", workerProcessId: 1); + + using CancellationTokenSource timeout = new CancellationTokenSource(TimeSpan.FromSeconds(5)); + while (!eventQueue.IsFaulted && !timeout.IsCancellationRequested) + { + await Task.Delay(50, CancellationToken.None); + } + + Assert.True( + eventQueue.IsFaulted, + "Expected the alarm poll InvalidOperationException to fault the event queue, " + + "not be silently swallowed as a shutdown signal."); + WorkerFault? 
fault = session.DrainFault(); + Assert.NotNull(fault); + Assert.Equal(WorkerFaultCategory.MxaccessEventConversionFailed, fault!.Category); + Assert.Equal(typeof(InvalidOperationException).FullName, fault.ExceptionType); + Assert.Contains("alarm poll failed", fault.DiagnosticMessage, StringComparison.OrdinalIgnoreCase); + } + /// /// Worker-008 regression: the STA-affinity guard throws when an /// IMxAccessAlarmConsumer call is attempted off the thread that created diff --git a/src/MxGateway.Worker.Tests/MxAccess/MxAccessValueCacheTests.cs b/src/MxGateway.Worker.Tests/MxAccess/MxAccessValueCacheTests.cs index b69eb76..2b8ab3d 100644 --- a/src/MxGateway.Worker.Tests/MxAccess/MxAccessValueCacheTests.cs +++ b/src/MxGateway.Worker.Tests/MxAccess/MxAccessValueCacheTests.cs @@ -1,6 +1,5 @@ using System; using System.Collections.Generic; -using System.Diagnostics; using System.Threading; using System.Threading.Tasks; using Google.Protobuf.WellKnownTypes; @@ -84,27 +83,47 @@ public sealed class MxAccessValueCacheTests Assert.Equal(2UL, cache.CurrentVersion(7, 21)); } + /// + /// Worker.Tests-020: pins the contract that TryWaitForUpdate + /// returns false when the deadline has elapsed with no + /// Set, yields a default CachedValue, and invokes + /// pumpStep at least once so MXAccess Windows messages can + /// be dispatched. Earlier revisions of this test asserted both an + /// elapsed-time floor (stopwatch.ElapsedMilliseconds >= 60) + /// and pumpCalls > 1 — the same wall-clock-floor race + /// pattern Worker.Tests-003/004/013 corrected. To eliminate the + /// timing dependency entirely (the equivalent of a manual time + /// source for a DateTime.UtcNow-based deadline), the test + /// now supplies a deadline already in the past: the loop pumps + /// once, observes the passed deadline, and returns false + /// deterministically without any Thread.Sleep. 
The + /// deadline-honouring contract is what this test exists to pin; + /// elapsed time and pump-iteration count are incidental. + /// [Fact] public void TryWaitForUpdate_ReturnsFalseAfterDeadline_WhenNoSetOccurs() { MxAccessValueCache cache = new(); int pumpCalls = 0; - Stopwatch stopwatch = Stopwatch.StartNew(); + + // Deadline already in the past — eliminates the wall-clock-floor + // race. The loop must pump once (so MXAccess messages can dispatch + // on the calling thread even when the deadline has just expired) + // and then immediately observe the passed deadline. + DateTime expiredDeadlineUtc = DateTime.UtcNow.AddMilliseconds(-1); bool result = cache.TryWaitForUpdate( serverHandle: 7, itemHandle: 21, sinceVersion: 0, - deadlineUtc: DateTime.UtcNow.AddMilliseconds(80), + deadlineUtc: expiredDeadlineUtc, pumpStep: () => Interlocked.Increment(ref pumpCalls), out MxAccessValueCache.CachedValue value, pollIntervalMs: 5); - stopwatch.Stop(); Assert.False(result); Assert.Equal(default, value.Value); - Assert.True(pumpCalls > 1, $"pumpCalls={pumpCalls}: pump step should fire each poll iteration so MXAccess events can dispatch."); - Assert.True(stopwatch.ElapsedMilliseconds >= 60, $"elapsed={stopwatch.ElapsedMilliseconds}ms: wait should approximate the deadline."); + Assert.Equal(1, pumpCalls); } [Fact] diff --git a/src/MxGateway.Worker.Tests/MxAccess/WnWrapAlarmConsumerXmlTests.cs b/src/MxGateway.Worker.Tests/MxAccess/WnWrapAlarmConsumerXmlTests.cs index 096ae02..811e068 100644 --- a/src/MxGateway.Worker.Tests/MxAccess/WnWrapAlarmConsumerXmlTests.cs +++ b/src/MxGateway.Worker.Tests/MxAccess/WnWrapAlarmConsumerXmlTests.cs @@ -1,4 +1,5 @@ using System; +using System.Collections.Generic; using System.Linq; using System.Reflection; using System.Threading; @@ -149,4 +150,171 @@ public sealed class WnWrapAlarmConsumerXmlTests && parameter.Name.IndexOf("poll", StringComparison.OrdinalIgnoreCase) >= 0); } } + + /// + /// Worker.Tests-022: pins the "new alarm sighting" 
branch of + /// . A GUID + /// that appears in next but not in previous must + /// produce exactly one transition with + /// as the previous + /// state — the proto layer relies on this sentinel to map a + /// first sighting to a Raise. + /// + [Fact] + public void ComputeTransitions_WhenAlarmIsNewInNextSnapshot_EmitsTransitionWithUnspecifiedPreviousState() + { + Guid alarmGuid = new Guid("BCC47053-9542-4D65-BDAA-BCDEA6A32A73"); + Dictionary previous = new(); + Dictionary next = new() + { + [alarmGuid] = NewRecord(alarmGuid, MxAlarmStateKind.UnackAlm), + }; + + IReadOnlyList transitions = + WnWrapAlarmConsumer.ComputeTransitions(previous, next); + + MxAlarmTransitionEvent single = Assert.Single(transitions); + Assert.Equal(alarmGuid, single.Record.AlarmGuid); + Assert.Equal(MxAlarmStateKind.UnackAlm, single.Record.State); + Assert.Equal(MxAlarmStateKind.Unspecified, single.PreviousState); + } + + /// + /// Worker.Tests-022: pins the "state unchanged" branch. A GUID + /// present in both snapshots with identical + /// must produce no + /// transition — a regression that emits a transition every poll + /// regardless of state change would slip through without this + /// test. + /// + [Fact] + public void ComputeTransitions_WhenAlarmStateUnchanged_EmitsNoTransition() + { + Guid alarmGuid = Guid.NewGuid(); + Dictionary previous = new() + { + [alarmGuid] = NewRecord(alarmGuid, MxAlarmStateKind.UnackAlm), + }; + Dictionary next = new() + { + [alarmGuid] = NewRecord(alarmGuid, MxAlarmStateKind.UnackAlm), + }; + + IReadOnlyList transitions = + WnWrapAlarmConsumer.ComputeTransitions(previous, next); + + Assert.Empty(transitions); + } + + /// + /// Worker.Tests-022: pins the "state changed" branch. A GUID + /// present in both snapshots with a different state must produce + /// one transition carrying the prior state so the proto layer + /// can distinguish e.g. UnackAlmAckAlm + /// (Acknowledge) from UnspecifiedUnackAlm (Raise). 
+ /// + [Fact] + public void ComputeTransitions_WhenAlarmStateChanged_EmitsTransitionWithPriorState() + { + Guid alarmGuid = Guid.NewGuid(); + Dictionary previous = new() + { + [alarmGuid] = NewRecord(alarmGuid, MxAlarmStateKind.UnackAlm), + }; + Dictionary next = new() + { + [alarmGuid] = NewRecord(alarmGuid, MxAlarmStateKind.AckAlm), + }; + + IReadOnlyList transitions = + WnWrapAlarmConsumer.ComputeTransitions(previous, next); + + MxAlarmTransitionEvent single = Assert.Single(transitions); + Assert.Equal(alarmGuid, single.Record.AlarmGuid); + Assert.Equal(MxAlarmStateKind.AckAlm, single.Record.State); + Assert.Equal(MxAlarmStateKind.UnackAlm, single.PreviousState); + } + + /// + /// Worker.Tests-022: pins the "alarm cleared from the active set" + /// branch. AVEVA drops cleared alarms from + /// GetXmlCurrentAlarms2's active set rather than emitting a + /// transition record. A GUID present in + /// previous but absent from next must therefore + /// produce no transition; the diff treats disappearance as an + /// implicit clear that the proto layer recognises by the missing + /// GUID, not by an emitted event. + /// + [Fact] + public void ComputeTransitions_WhenAlarmDroppedFromActiveSet_EmitsNoTransition() + { + Guid alarmGuid = Guid.NewGuid(); + Dictionary previous = new() + { + [alarmGuid] = NewRecord(alarmGuid, MxAlarmStateKind.UnackAlm), + }; + Dictionary next = new(); + + IReadOnlyList transitions = + WnWrapAlarmConsumer.ComputeTransitions(previous, next); + + Assert.Empty(transitions); + } + + /// + /// Worker.Tests-022: pins the multi-alarm fan-out. Multiple + /// simultaneous transitions (new + changed + unchanged + dropped) + /// in one snapshot must produce exactly the changed and new + /// entries — not the unchanged and not the dropped. 
+ /// + [Fact] + public void ComputeTransitions_WithMixedDelta_EmitsOnlyNewAndChangedTransitions() + { + Guid newGuid = Guid.NewGuid(); + Guid changedGuid = Guid.NewGuid(); + Guid unchangedGuid = Guid.NewGuid(); + Guid droppedGuid = Guid.NewGuid(); + + Dictionary previous = new() + { + [changedGuid] = NewRecord(changedGuid, MxAlarmStateKind.UnackAlm), + [unchangedGuid] = NewRecord(unchangedGuid, MxAlarmStateKind.AckAlm), + [droppedGuid] = NewRecord(droppedGuid, MxAlarmStateKind.UnackAlm), + }; + Dictionary next = new() + { + [newGuid] = NewRecord(newGuid, MxAlarmStateKind.UnackAlm), + [changedGuid] = NewRecord(changedGuid, MxAlarmStateKind.AckAlm), + [unchangedGuid] = NewRecord(unchangedGuid, MxAlarmStateKind.AckAlm), + }; + + IReadOnlyList transitions = + WnWrapAlarmConsumer.ComputeTransitions(previous, next); + + Assert.Equal(2, transitions.Count); + + MxAlarmTransitionEvent newTransition = Assert.Single( + transitions, + t => t.Record.AlarmGuid == newGuid); + Assert.Equal(MxAlarmStateKind.Unspecified, newTransition.PreviousState); + Assert.Equal(MxAlarmStateKind.UnackAlm, newTransition.Record.State); + + MxAlarmTransitionEvent changedTransition = Assert.Single( + transitions, + t => t.Record.AlarmGuid == changedGuid); + Assert.Equal(MxAlarmStateKind.UnackAlm, changedTransition.PreviousState); + Assert.Equal(MxAlarmStateKind.AckAlm, changedTransition.Record.State); + } + + private static MxAlarmSnapshotRecord NewRecord(Guid guid, MxAlarmStateKind state) + { + return new MxAlarmSnapshotRecord + { + AlarmGuid = guid, + State = state, + TagName = "TestMachine.TestAlarm", + ProviderNode = "TEST-NODE", + ProviderName = "Galaxy", + }; + } } diff --git a/src/MxGateway.Worker.Tests/AlarmClientWmProbeTests.cs b/src/MxGateway.Worker.Tests/Probes/AlarmClientWmProbeTests.cs similarity index 99% rename from src/MxGateway.Worker.Tests/AlarmClientWmProbeTests.cs rename to src/MxGateway.Worker.Tests/Probes/AlarmClientWmProbeTests.cs index ad716de..39432fc 100644 --- 
a/src/MxGateway.Worker.Tests/AlarmClientWmProbeTests.cs +++ b/src/MxGateway.Worker.Tests/Probes/AlarmClientWmProbeTests.cs @@ -141,7 +141,7 @@ public sealed class AlarmClientWmProbeTests : IDisposable } [Fact(Skip = "Runtime probe — flip Skip=null on the dev rig (AVEVA installed) to capture alarm-path behavior")] - public void ProbeAlarmClientWmMessages() + public void ProbeAlarmClient_OnDevRig_LogsAlarmWindowMessages() { // 1. Pre-resolve a few candidate RegisterWindowMessage strings so any // matches in the captured log can be labeled. None of these is diff --git a/src/MxGateway.Worker.Tests/AlarmsLiveSmokeTests.cs b/src/MxGateway.Worker.Tests/Probes/AlarmsLiveSmokeTests.cs similarity index 99% rename from src/MxGateway.Worker.Tests/AlarmsLiveSmokeTests.cs rename to src/MxGateway.Worker.Tests/Probes/AlarmsLiveSmokeTests.cs index 07c0bfa..77c1aba 100644 --- a/src/MxGateway.Worker.Tests/AlarmsLiveSmokeTests.cs +++ b/src/MxGateway.Worker.Tests/Probes/AlarmsLiveSmokeTests.cs @@ -42,7 +42,7 @@ public sealed class AlarmsLiveSmokeTests } [Fact(Skip = "Live dev-rig smoke test — flip Skip=null with AVEVA + the alarm flip script running. Verified working 2026-05-01.")] - public void Alarms_full_pipeline_round_trip() + public void Alarms_FullPipelineRoundTrip_RaisesAndAcknowledges() { Exception? 
threadException = null; var done = new ManualResetEventSlim(false); diff --git a/src/MxGateway.Worker.Tests/WnWrapConsumerProbeTests.cs b/src/MxGateway.Worker.Tests/Probes/WnWrapConsumerProbeTests.cs similarity index 99% rename from src/MxGateway.Worker.Tests/WnWrapConsumerProbeTests.cs rename to src/MxGateway.Worker.Tests/Probes/WnWrapConsumerProbeTests.cs index 3840e7d..e12d6fe 100644 --- a/src/MxGateway.Worker.Tests/WnWrapConsumerProbeTests.cs +++ b/src/MxGateway.Worker.Tests/Probes/WnWrapConsumerProbeTests.cs @@ -52,7 +52,7 @@ public sealed class WnWrapConsumerProbeTests } [Fact(Skip = "Runtime probe — flip Skip=null on the dev rig (AVEVA installed) to capture wnwrapConsumer XML alarm output. Verified working 2026-05-01.")] - public void ProbeWnWrapConsumer() + public void ProbeWnWrapConsumer_OnDevRig_LogsXmlAlarmStream() { Exception? threadException = null; var done = new ManualResetEventSlim(false); diff --git a/src/MxGateway.Worker.Tests/Sta/StaRuntimeTests.cs b/src/MxGateway.Worker.Tests/Sta/StaRuntimeTests.cs index b9562bd..8320c70 100644 --- a/src/MxGateway.Worker.Tests/Sta/StaRuntimeTests.cs +++ b/src/MxGateway.Worker.Tests/Sta/StaRuntimeTests.cs @@ -99,7 +99,16 @@ public sealed class StaRuntimeTests Assert.Equal(runtime.StaThreadId, threadId); } - /// Verifies that InvokeAsync returns a faulted task when called after Shutdown. + /// + /// Verifies that InvokeAsync returns a faulted task when called after + /// Shutdown. Worker-016 introduced + /// (a dedicated subtype of ) so + /// callers — notably MxAccessStaSession.RunAlarmPollLoopAsync — + /// can distinguish the graceful shutdown signal from a vanilla + /// such as an STA-affinity + /// assertion. The test pins the exact type so a regression that + /// reverts to a plain InvalidOperationException fails here. 
+ /// [Fact] public async Task InvokeAsync_AfterShutdown_ReturnsFaultedTask() { @@ -108,7 +117,7 @@ public sealed class StaRuntimeTests runtime.Start(); runtime.Shutdown(TimeSpan.FromSeconds(2)); - InvalidOperationException exception = await Assert.ThrowsAsync( + StaRuntimeShutdownException exception = await Assert.ThrowsAsync( () => runtime.InvokeAsync(() => Thread.CurrentThread.ManagedThreadId)); Assert.Contains("shutting down", exception.Message); diff --git a/src/MxGateway.Worker.Tests/TestSupport/FakeRuntimeSession.cs b/src/MxGateway.Worker.Tests/TestSupport/FakeRuntimeSession.cs index 47a553e..2926381 100644 --- a/src/MxGateway.Worker.Tests/TestSupport/FakeRuntimeSession.cs +++ b/src/MxGateway.Worker.Tests/TestSupport/FakeRuntimeSession.cs @@ -23,6 +23,7 @@ internal sealed class FakeRuntimeSession : IWorkerRuntimeSession private readonly ManualResetEventSlim releaseDispatch = new(false); private readonly object gate = new(); private readonly Queue events = new(); + private readonly List cancelledCorrelationIds = new(); private WorkerRuntimeHeartbeatSnapshot snapshot = new( DateTimeOffset.UtcNow, pendingCommandCount: 0, @@ -148,12 +149,41 @@ internal sealed class FakeRuntimeSession : IWorkerRuntimeSession return null; } + /// + /// Gets a snapshot of every correlation id passed to + /// . Recording lets the IPC tests + /// assert that a WorkerCancel envelope dispatched on the + /// gateway side reaches the runtime session — see Worker.Tests-017. + /// + public IReadOnlyList CancelledCorrelationIds + { + get + { + lock (gate) + { + return new List(cancelledCorrelationIds); + } + } + } + + /// + /// Optional return value yielded by . + /// Defaults to false (the runtime had no matching in-flight + /// command), matching the previous test-double behaviour. + /// + public bool CancelCommandReturnValue { get; set; } + /// Cancels command by correlation ID. /// The command correlation ID. /// True if cancelled; false otherwise. 
public bool CancelCommand(string correlationId) { - return false; + lock (gate) + { + cancelledCorrelationIds.Add(correlationId); + } + + return CancelCommandReturnValue; } /// Requests graceful shutdown. diff --git a/src/MxGateway.Worker.Tests/TestSupport/LiveMxAccessFactAttribute.cs b/src/MxGateway.Worker.Tests/TestSupport/LiveMxAccessFactAttribute.cs new file mode 100644 index 0000000..bf1030c --- /dev/null +++ b/src/MxGateway.Worker.Tests/TestSupport/LiveMxAccessFactAttribute.cs @@ -0,0 +1,36 @@ +using System; + +namespace MxGateway.Worker.Tests.TestSupport; + +/// +/// Marks an xUnit test as requiring installed MXAccess COM and live +/// provider state. When the opt-in environment variable +/// MXGATEWAY_RUN_LIVE_MXACCESS_TESTS is not set to 1, the +/// test is reported as Skipped by xUnit rather than silently +/// returning early (which xUnit would otherwise report as +/// Passed). Mirrors +/// MxGateway.IntegrationTests.LiveMxAccessFactAttribute; the +/// copy avoids a cross-project reference and keeps the Worker.Tests +/// net48/x86 build self-contained. +/// +public sealed class LiveMxAccessFactAttribute : FactAttribute +{ + /// + /// The environment variable that opts the suite into running live + /// MXAccess COM tests. Must be set to 1 on a machine with the + /// installed MXAccess runtime and a reachable Galaxy provider. + /// + public const string LiveMxAccessVariableName = "MXGATEWAY_RUN_LIVE_MXACCESS_TESTS"; + + /// Initializes the attribute, skipping the test unless the env var is set. 
+ public LiveMxAccessFactAttribute() + { + if (!string.Equals( + Environment.GetEnvironmentVariable(LiveMxAccessVariableName), + "1", + StringComparison.Ordinal)) + { + Skip = $"Set {LiveMxAccessVariableName}=1 to run live MXAccess tests."; + } + } +} diff --git a/src/MxGateway.Worker.Tests/TestSupport/NoopMxAccessServer.cs b/src/MxGateway.Worker.Tests/TestSupport/NoopMxAccessServer.cs new file mode 100644 index 0000000..a62034e --- /dev/null +++ b/src/MxGateway.Worker.Tests/TestSupport/NoopMxAccessServer.cs @@ -0,0 +1,92 @@ +using MxGateway.Worker.MxAccess; + +namespace MxGateway.Worker.Tests.TestSupport; + +/// +/// Shared no-operation IMxAccessServer for tests that need to +/// construct an MxAccessSession via +/// MxAccessSession.CreateForTesting but do not exercise any +/// MXAccess COM call. Replaces the per-file NullMxAccessServer copy +/// that previously lived inside AlarmCommandExecutorTests and was +/// constructed via reflection — see Worker.Tests-016 for the rationale. +/// +internal sealed class NoopMxAccessServer : IMxAccessServer +{ + /// + public int Register(string clientName) => 0; + + /// + public void Unregister(int serverHandle) + { + } + + /// + public int AddItem(int serverHandle, string itemDefinition) => 0; + + /// + public int AddItem2(int serverHandle, string itemDefinition, string itemContext) => 0; + + /// + public void RemoveItem(int serverHandle, int itemHandle) + { + } + + /// + public void Advise(int serverHandle, int itemHandle) + { + } + + /// + public void UnAdvise(int serverHandle, int itemHandle) + { + } + + /// + public void AdviseSupervisory(int serverHandle, int itemHandle) + { + } + + /// + public int AddBufferedItem(int serverHandle, string itemDefinition, string itemContext) => 0; + + /// + public void SetBufferedUpdateInterval(int serverHandle, int updateIntervalMilliseconds) + { + } + + /// + public void Suspend(int serverHandle, int itemHandle) + { + } + + /// + public void Activate(int serverHandle, int itemHandle) + { + } + + /// + public void Write(int serverHandle, int 
itemHandle, object? value, int userId) + { + } + + /// + public void Write2(int serverHandle, int itemHandle, object? value, object? timestampValue, int userId) + { + } + + /// + public void WriteSecured(int serverHandle, int itemHandle, int currentUserId, int verifierUserId, object? value) + { + } + + /// + public void WriteSecured2(int serverHandle, int itemHandle, int currentUserId, int verifierUserId, object? value, object? timestampValue) + { + } + + /// + public int AuthenticateUser(string userName, string password) => 0; + + /// + public int ArchestrAUserToId(string userName) => 0; +} diff --git a/src/MxGateway.Worker/Ipc/WorkerPipeSession.cs b/src/MxGateway.Worker/Ipc/WorkerPipeSession.cs index 6af6751..7adc38e 100644 --- a/src/MxGateway.Worker/Ipc/WorkerPipeSession.cs +++ b/src/MxGateway.Worker/Ipc/WorkerPipeSession.cs @@ -402,7 +402,15 @@ public sealed class WorkerPipeSession try { MxCommandReply reply = await runtimeSession.DispatchAsync(staCommand).ConfigureAwait(false); - if (_state is not WorkerState.Ready and not WorkerState.ExecutingCommand) + // _state is only ever assigned Starting, Handshaking, InitializingSta, + // Ready, ShuttingDown, Faulted, or Stopped — never ExecutingCommand + // (that value is synthesized in CreateHeartbeat from the live + // CurrentCommandCorrelationId and never written back to _state). So + // the only command-serving state is Ready; anything else means a + // state transition (shutdown / fault) raced the command's + // completion and we must drop the reply rather than write into a + // half-torn-down pipe. 
+ if (_state != WorkerState.Ready) { LogCommandResultDropped(envelope.CorrelationId, staCommand.MethodName); return; @@ -420,7 +428,7 @@ public sealed class WorkerPipeSession } catch (Exception exception) when (exception is not OperationCanceledException) { - if (_state is not WorkerState.Ready and not WorkerState.ExecutingCommand) + if (_state != WorkerState.Ready) { LogCommandResultDropped(envelope.CorrelationId, staCommand.MethodName); return; @@ -599,6 +607,24 @@ public sealed class WorkerPipeSession } } + /// + /// The watchdog detects a hung STA (no thread activity for longer than + /// HeartbeatGrace) and emits an StaHung fault. Design + /// intent: catch a stuck STA thread, not a legitimately long-running + /// command. StaRuntime.ProcessQueuedCommands calls + /// MarkActivity() only immediately before and after + /// workItem.Execute(), so a synchronously long-running STA + /// command (e.g. ReadBulk waiting timeout_ms for the + /// first OnDataChange callback) freezes LastActivityUtc for the + /// duration of the wait even though the worker is healthy. To avoid + /// self-faulting a healthy in-flight command (Worker-017), the + /// watchdog is suppressed while CurrentCommandCorrelationId is + /// non-empty — the worker already advertises the in-flight command on + /// each heartbeat, so the gateway has the signal it needs to decide + /// the command is just slow. The watchdog still fires on a truly hung + /// STA (no command in flight and no activity), which is the only case + /// the watchdog can usefully distinguish from a slow command. + /// private async Task ReportWatchdogFaultIfNeededAsync( WorkerRuntimeHeartbeatSnapshot snapshot, CancellationToken cancellationToken) @@ -610,6 +636,17 @@ public sealed class WorkerPipeSession return; } + if (!string.IsNullOrEmpty(snapshot.CurrentCommandCorrelationId)) + { + // A command is in flight — the STA is busy executing it, not + // hung. 
The next MarkActivity() in StaRuntime.ProcessQueuedCommands + // will refresh LastActivityUtc once the command returns, at which + // point this branch stops being taken. The heartbeat already + // surfaces the in-flight correlation id so the gateway can apply + // its own per-command timeout if it considers the command too slow. + return; + } + if (_watchdogFaultSent) { return; @@ -789,16 +826,28 @@ public sealed class WorkerPipeSession private async Task InitializeMxAccessAsync(CancellationToken cancellationToken) { - _runtimeSession = new MxAccessStaSession(eq => new AlarmCommandHandler(eq)); + // RunAsync constructs the runtime session via _runtimeSessionFactory() + // before invoking CompleteStartupHandshakeAsync, so on the production + // path _runtimeSession is already non-null when this default + // initializer runs. Treat that pre-existing instance as authoritative + // and only drive its StartAsync — unconditionally reassigning + // _runtimeSession here would leak the factory-supplied session (no + // Dispose) and replace it with a hard-coded MxAccessStaSession, + // discarding the factory's configuration. The fall-back construction + // is preserved for the legacy direct-invocation path where the + // parameterless CompleteStartupHandshakeAsync is used without a + // prior factory call. 
+ _runtimeSession ??= new MxAccessStaSession(eq => new AlarmCommandHandler(eq)); + IWorkerRuntimeSession session = _runtimeSession; try { - return await _runtimeSession + return await session .StartAsync(_options.SessionId, _processIdProvider(), cancellationToken) .ConfigureAwait(false); } catch { - _runtimeSession.Dispose(); + session.Dispose(); _runtimeSession = null; throw; } diff --git a/src/MxGateway.Worker/MxAccess/MxAccessSession.cs b/src/MxGateway.Worker/MxAccess/MxAccessSession.cs index 1aab7b4..6758616 100644 --- a/src/MxGateway.Worker/MxAccess/MxAccessSession.cs +++ b/src/MxGateway.Worker/MxAccess/MxAccessSession.cs @@ -58,6 +58,35 @@ public sealed class MxAccessSession : IDisposable }; } + /// + /// Test-only seam: constructs a session that bypasses the live COM + /// factory. The caller supplies the IMxAccessServer and + /// IMxAccessEventSink directly so tests can exercise + /// session methods without touching MXAccess COM. This is exposed via + /// InternalsVisibleTo("MxGateway.Worker.Tests"); production code + /// must use the factory. + /// + /// The server abstraction to drive. + /// The event sink to attach to the session. + /// Optional handle registry; a fresh one is created when null. + /// Optional value cache; a fresh one is created when null. + /// Optional creation thread id; defaults to the current managed thread id. + internal static MxAccessSession CreateForTesting( + IMxAccessServer mxAccessServer, + IMxAccessEventSink eventSink, + MxAccessHandleRegistry? handleRegistry = null, + MxAccessValueCache? valueCache = null, + int? creationThreadId = null) + { + return new MxAccessSession( + new object(), + mxAccessServer, + eventSink, + handleRegistry ?? new MxAccessHandleRegistry(), + valueCache ?? new MxAccessValueCache(), + creationThreadId ?? Environment.CurrentManagedThreadId); + } + /// Creates and initializes an MXAccess COM session. /// Factory to create the MXAccess COM object. /// Event sink to attach to the COM object. 
diff --git a/src/MxGateway.Worker/MxAccess/MxAccessStaSession.cs b/src/MxGateway.Worker/MxAccess/MxAccessStaSession.cs index 4c040ff..adbf5b2 100644 --- a/src/MxGateway.Worker/MxAccess/MxAccessStaSession.cs +++ b/src/MxGateway.Worker/MxAccess/MxAccessStaSession.cs @@ -258,19 +258,31 @@ public sealed class MxAccessStaSession : IWorkerRuntimeSession // STA runtime or alarm handler disposed — stop the loop gracefully. return; } - catch (InvalidOperationException) + catch (StaRuntimeShutdownException) { // STA runtime shutting down — stop the loop gracefully. + // The dedicated shutdown type lets us distinguish this + // graceful-stop signal from the STA-affinity assertion + // raised by EnsureOnAlarmConsumerThread (Worker-008), + // which is also an InvalidOperationException but signals + // a programming-error regression — that case falls through + // to the generic Exception arm below and is recorded as a + // fault on the event queue, so an affinity regression + // becomes observable on the IPC fault path instead of + // silently stopping alarm delivery. return; } catch (Exception exception) { // A real alarm-poll failure (COMException from - // GetXmlCurrentAlarms2, malformed-XML parse failure, etc.). - // Record it as a fault on the event queue so a broken - // alarm subscription becomes observable on the IPC fault - // path instead of silently faulting this never-awaited - // task. The loop then stops — the subscription is dead. + // GetXmlCurrentAlarms2, malformed-XML parse failure, an + // STA-affinity InvalidOperationException from + // EnsureOnAlarmConsumerThread, etc.). Record it as a + // fault on the event queue so a broken alarm subscription + // — or an affinity-invariant regression — becomes + // observable on the IPC fault path instead of silently + // faulting this never-awaited task. The loop then stops — + // the subscription is dead. 
eventQueue.RecordFault(CreateAlarmPollFault(exception)); return; } diff --git a/src/MxGateway.Worker/MxAccess/MxAlarmSnapshot.cs b/src/MxGateway.Worker/MxAccess/MxAlarmSnapshot.cs index 23c1816..67b46e0 100644 --- a/src/MxGateway.Worker/MxAccess/MxAlarmSnapshot.cs +++ b/src/MxGateway.Worker/MxAccess/MxAlarmSnapshot.cs @@ -2,22 +2,6 @@ using System; namespace MxGateway.Worker.MxAccess; -/// -/// Library-agnostic alarm-state enum. Mirrors the four STATE -/// values returned by AVEVA's WNWRAPCONSUMERLib XML payload — -/// UNACK_ALM, ACK_ALM, UNACK_RTN, ACK_RTN. -/// Decoupling the consumer from any specific COM library keeps the -/// proto-build path testable without an AVEVA install. -/// -public enum MxAlarmStateKind -{ - Unspecified = 0, - UnackAlm = 1, - AckAlm = 2, - UnackRtn = 3, - AckRtn = 4, -} - /// /// Single alarm record as emitted by the wnwrapConsumer XML stream. /// Field names match the captured XML schema (see @@ -40,20 +24,3 @@ public sealed class MxAlarmSnapshotRecord public string OperatorName { get; set; } = string.Empty; public string AlarmComment { get; set; } = string.Empty; } - -/// -/// One transition emitted by the consumer's snapshot diff. Pairs the -/// latest record with its previous state so the proto layer can decide -/// whether the transition is a Raise / Acknowledge / Clear. -/// -public sealed class MxAlarmTransitionEvent : EventArgs -{ - public MxAlarmSnapshotRecord Record { get; set; } = new MxAlarmSnapshotRecord(); - - /// - /// The state on the consumer's previous polled snapshot, or - /// when this is the - /// first time the GUID has been observed. 
- /// - public MxAlarmStateKind PreviousState { get; set; } -} diff --git a/src/MxGateway.Worker/MxAccess/MxAlarmStateKind.cs b/src/MxGateway.Worker/MxAccess/MxAlarmStateKind.cs new file mode 100644 index 0000000..0176b81 --- /dev/null +++ b/src/MxGateway.Worker/MxAccess/MxAlarmStateKind.cs @@ -0,0 +1,17 @@ +namespace MxGateway.Worker.MxAccess; + +/// +/// Library-agnostic alarm-state enum. Mirrors the four STATE +/// values returned by AVEVA's WNWRAPCONSUMERLib XML payload — +/// UNACK_ALM, ACK_ALM, UNACK_RTN, ACK_RTN. +/// Decoupling the consumer from any specific COM library keeps the +/// proto-build path testable without an AVEVA install. +/// +public enum MxAlarmStateKind +{ + Unspecified = 0, + UnackAlm = 1, + AckAlm = 2, + UnackRtn = 3, + AckRtn = 4, +} diff --git a/src/MxGateway.Worker/MxAccess/MxAlarmTransitionEvent.cs b/src/MxGateway.Worker/MxAccess/MxAlarmTransitionEvent.cs new file mode 100644 index 0000000..50011db --- /dev/null +++ b/src/MxGateway.Worker/MxAccess/MxAlarmTransitionEvent.cs @@ -0,0 +1,20 @@ +using System; + +namespace MxGateway.Worker.MxAccess; + +/// +/// One transition emitted by the consumer's snapshot diff. Pairs the +/// latest record with its previous state so the proto layer can decide +/// whether the transition is a Raise / Acknowledge / Clear. +/// +public sealed class MxAlarmTransitionEvent : EventArgs +{ + public MxAlarmSnapshotRecord Record { get; set; } = new MxAlarmSnapshotRecord(); + + /// + /// The state on the consumer's previous polled snapshot, or + /// MxAlarmStateKind.Unspecified when this is the + /// first time the GUID has been observed. 
+ /// + public MxAlarmStateKind PreviousState { get; set; } +} diff --git a/src/MxGateway.Worker/MxAccess/WnWrapAlarmConsumer.cs b/src/MxGateway.Worker/MxAccess/WnWrapAlarmConsumer.cs index caee1d4..5f331d5 100644 --- a/src/MxGateway.Worker/MxAccess/WnWrapAlarmConsumer.cs +++ b/src/MxGateway.Worker/MxAccess/WnWrapAlarmConsumer.cs @@ -56,7 +56,6 @@ public sealed class WnWrapAlarmConsumer : IMxAccessAlarmConsumer private wwAlarmConsumerClass? client; private wwAlarmConsumerClass? ackClient; - private string subscriptionExpression = string.Empty; private bool subscribed; private bool disposed; @@ -157,8 +156,28 @@ public sealed class WnWrapAlarmConsumer : IMxAccessAlarmConsumer // also breaks AlarmAckByName on the same consumer (rejects with // -55), so a separate ack-only consumer is provisioned below // that gets only Initialize/Register/Subscribe (no SetXmlAlarmQuery). + // + // The wnwrap interop signature is `void SetXmlAlarmQuery(string)` + // — there is no integer return code to gate on like the other v1 + // lifecycle calls in this method. A genuine failure surfaces as a + // COM exception (mapped from the underlying HRESULT). Wrap the + // call so a failure becomes an InvalidOperationException with + // diagnostic context, matching the other call-gates' failure + // shape rather than letting an opaque COMException escape with + // no indication that the alarm subscription is now misconfigured + // and the next GetXmlCurrentAlarms2 poll will fail with E_FAIL. string xmlQuery = ComposeXmlAlarmQuery(subscription); - com.SetXmlAlarmQuery(xmlQuery); + try + { + com.SetXmlAlarmQuery(xmlQuery); + } + catch (COMException ex) + { + throw new InvalidOperationException( + $"wwAlarmConsumer.SetXmlAlarmQuery failed with HRESULT 0x{ex.HResult:X8}; " + + "subsequent GetXmlCurrentAlarms2 polls would return E_FAIL.", + ex); + } // Provision a parallel COM consumer for ack calls. 
It runs the // v1 lifecycle (Initialize/Register/Subscribe) only; without @@ -185,7 +204,6 @@ public sealed class WnWrapAlarmConsumer : IMxAccessAlarmConsumer $"Ack consumer setup returned non-zero status: " + $"Initialize={ackInit}, Register={ackReg}, Subscribe={ackSub}."); } - subscriptionExpression = subscription; subscribed = true; } @@ -303,23 +321,10 @@ public sealed class WnWrapAlarmConsumer : IMxAccessAlarmConsumer Dictionary next = ParseSnapshotXml(xml); - List transitions = new List(); + IReadOnlyList transitions; lock (syncRoot) { - foreach (KeyValuePair kv in next) - { - MxAlarmStateKind previousState = MxAlarmStateKind.Unspecified; - if (latestSnapshot.TryGetValue(kv.Key, out MxAlarmSnapshotRecord? prev)) - { - previousState = prev.State; - if (previousState == kv.Value.State) continue; // no transition - } - transitions.Add(new MxAlarmTransitionEvent - { - Record = kv.Value, - PreviousState = previousState, - }); - } + transitions = ComputeTransitions(latestSnapshot, next); latestSnapshot.Clear(); foreach (KeyValuePair kv in next) { @@ -336,6 +341,52 @@ public sealed class WnWrapAlarmConsumer : IMxAccessAlarmConsumer } } + /// + /// Pure snapshot-to-transitions diff. Compares the previous polled + /// snapshot to the next snapshot and produces one + /// MxAlarmTransitionEvent per state change. Used by + /// the snapshot poll after a successful + /// GetXmlCurrentAlarms2 call; exposed as internal static + /// so the diff rules can be unit-tested without driving the + /// wnwrapConsumer COM object (Worker.Tests-022). + /// + /// + /// Rules: + /// + /// A GUID present in next but not in previous produces a transition with MxAlarmStateKind.Unspecified as the previous state — first sighting. + /// A GUID present in both with the same State produces no transition. + /// A GUID present in both with a different State produces a transition carrying the prior state. + /// A GUID present in previous but absent from next produces no transition. AVEVA drops cleared alarms from the active set; the snapshot simply stops mentioning them. 
+ /// + /// + /// The snapshot from the previous poll (or empty on first call). + /// The snapshot just parsed from GetXmlCurrentAlarms2. + /// One transition per state change in next. + internal static IReadOnlyList ComputeTransitions( + Dictionary previous, + Dictionary next) + { + if (previous is null) throw new ArgumentNullException(nameof(previous)); + if (next is null) throw new ArgumentNullException(nameof(next)); + + List transitions = new List(); + foreach (KeyValuePair kv in next) + { + MxAlarmStateKind previousState = MxAlarmStateKind.Unspecified; + if (previous.TryGetValue(kv.Key, out MxAlarmSnapshotRecord? prev)) + { + previousState = prev.State; + if (previousState == kv.Value.State) continue; // no transition + } + transitions.Add(new MxAlarmTransitionEvent + { + Record = kv.Value, + PreviousState = previousState, + }); + } + return transitions; + } + /// /// Parse the XML payload returned by GetXmlCurrentAlarms2 /// into a GUID-keyed dictionary. Records with malformed GUIDs are diff --git a/src/MxGateway.Worker/Sta/StaRuntime.cs b/src/MxGateway.Worker/Sta/StaRuntime.cs index 28cc3b3..3923e1f 100644 --- a/src/MxGateway.Worker/Sta/StaRuntime.cs +++ b/src/MxGateway.Worker/Sta/StaRuntime.cs @@ -99,7 +99,7 @@ public sealed class StaRuntime : IDisposable { if (shutdownRequested) { - throw new InvalidOperationException("The worker STA runtime is shutting down."); + throw new StaRuntimeShutdownException(); } if (!startRequested) @@ -167,8 +167,7 @@ public sealed class StaRuntime : IDisposable { if (shutdownRequested) { - return Task.FromException( - new InvalidOperationException("The worker STA runtime is shutting down.")); + return Task.FromException(new StaRuntimeShutdownException()); } commandQueue.Enqueue(workItem); diff --git a/src/MxGateway.Worker/Sta/StaRuntimeShutdownException.cs b/src/MxGateway.Worker/Sta/StaRuntimeShutdownException.cs new file mode 100644 index 0000000..57e3486 --- /dev/null +++ 
b/src/MxGateway.Worker/Sta/StaRuntimeShutdownException.cs @@ -0,0 +1,35 @@ +using System; + +namespace MxGateway.Worker.Sta; + +/// +/// Thrown by StaRuntime when an operation is rejected because +/// the runtime is shutting down (or has already shut down). The dedicated +/// type lets callers distinguish a graceful shutdown signal — which should +/// stop their work loops without recording a fault — from a genuine +/// programming-error InvalidOperationException such as the +/// STA-affinity assertion in MxAccessStaSession.AssertOnAlarmConsumerThread. +/// It inherits from InvalidOperationException so existing +/// callers that catch the latter remain source-compatible. +/// +public sealed class StaRuntimeShutdownException : InvalidOperationException +{ + /// + /// Initializes a new instance of StaRuntimeShutdownException + /// with a default message. + /// + public StaRuntimeShutdownException() + : base("The worker STA runtime is shutting down.") + { + } + + /// + /// Initializes a new instance of StaRuntimeShutdownException + /// with the specified message. + /// + /// Diagnostic message. + public StaRuntimeShutdownException(string message) + : base(message) + { + } +}