From 6b5fe6aa82eb6065fe771694e1d77da0474dd8a3 Mon Sep 17 00:00:00 2001 From: Joseph Doherty Date: Wed, 17 Jun 2026 05:23:14 -0400 Subject: [PATCH] fix: resolve code-review findings (locally verified) Server-054/055/056, Contracts-020/021/022, Tests-036/038/039, IntegrationTests-030/031/032 (+033 deferred to live rig), Client.Dotnet-026/028/029 (+027 won't-fix), Client.Go-030..034, Client.Python-032..036, Client.Rust-033..038. Key fix: SessionEventDistributor orphaned a subscriber that registered after the pump completed but before disposal (Server-056) -> register paths now complete late registrants under _lifecycleLock; regression test added. The racy dashboard-mirror gRPC test made deterministic (Tests-039). Verified green locally: gateway Tests targeted classes (GatewaySession, SessionEventDistributor, GatewayOptionsValidator, ProtobufContractRoundTrip, GatewaySessionDashboardMirror) + dotnet/go/python/rust client suites. --- CLAUDE.md | 2 +- .../IMxGatewayCliClient.cs | 7 + .../MxGatewayClientCli.cs | 39 ++++-- .../MxGatewayClientCliTests.cs | 42 ++++++ .../LazyBrowseNode.cs | 6 + clients/go/README.md | 54 ++++++++ clients/go/cmd/mxgw-go/main.go | 29 +++- clients/go/cmd/mxgw-go/main_test.go | 40 ++++++ clients/python/README.md | 8 +- .../src/zb_mom_ww_mxgateway/__init__.py | 6 +- .../src/zb_mom_ww_mxgateway_cli/commands.py | 19 +-- .../tests/test_review_findings_032_to_036.py | 131 ++++++++++++++++++ clients/rust/README.md | 4 +- clients/rust/RustClientDesign.md | 8 ++ clients/rust/crates/mxgw-cli/src/main.rs | 127 +++++++++++++++-- code-reviews/Client.Dotnet/findings.md | 18 +-- code-reviews/Client.Go/findings.md | 22 +-- code-reviews/Client.Python/findings.md | 22 +-- code-reviews/Client.Rust/findings.md | 26 ++-- code-reviews/Contracts/findings.md | 14 +- code-reviews/IntegrationTests/findings.md | 18 +-- code-reviews/README.md | 82 +++++------ code-reviews/Server/findings.md | 25 +++- code-reviews/Tests/findings.md | 29 +++- docs/DesignDecisions.md | 81 ++++++++--- docs/GatewayTesting.md | 16 ++- gateway.md | 15 +- .../Generated/MxaccessGateway.cs | 3 +- .../Protos/mxaccess_gateway.proto | 3 +- .../WorkerLiveMxAccessSmokeTests.cs | 14 +- .../Sessions/GatewaySession.cs | 26 +++- .../Sessions/SessionEventDistributor.cs | 45 +++++- .../GatewayOptionsValidatorTests.cs | 87 ++++++++++++ .../ProtobufContractRoundTripTests.cs | 45 ++++++ .../GatewaySessionDashboardMirrorTests.cs | 46 +++++- .../Gateway/Sessions/GatewaySessionTests.cs | 42 +++++- .../Sessions/SessionEventDistributorTests.cs | 59 +++++++- 37 files changed, 1049 insertions(+), 211 deletions(-) create mode 100644 clients/python/tests/test_review_findings_032_to_036.py diff --git a/CLAUDE.md b/CLAUDE.md index 99ac3c3..4250aa9 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -73,7 +73,7 @@ powershell -ExecutionPolicy Bypass -File scripts/run-client-e2e-tests.ps1 - **Style guides** in `docs/style-guides/` are authoritative. Follow `CSharpStyleGuide.md` for gateway/worker/.NET-client code: file-scoped namespaces, `sealed` by default, `Async` suffix on Task-returning methods, MXAccess-aligned names (`MxStatusProxy`, `ServerHandle`, `ItemHandle`, `HResult`). - **MXAccess parity is the contract.** Don't "fix" surprising MXAccess behavior (e.g., `WriteSecured` failing before a value-bearing NMX body, distinct `OperationComplete` semantics, invalid-handle exceptions) unless the client explicitly opts into a non-parity mode. The installed MXAccess COM component is the baseline. - **Don't synthesize events.** The gateway forwards only events the worker emits; it never invents `OperationComplete` from write completion or command replies. -- **One worker per session, one event subscriber per session** (v1). Multi-subscriber fan-out and reconnectable sessions are explicitly out of scope — see `docs/DesignDecisions.md`. +- **One worker per session** (invariant). Multi-subscriber event fan-out and reconnect-with-replay have shipped and are config-gated: `AllowMultipleEventSubscribers` (default `false`) enables fan-out up to `MaxEventSubscribersPerSession` (default `8`); `DetachGraceSeconds` (default `30`) retains a session after its last subscriber drops so clients can reconnect; `ReplayBufferCapacity` / `ReplayRetentionSeconds` control how much event history the replay ring keeps. Default config preserves the original single-subscriber, no-retention behavior. See `docs/DesignDecisions.md` and `docs/Sessions.md`. - **Gateway restart does not reattach orphan workers.** The first version terminates orphaned workers on startup; do not design code paths that assume reattachment. - **No Blazor UI component libraries.** Dashboard uses local Bootstrap CSS/JS only — do not introduce MudBlazor, Radzen, FluentUI, etc. - **Don't log secrets or full tag values by default.** API keys, passwords, `WriteSecured` payloads, and `AuthenticateUser` credentials must never reach logs. Value logging is opt-in and redacted. diff --git a/clients/dotnet/ZB.MOM.WW.MxGateway.Client.Cli/IMxGatewayCliClient.cs b/clients/dotnet/ZB.MOM.WW.MxGateway.Client.Cli/IMxGatewayCliClient.cs index 5f0b67d..39dd1f4 100644 --- a/clients/dotnet/ZB.MOM.WW.MxGateway.Client.Cli/IMxGatewayCliClient.cs +++ b/clients/dotnet/ZB.MOM.WW.MxGateway.Client.Cli/IMxGatewayCliClient.cs @@ -3,6 +3,13 @@ using ZB.MOM.WW.MxGateway.Contracts.Proto.Galaxy; namespace ZB.MOM.WW.MxGateway.Client.Cli; +/// +/// Transport seam used by the CLI to drive gateway and Galaxy Repository +/// RPCs, exposing only the operations the command surface needs. The +/// production binding is (wrapping a +/// real MxGatewayClient); tests substitute an in-memory fake so the +/// command routing can be exercised without a live gateway. +/// public interface IMxGatewayCliClient : IAsyncDisposable { /// diff --git a/clients/dotnet/ZB.MOM.WW.MxGateway.Client.Cli/MxGatewayClientCli.cs b/clients/dotnet/ZB.MOM.WW.MxGateway.Client.Cli/MxGatewayClientCli.cs index 0578372..e163420 100644 --- a/clients/dotnet/ZB.MOM.WW.MxGateway.Client.Cli/MxGatewayClientCli.cs +++ b/clients/dotnet/ZB.MOM.WW.MxGateway.Client.Cli/MxGatewayClientCli.cs @@ -153,7 +153,10 @@ public static class MxGatewayClientCli } catch (Exception exception) when (exception is not OperationCanceledException) { - string? apiKey = arguments.GetOptional("api-key"); + // Client.Dotnet-028: redact the *effective* key — from --api-key or the + // --api-key-env environment variable — so an env-var-sourced key echoed + // in a transport error never reaches stderr unredacted. + string? apiKey = TryResolveApiKey(arguments); string message = MxGatewayCliSecretRedactor.Redact(exception.Message, apiKey); if (forceJsonErrors || arguments.HasFlag("json")) @@ -278,6 +281,29 @@ public static class MxGatewayClientCli } private static string ResolveApiKey(CliArguments arguments) + { + string? apiKey = TryResolveApiKey(arguments); + if (!string.IsNullOrWhiteSpace(apiKey)) + { + return apiKey; + } + + string apiKeyEnvironmentName = arguments.GetOptional("api-key-env") + ?? "MXGATEWAY_API_KEY"; + + throw new ArgumentException( + $"Gateway API key is required. Pass --api-key or set {apiKeyEnvironmentName}."); + } + + /// + /// Resolves the effective API key from --api-key or, failing that, + /// the --api-key-env-named environment variable (default + /// MXGATEWAY_API_KEY), returning when neither + /// is set. Unlike this never throws, so the + /// error-redaction catch block can strip the env-var-sourced key from + /// output (Client.Dotnet-028) without re-raising on the absent-key path. + /// + private static string? TryResolveApiKey(CliArguments arguments) { string? apiKey = arguments.GetOptional("api-key"); if (!string.IsNullOrWhiteSpace(apiKey)) @@ -288,14 +314,7 @@ public static class MxGatewayClientCli string apiKeyEnvironmentName = arguments.GetOptional("api-key-env") ?? "MXGATEWAY_API_KEY"; - apiKey = Environment.GetEnvironmentVariable(apiKeyEnvironmentName); - if (!string.IsNullOrWhiteSpace(apiKey)) - { - return apiKey; - } - - throw new ArgumentException( - $"Gateway API key is required. Pass --api-key or set {apiKeyEnvironmentName}."); + return Environment.GetEnvironmentVariable(apiKeyEnvironmentName); } private static CancellationTokenSource CreateCancellation(CliArguments arguments, string command) @@ -303,7 +322,7 @@ public static class MxGatewayClientCli var cancellation = new CancellationTokenSource(); // Long-running streaming commands run until Ctrl+C / cancellation by default; // a caller-supplied --timeout still applies if present. - bool isLongRunning = command is "galaxy-watch"; + bool isLongRunning = command is "galaxy-watch" or "galaxy-browse"; string? rawTimeout = arguments.GetOptional("timeout"); if (isLongRunning && string.IsNullOrWhiteSpace(rawTimeout)) { diff --git a/clients/dotnet/ZB.MOM.WW.MxGateway.Client.Tests/MxGatewayClientCliTests.cs b/clients/dotnet/ZB.MOM.WW.MxGateway.Client.Tests/MxGatewayClientCliTests.cs index c049ea2..cf92ebe 100644 --- a/clients/dotnet/ZB.MOM.WW.MxGateway.Client.Tests/MxGatewayClientCliTests.cs +++ b/clients/dotnet/ZB.MOM.WW.MxGateway.Client.Tests/MxGatewayClientCliTests.cs @@ -106,6 +106,48 @@ public sealed class MxGatewayClientCliTests Assert.Contains("[redacted]", error.ToString()); } + /// + /// Client.Dotnet-028: when the API key is sourced from the env var + /// (--api-key-env path, no --api-key flag), the error-redaction + /// catch block must still resolve and redact the effective key. Regression + /// guard for the catch block reverting to GetOptional("api-key") only, + /// which is null on the env-var path and leaves the key unredacted. + /// + [Fact] + public async Task RunAsync_ErrorOutput_RedactsApiKey_WhenSourcedFromEnvironmentVariable() + { + const string envName = "MXGATEWAY_TEST_API_KEY_028"; + const string secret = "env-sourced-secret-key"; + string? previousKey = Environment.GetEnvironmentVariable(envName); + Environment.SetEnvironmentVariable(envName, secret); + + try + { + using var output = new StringWriter(); + using var error = new StringWriter(); + + int exitCode = await MxGatewayClientCli.RunAsync( + [ + "open-session", + "--endpoint", + "http://localhost:5000", + "--api-key-env", + envName, + ], + output, + error, + _ => throw new InvalidOperationException($"boom {secret}")); + + Assert.Equal(1, exitCode); + Assert.DoesNotContain(secret, error.ToString()); + Assert.Contains("[redacted]", error.ToString()); + } + finally + { + Environment.SetEnvironmentVariable(envName, previousKey); + } + } + /// Verifies that stream-events with max-events limit stops output in non-JSON format. [Fact] public async Task RunAsync_StreamEvents_WithMaxEventsStopsNonJsonOutput() diff --git a/clients/dotnet/ZB.MOM.WW.MxGateway.Client/LazyBrowseNode.cs b/clients/dotnet/ZB.MOM.WW.MxGateway.Client/LazyBrowseNode.cs index f4eb18e..777e13b 100644 --- a/clients/dotnet/ZB.MOM.WW.MxGateway.Client/LazyBrowseNode.cs +++ b/clients/dotnet/ZB.MOM.WW.MxGateway.Client/LazyBrowseNode.cs @@ -12,6 +12,12 @@ public sealed class LazyBrowseNode { private readonly GalaxyRepositoryClient _client; private readonly BrowseChildrenOptions _options; + // Client.Dotnet-027 (Won't Fix): this gate is used only via WaitAsync/Release and + // never via AvailableWaitHandle, so SemaphoreSlim allocates no kernel wait handle — + // it holds no unmanaged/OS handle to leak. It is pure managed memory whose lifetime + // is the node's, so the type is intentionally not IDisposable: making it disposable + // would push per-node disposal onto every tree consumer (thousands of nodes) for no + // resource benefit. private readonly SemaphoreSlim _expandLock = new(1, 1); // Published once, under _expandLock, when expansion completes. Lock-free readers diff --git a/clients/go/README.md b/clients/go/README.md index a1c0d1f..9aa0da8 100644 --- a/clients/go/README.md +++ b/clients/go/README.md @@ -247,24 +247,78 @@ one line per event in text mode or one JSON object per event with `-json`. The `mxgw-go` CLI emits JSON with redacted API keys for commands that connect to the gateway: +### Subcommand reference + +Every subcommand wired into the CLI. All accept the common flags +(`-endpoint`, `-plaintext`, `-api-key` / `-api-key-env`, `-ca-cert`, +`-server-name-override`, `-call-timeout`) and most accept `-json`. + +| Command | Purpose | +|---|---| +| `version` | Print client/contract versions. | +| `open-session` | Open a gateway session and print its id. | +| `close-session` | Close a session by id. | +| `ping` | Round-trip a `PING` command (`-session-id`, `-message`). | +| `register` | Register a client name on a session (`-session-id`, `-client-name`). | +| `add-item` | Add an item handle (`-session-id`, `-server-handle`, `-item`). | +| `advise` | Advise (subscribe) one item (`-session-id`, `-server-handle`, `-item-handle`). | +| `subscribe-bulk` | Advise many items in one call. | +| `unsubscribe-bulk` | Unadvise many item handles in one call. | +| `read-bulk` | Read snapshots for many item handles in one call. | +| `write` | Write one value (`-type`, `-value`). | +| `write-bulk` | Write many values (`-item-handles`, `-values`, counts must match). | +| `write2-bulk` | `write-bulk` with a shared `-timestamp-value` (RFC 3339). | +| `write-secured-bulk` | Secured bulk write (`-current-user-id`, `-verifier-user-id`). | +| `write-secured2-bulk` | Secured bulk write with a shared timestamp. | +| `bench-read-bulk` | Throughput benchmark (`-duration-seconds`, `-warmup-seconds`, `-bulk-size`). | +| `stream-events` | Stream item-value events for a session (`-session-id`, `-limit`). | +| `stream-alarms` | Stream the alarm feed (`-filter-prefix`, `-limit`). | +| `acknowledge-alarm` | Acknowledge an alarm reference. | +| `smoke` | End-to-end smoke workflow against one item. | +| `galaxy-test-connection` | Probe the Galaxy Repository RPC connection. | +| `galaxy-last-deploy` | Print the most recent deploy event. | +| `galaxy-discover` | Discover deployed objects. | +| `galaxy-watch` | Stream deploy events until Ctrl+C or `-limit`. | +| `galaxy-browse` | Lazy/eager browse of the Galaxy object tree. | +| `batch` | Read commands from stdin (see below). | + ```powershell go run ./cmd/mxgw-go version -json go run ./cmd/mxgw-go open-session -endpoint localhost:5000 -plaintext -json +go run ./cmd/mxgw-go ping -session-id -plaintext -json go run ./cmd/mxgw-go register -session-id -client-name mxgw-go -plaintext -json go run ./cmd/mxgw-go add-item -session-id -server-handle 1 -item Area001.Tag.Value -plaintext -json go run ./cmd/mxgw-go advise -session-id -server-handle 1 -item-handle 1 -plaintext -json go run ./cmd/mxgw-go write -session-id -server-handle 1 -item-handle 1 -type int32 -value 123 -plaintext -json +go run ./cmd/mxgw-go write-bulk -session-id -server-handle 1 -item-handles 1,2 -values 10,20 -type int32 -plaintext -json +go run ./cmd/mxgw-go read-bulk -session-id -item-handles 1,2 -plaintext -json go run ./cmd/mxgw-go stream-events -session-id -plaintext -json +go run ./cmd/mxgw-go stream-alarms -plaintext -json go run ./cmd/mxgw-go smoke -item Area001.Tag.Value -plaintext -json go run ./cmd/mxgw-go galaxy-test-connection -plaintext -json go run ./cmd/mxgw-go galaxy-last-deploy -plaintext -json go run ./cmd/mxgw-go galaxy-discover -plaintext -json go run ./cmd/mxgw-go galaxy-watch -plaintext -json +go run ./cmd/mxgw-go galaxy-browse -plaintext -json ``` Use `-api-key-env MXGATEWAY_API_KEY` or `-api-key ` when authentication is enabled. CLI output redacts the key value and never writes the raw secret. +### `batch` mode + +`batch` reads one command line at a time from stdin and dispatches each through +the same routing as the standalone subcommands; it is the interface the +cross-language E2E harness drives. After every command's output it writes the +end-of-result sentinel line `__MXGW_BATCH_EOR__` to stdout and flushes, so the +harness can frame each result. Blank/whitespace-only lines are skipped; only +stdin EOF ends the session. Command errors are serialised as a JSON object +(`{"error":...,"type":"error"}`) to stdout (not stderr) and still followed by the +sentinel, so a failing command does not abort the batch. The input scanner +buffer is widened to 16 MiB so a single long command line (e.g. a bulk write with +thousands of handles) does not trip bufio's default 64 KiB token-too-long limit; +a line that still exceeds 16 MiB surfaces as a framed error and ends the session. + Use TLS options for a secured gateway: ```powershell diff --git a/clients/go/cmd/mxgw-go/main.go b/clients/go/cmd/mxgw-go/main.go index 543ddef..11fadb1 100644 --- a/clients/go/cmd/mxgw-go/main.go +++ b/clients/go/cmd/mxgw-go/main.go @@ -837,7 +837,14 @@ func runStreamEvents(ctx context.Context, args []string, stdout, stderr io.Write defer client.Close() session := mxgateway.NewSessionForID(client, *sessionID) - streamCtx, cancelStream := context.WithCancel(ctx) + + // Ctrl+C on a long-running stream-events command cancels the gRPC stream + // cleanly (the gateway sees codes.Canceled rather than a torn TCP + // connection) and the deferred subscription.Close()/client.Close() run. + signalCtx, stopSignals := signal.NotifyContext(ctx, os.Interrupt, syscall.SIGTERM) + defer stopSignals() + + streamCtx, cancelStream := context.WithCancel(signalCtx) defer cancelStream() subscription, err := session.SubscribeEventsAfter(streamCtx, *after) if err != nil { @@ -1035,15 +1042,17 @@ func runSmoke(ctx context.Context, args []string, stdout, stderr io.Writer) erro } func closeSmokeSession(ctx context.Context, session *mxgateway.Session, primaryErr error) error { - closeCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second) - defer cancel() + // Compute the close timeout once so a single context (and a single + // deferred cancel) is allocated: default 5s, shortened to the caller's + // remaining deadline when that is sooner. + closeTimeout := 5 * time.Second if deadline, ok := ctx.Deadline(); ok { - if until := time.Until(deadline); until > 0 && until < 5*time.Second { - cancel() - closeCtx, cancel = context.WithTimeout(context.Background(), until) - defer cancel() + if until := time.Until(deadline); until > 0 && until < closeTimeout { + closeTimeout = until } } + closeCtx, cancel := context.WithTimeout(context.Background(), closeTimeout) + defer cancel() _, closeErr := session.Close(closeCtx) if primaryErr != nil { @@ -1490,6 +1499,12 @@ func runGalaxyWatch(ctx context.Context, args []string, stdout, stderr io.Writer count++ if *limit > 0 && count >= *limit { cancelStream() + // Drain so the WatchDeployEvents goroutine can exit instead + // of blocking on a send into the buffered events channel + // while the deferred client.Close() tears the stream down + // underneath it (mirrors the signal-cancel branch below). + for range events { + } return nil } case streamErr, ok := <-errs: diff --git a/clients/go/cmd/mxgw-go/main_test.go b/clients/go/cmd/mxgw-go/main_test.go index 34e6239..31b9194 100644 --- a/clients/go/cmd/mxgw-go/main_test.go +++ b/clients/go/cmd/mxgw-go/main_test.go @@ -537,3 +537,43 @@ func TestRunBatchHandlesLongCommandLine(t *testing.T) { t.Fatalf("EOR sentinel count = %d, want 2 (one per command, even when first is too long); out length = %d", count, len(out)) } } + +// TestRunBenchReadBulkRejectsNonPositiveDuration pins the -duration-seconds +// positivity guard so the bench window cannot be configured to zero/negative. +func TestRunBenchReadBulkRejectsNonPositiveDuration(t *testing.T) { + var stdout, stderr bytes.Buffer + err := runWithIO(t.Context(), []string{"bench-read-bulk", "-duration-seconds", "0"}, &stdout, &stderr) + if err == nil || !strings.Contains(err.Error(), "duration-seconds must be positive") { + t.Fatalf("bench-read-bulk -duration-seconds 0 error = %v", err) + } +} + +// TestRunStreamEventsRequiresSessionID pins the session-id guard so stream-events +// fails fast before dialing when no session id is supplied. +func TestRunStreamEventsRequiresSessionID(t *testing.T) { + var stdout, stderr bytes.Buffer + err := runWithIO(t.Context(), []string{"stream-events", "-plaintext", "-api-key", "test"}, &stdout, &stderr) + if err == nil || !strings.Contains(err.Error(), "session-id is required") { + t.Fatalf("stream-events without -session-id error = %v", err) + } +} + +// TestRunWriteBulkVariantRejectsMismatchedHandlesAndValues pins the len-mismatch +// guard so a write-bulk with unequal item-handles / values counts fails fast +// before any dial. +func TestRunWriteBulkVariantRejectsMismatchedHandlesAndValues(t *testing.T) { + var stdout, stderr bytes.Buffer + err := runWithIO(t.Context(), []string{ + "write-bulk", + "-session-id", "s1", + "-server-handle", "1", + "-item-handles", "1,2", + "-values", "10", + "-type", "int32", + "-plaintext", + "-api-key", "test", + }, &stdout, &stderr) + if err == nil || !strings.Contains(err.Error(), "does not match values count") { + t.Fatalf("write-bulk mismatched handles/values error = %v", err) + } +} diff --git a/clients/python/README.md b/clients/python/README.md index e166cd5..f340e3e 100644 --- a/clients/python/README.md +++ b/clients/python/README.md @@ -140,19 +140,21 @@ service requires the `metadata:read` scope on the API key. ### Browsing lazily -For UI trees or OPC UA bridges, use `browse_children` to walk one level at a +For UI trees or OPC UA bridges, use `browse_children_raw` to walk one level at a time instead of loading the full hierarchy with `discover_hierarchy`. Pass an empty request for root objects; subsequent calls set `parent_gobject_id`, `parent_tag_name`, or `parent_contained_path`. Filter fields match `DiscoverHierarchy`. Each response pairs `children` with `child_has_children` so -you know which nodes to expand. See +you know which nodes to expand. Most callers should prefer the higher-level +`browse()` / `LazyBrowseNode` walker below; `browse_children_raw` is the +low-level escape hatch for direct page-token control. See [Galaxy Repository](../../docs/GalaxyRepository.md#browsechildren) for full request and filter semantics. ```python from zb_mom_ww_mxgateway.generated import galaxy_repository_pb2 as galaxy_pb2 -reply = await galaxy.browse_children(galaxy_pb2.BrowseChildrenRequest()) +reply = await galaxy.browse_children_raw(galaxy_pb2.BrowseChildrenRequest()) for child, has_children in zip(reply.children, reply.child_has_children): print(child.tag_name, "expand=" + str(has_children)) ``` diff --git a/clients/python/src/zb_mom_ww_mxgateway/__init__.py b/clients/python/src/zb_mom_ww_mxgateway/__init__.py index af5e43d..49bace6 100644 --- a/clients/python/src/zb_mom_ww_mxgateway/__init__.py +++ b/clients/python/src/zb_mom_ww_mxgateway/__init__.py @@ -2,7 +2,7 @@ from .auth import ApiKey, auth_metadata from .client import GatewayClient -from .galaxy import GalaxyRepositoryClient +from .galaxy import GalaxyRepositoryClient, LazyBrowseNode from .generated.galaxy_repository_pb2 import ( DeployEvent, GalaxyAttribute, @@ -19,19 +19,21 @@ from .errors import ( MxGatewayTransportError, MxGatewayWorkerError, ) -from .options import ClientOptions +from .options import BrowseChildrenOptions, ClientOptions from .session import Session from .values import MxValueView, from_mx_value, to_mx_value from .version import __version__ __all__ = [ "ApiKey", + "BrowseChildrenOptions", "ClientOptions", "DeployEvent", "GalaxyAttribute", "GalaxyObject", "GalaxyRepositoryClient", "GatewayClient", + "LazyBrowseNode", "MxAccessError", "MxGatewayAuthenticationError", "MxGatewayAuthorizationError", diff --git a/clients/python/src/zb_mom_ww_mxgateway_cli/commands.py b/clients/python/src/zb_mom_ww_mxgateway_cli/commands.py index 6f11c7a..97da512 100644 --- a/clients/python/src/zb_mom_ww_mxgateway_cli/commands.py +++ b/clients/python/src/zb_mom_ww_mxgateway_cli/commands.py @@ -769,7 +769,7 @@ def _build_write_bulk_entries(kwargs: dict[str, Any]): """ handles = _parse_int_list(kwargs["item_handles"]) - value_texts = _parse_string_list(kwargs["values"]) + value_texts = _parse_string_list(kwargs["values"], param_hint="--values") if len(handles) != len(value_texts): raise click.UsageError( f"item-handles count ({len(handles)}) does not match values count ({len(value_texts)})", @@ -1045,8 +1045,7 @@ async def _write2(**kwargs: Any) -> dict[str, Any]: async def _smoke(**kwargs: Any) -> dict[str, Any]: async with await _connect(kwargs) as client: session = await client.open_session(client_session_name=kwargs["client_name"]) - closed = False - try: + async with session: server_handle = await session.register(kwargs["client_name"]) item_handle = await session.add_item(server_handle, kwargs["item"]) await session.advise(server_handle, item_handle) @@ -1061,9 +1060,6 @@ async def _smoke(**kwargs: Any) -> dict[str, Any]: "itemHandle": item_handle, "events": [_message_dict(event) for event in events], } - finally: - if not closed: - await session.close() async def _galaxy_test_connection(**kwargs: Any) -> dict[str, Any]: @@ -1487,10 +1483,10 @@ def _parse_datetime(raw_value: str) -> datetime: return parsed -def _parse_string_list(raw_value: str) -> list[str]: +def _parse_string_list(raw_value: str, param_hint: str = "--items") -> list[str]: values = [item.strip() for item in raw_value.split(",") if item.strip()] if not values: - raise click.BadParameter("at least one item is required", param_hint="--items") + raise click.BadParameter("at least one item is required", param_hint=param_hint) return values @@ -1498,7 +1494,12 @@ def _parse_int_list(raw_value: str) -> list[int]: values = [item.strip() for item in raw_value.split(",") if item.strip()] if not values: raise click.BadParameter("at least one item handle is required", param_hint="--item-handles") - return [int(item) for item in values] + try: + return [int(item) for item in values] + except ValueError as exc: + raise click.BadParameter( + f"item handles must be integers: {exc}", param_hint="--item-handles" + ) from exc def _message_dict(message: Any) -> dict[str, Any]: diff --git a/clients/python/tests/test_review_findings_032_to_036.py b/clients/python/tests/test_review_findings_032_to_036.py new file mode 100644 index 0000000..709a9aa --- /dev/null +++ b/clients/python/tests/test_review_findings_032_to_036.py @@ -0,0 +1,131 @@ +"""Regression tests for Client.Python-032..036. + +Each test corresponds to a finding from the 2026-06-16 re-review. Tests are +TDD-first — they fail against the pre-fix source and pass against the fixed +source. +""" + +from __future__ import annotations + +import inspect +import re +from pathlib import Path + +import click +import pytest + +from zb_mom_ww_mxgateway_cli import commands as cli_commands +from zb_mom_ww_mxgateway_cli.commands import _parse_int_list, _parse_string_list + + +# --------------------------------------------------------------------------- +# Client.Python-032 — `_smoke` must not carry the dead `closed` guard variable. +# --------------------------------------------------------------------------- + + +def test_smoke_does_not_carry_dead_closed_guard() -> None: + """`_smoke` must not reintroduce the dead `closed = False` / `if not closed` + guard removed by Client.Python-004. The variable is never reassigned, so the + guard misleads readers into expecting an early-close path that never exists. + """ + + source = inspect.getsource(cli_commands._smoke) + assert "closed = False" not in source, ( + "_smoke must not reintroduce the dead `closed = False` variable" + ) + assert "if not closed:" not in source, ( + "_smoke must not reintroduce the dead `if not closed:` guard" + ) + + +# --------------------------------------------------------------------------- +# Client.Python-033 — `_parse_string_list` param_hint must reflect the caller. +# --------------------------------------------------------------------------- + + +def test_parse_string_list_default_param_hint_is_items() -> None: + with pytest.raises(click.BadParameter) as exc: + _parse_string_list("") + assert exc.value.param_hint == "--items" + + +def test_parse_string_list_accepts_caller_supplied_param_hint() -> None: + """The write-bulk family passes `--values`, so an empty value must surface a + `--values` hint, not the irrelevant `--items` default. + """ + + with pytest.raises(click.BadParameter) as exc: + _parse_string_list("", param_hint="--values") + assert exc.value.param_hint == "--values" + + +# --------------------------------------------------------------------------- +# Client.Python-034 — `_parse_int_list` must re-raise non-numeric tokens as +# click.BadParameter, not a raw ValueError traceback. +# --------------------------------------------------------------------------- + + +def test_parse_int_list_non_numeric_raises_bad_parameter() -> None: + with pytest.raises(click.BadParameter) as exc: + _parse_int_list("10,abc") + assert exc.value.param_hint == "--item-handles" + + +def test_parse_int_list_happy_path() -> None: + assert _parse_int_list("10, 20 ,30") == [10, 20, 30] + + +# --------------------------------------------------------------------------- +# Client.Python-035 — public browse types must be re-exported from the package +# root. +# --------------------------------------------------------------------------- + + +def test_browse_children_options_is_exported_from_package_root() -> None: + import zb_mom_ww_mxgateway as pkg + + assert hasattr(pkg, "BrowseChildrenOptions") + assert "BrowseChildrenOptions" in pkg.__all__ + + +def test_lazy_browse_node_is_exported_from_package_root() -> None: + import zb_mom_ww_mxgateway as pkg + + assert hasattr(pkg, "LazyBrowseNode") + assert "LazyBrowseNode" in pkg.__all__ + + +# --------------------------------------------------------------------------- +# Client.Python-036 — README "Browsing lazily" example must reference a method +# that actually exists on GalaxyRepositoryClient. +# --------------------------------------------------------------------------- + + +def _readme_path() -> Path: + return Path(__file__).resolve().parent.parent / "README.md" + + +def test_galaxy_client_exposes_browse_children_raw() -> None: + """Guard the method name the README example depends on so future renames + break this test rather than only failing at runtime in user code. + """ + + from zb_mom_ww_mxgateway import GalaxyRepositoryClient + + assert hasattr(GalaxyRepositoryClient, "browse_children_raw") + + +def test_readme_browse_example_uses_existing_method() -> None: + """The README's `galaxy.(...BrowseChildrenRequest...)` call must name + a method that exists on GalaxyRepositoryClient. + """ + + from zb_mom_ww_mxgateway import GalaxyRepositoryClient + + text = _readme_path().read_text(encoding="utf-8") + called = set(re.findall(r"galaxy\.([A-Za-z_][A-Za-z0-9_]*)\s*\(", text)) + assert called, "README must contain at least one galaxy.(...) example" + for method in called: + assert hasattr(GalaxyRepositoryClient, method), ( + f"README references galaxy.{method}() but no such method exists" + ) diff --git a/clients/rust/README.md b/clients/rust/README.md index a40c67f..27779e9 100644 --- a/clients/rust/README.md +++ b/clients/rust/README.md @@ -161,7 +161,7 @@ cargo run -p mxgw-cli -- galaxy discover-hierarchy --endpoint http://localhost:5 ### Browsing lazily -For UI trees or OPC UA bridges, use `browse_children` to walk one level at a +For UI trees or OPC UA bridges, use `browse_children_raw` to walk one level at a time instead of paging the full hierarchy. Pass a default request for root objects; subsequent calls set `parent_gobject_id`, `parent_tag_name`, or `parent_contained_path`. Filter fields match `discover_hierarchy`. Each response @@ -172,7 +172,7 @@ request and filter semantics. ```rust use zb_mom_ww_mxgateway_client::generated::galaxy_repository::v1::BrowseChildrenRequest; -let reply = galaxy.browse_children(BrowseChildrenRequest::default()).await?.into_inner(); +let reply = galaxy.browse_children_raw(BrowseChildrenRequest::default()).await?; for (child, has_children) in reply.children.iter().zip(reply.child_has_children.iter()) { println!("{} expand={}", child.tag_name, has_children); } diff --git a/clients/rust/RustClientDesign.md b/clients/rust/RustClientDesign.md index 7d764c0..379bce8 100644 --- a/clients/rust/RustClientDesign.md +++ b/clients/rust/RustClientDesign.md @@ -349,8 +349,16 @@ mxgw bench-read-bulk [--duration-seconds ] [--warmup-seconds ] [--bulk-siz mxgw smoke --endpoint http://localhost:5000 --api-key-env MXGATEWAY_API_KEY --item TestChildObject.TestInt mxgw batch mxgw galaxy {test-connection,last-deploy-time,discover-hierarchy,watch} +mxgw galaxy browse [--parent-gobject-id ] [--category-id ...] [--template-contains ...] [--tag-name-glob ] [--include-attributes] [--alarm-bearing-only] [--historized-only] [--depth ] [--json] ``` +`galaxy browse` walks the hierarchy one level at a time over the raw +`BrowseChildren` paging path. `--depth 0` (the default) prints only the +requested level; `--depth N` eagerly expands N additional levels beneath each +returned node. `--parent-gobject-id` makes `--depth` a no-op (the parent's +children are returned as a single level). Omit `--parent-gobject-id` to browse +root objects. + `batch` reads commands from stdin one per line and dispatches each through the normal subcommand path; the loop terminates only on stdin EOF (blank lines log an empty-EOR-bracketed result and continue) so accidental empty diff --git a/clients/rust/crates/mxgw-cli/src/main.rs b/clients/rust/crates/mxgw-cli/src/main.rs index 299b97f..1ea2442 100644 --- a/clients/rust/crates/mxgw-cli/src/main.rs +++ b/clients/rust/crates/mxgw-cli/src/main.rs @@ -46,8 +46,6 @@ enum Command { Version { #[arg(long)] json: bool, - #[arg(long)] - jsonl: bool, }, Ping { #[command(flatten)] @@ -458,9 +456,16 @@ struct ConnectionArgs { endpoint: String, #[arg(long)] api_key: Option, + /// Name of the environment variable holding the gateway API key. The + /// variable's value must be a full gateway key of the form + /// `mxgw__`; it is forwarded verbatim as the Bearer + /// token, so do not point this at an unrelated credential. #[arg(long, default_value = "MXGATEWAY_API_KEY")] api_key_env: String, - #[arg(long)] + /// Use an unencrypted (plaintext h2c) channel. Mutually exclusive with + /// `--tls`; supplying both is rejected so an explicit `--tls` cannot be + /// silently downgraded. + #[arg(long, conflicts_with = "tls")] plaintext: bool, #[arg(long)] tls: bool, @@ -545,7 +550,7 @@ async fn dispatch(command: Command) -> Result<(), Error> { detail: "batch cannot be nested inside another batch session".to_owned(), }); } - Command::Version { json, .. } => print_version(json), + Command::Version { json } => print_version(json), Command::Ping { connection, message, @@ -1214,6 +1219,24 @@ const BROWSE_PAGE_SIZE: i32 = 500; /// Drive `BrowseChildren` paging by hand for a single parent and return the /// flattened child list. Used by the `browse --parent-gobject-id` path, which /// surfaces one level of children rather than the lazy root-tree walk. +/// Record a non-empty `next_page_token` in `seen` and reject a repeat. A +/// server that returns the same continuation token twice would loop forever, +/// so the second sighting is converted to an `InvalidArgument` error. Extracted +/// from [`browse_children_one_level`] so the guard can be unit-tested without a +/// network client. +fn register_page_token( + seen: &mut std::collections::HashSet, + token: &str, +) -> Result<(), Error> { + if !seen.insert(token.to_owned()) { + return Err(Error::InvalidArgument { + name: "page_token".to_owned(), + detail: format!("galaxy browse children returned repeated page token `{token}`"), + }); + } + Ok(()) +} + async fn browse_children_one_level( client: &mut GalaxyClient, parent_gobject_id: i32, @@ -1254,14 +1277,7 @@ async fn browse_children_one_level( if page_token.is_empty() { return Ok(children); } - if !seen.insert(page_token.clone()) { - return Err(Error::InvalidArgument { - name: "page_token".to_owned(), - detail: format!( - "galaxy browse children returned repeated page token `{page_token}`" - ), - }); - } + register_page_token(&mut seen, &page_token)?; } } @@ -2337,7 +2353,18 @@ where mod tests { use clap::Parser; - use super::Cli; + use super::{Cli, Command}; + + /// Pull the flattened `ConnectionArgs` out of a parsed `ping` command so + /// `ConnectionArgs::options()` can be exercised directly. + fn connection_from_ping(args: &[&str]) -> super::ConnectionArgs { + let mut full = vec!["mxgw", "ping"]; + full.extend_from_slice(args); + match Cli::try_parse_from(full).expect("ping parse").command { + Command::Ping { connection, .. } => connection, + other => panic!("expected ping command, got {other:?}"), + } + } #[test] fn parses_version_json_command() { @@ -2345,6 +2372,36 @@ mod tests { assert!(parsed.is_ok()); } + #[test] + fn connection_defaults_to_plaintext() { + let options = connection_from_ping(&[]).options(); + assert!(options.plaintext(), "default channel should be plaintext"); + } + + #[test] + fn connection_tls_flag_disables_plaintext() { + let options = connection_from_ping(&["--tls"]).options(); + assert!( + !options.plaintext(), + "--tls must select an encrypted channel" + ); + } + + #[test] + fn connection_plaintext_flag_selects_plaintext() { + let options = connection_from_ping(&["--plaintext"]).options(); + assert!(options.plaintext()); + } + + #[test] + fn connection_rejects_tls_and_plaintext_together() { + let parsed = Cli::try_parse_from(["mxgw", "ping", "--tls", "--plaintext"]); + assert!( + parsed.is_err(), + "--tls and --plaintext must conflict so TLS cannot be silently downgraded" + ); + } + #[test] fn parses_write_command() { let parsed = Cli::try_parse_from([ @@ -2513,6 +2570,50 @@ mod tests { assert_eq!(summary.mean, 42.0); } + #[test] + fn register_page_token_accepts_distinct_tokens_and_rejects_repeats() { + let mut seen = std::collections::HashSet::new(); + assert!(super::register_page_token(&mut seen, "tok-1").is_ok()); + assert!(super::register_page_token(&mut seen, "tok-2").is_ok()); + + let repeated = super::register_page_token(&mut seen, "tok-1"); + match repeated { + Err(super::Error::InvalidArgument { name, detail }) => { + assert_eq!(name, "page_token"); + assert!(detail.contains("tok-1"), "detail: {detail}"); + } + other => panic!("expected InvalidArgument on repeated token, got {other:?}"), + } + } + + #[test] + fn rfc3339_parser_rejects_trailing_characters() { + let err = super::parse_rfc3339_timestamp("2026-04-28T15:30:00Zextra"); + assert!(err.is_err(), "trailing chars after Z must be rejected"); + } + + #[test] + fn rfc3339_parser_rejects_day_zero() { + let err = super::parse_rfc3339_timestamp("2026-04-00T15:30:00Z"); + assert!(err.is_err(), "day 0 must be rejected"); + } + + #[test] + fn rfc3339_parser_rejects_month_thirteen() { + let err = super::parse_rfc3339_timestamp("2026-13-01T15:30:00Z"); + assert!(err.is_err(), "month 13 must be rejected"); + } + + #[test] + fn rfc3339_parser_rejects_day_out_of_range_for_month() { + // April has 30 days. + let err = super::parse_rfc3339_timestamp("2026-04-31T15:30:00Z"); + assert!(err.is_err(), "April 31 must be rejected"); + // February 29 in a non-leap year. + let feb = super::parse_rfc3339_timestamp("2025-02-29T00:00:00Z"); + assert!(feb.is_err(), "Feb 29 in a non-leap year must be rejected"); + } + #[test] fn rfc3339_parser_round_trips_z_and_offset_inputs() { // 2026-04-28T15:30:00Z = 1_777_995_000 (sanity-checked once below) diff --git a/code-reviews/Client.Dotnet/findings.md b/code-reviews/Client.Dotnet/findings.md index a108bb3..cddc11c 100644 --- a/code-reviews/Client.Dotnet/findings.md +++ b/code-reviews/Client.Dotnet/findings.md @@ -7,7 +7,7 @@ | Review date | 2026-06-16 | | Commit reviewed | `8df5ab3` | | Status | Re-reviewed | -| Open findings | 4 | +| Open findings | 0 | ## Checklist coverage @@ -628,13 +628,13 @@ Re-review of the .NET client delta: `LazyBrowseNode` lazy paging + tests, the ne | Severity | Low | | Category | Correctness & logic bugs | | Location | `clients/dotnet/.../MxGatewayClientCli.cs:306` (isLongRunning) | -| Status | Open | +| Status | Resolved | **Description:** Client.Dotnet-015 extended `isLongRunning` to include the bench commands so they aren't silently cancelled by the default 30s CTS. The new `galaxy-browse` command is NOT in `isLongRunning`. A `galaxy-browse --depth N` tree walk on a large Galaxy can exceed 30s (sequential paginated RPCs per node); the CTS fires and the OCE escapes as a non-zero exit with no output — the same silent failure the bench commands were exempted from. **Recommendation:** Add `"galaxy-browse"` to the `isLongRunning` set alongside `galaxy-watch`/bench, so it defaults to unlimited wall-clock and only applies `CancelAfter` with an explicit `--timeout`. -**Resolution:** _(empty until closed)_ +**Resolution:** 2026-06-16 — Confirmed against source: `CreateCancellation`'s `isLongRunning` expression at line 306 read `command is "galaxy-watch"` only — `galaxy-browse` was absent, so the default 30 s `CancelAfter` budget applied and a deep paginated tree walk that overran it would have the OCE escape as a non-zero exit with no output. (Note: at HEAD the bench commands the finding cites are also not in this set despite Client.Dotnet-015's recorded resolution, but per the task scope only `galaxy-browse` is added here.) Changed the expression to `command is "galaxy-watch" or "galaxy-browse"`, so `galaxy-browse` now runs to completion by default and only applies `CancelAfter` when the caller supplies an explicit `--timeout`. Pure correctness fix matching the existing `galaxy-watch` precedent. ### Client.Dotnet-027 @@ -643,13 +643,13 @@ Re-review of the .NET client delta: `LazyBrowseNode` lazy paging + tests, the ne | Severity | Low | | Category | Performance & resource management | | Location | `clients/dotnet/ZB.MOM.WW.MxGateway.Client/LazyBrowseNode.cs:15` | -| Status | Open | +| Status | Won't Fix | **Description:** `LazyBrowseNode` allocates one `SemaphoreSlim _expandLock = new(1,1)` per node and never disposes it (the type is not IDisposable). For a large Galaxy browse tree (thousands of nodes), live SemaphoreSlim instances accumulate; OS handles are released only on finalization. Negligible for small trees, meaningful for long-lived large trees. **Recommendation:** Replace the once-only async gate with a non-disposable primitive (e.g. `Lazy`-based dedup) or make `LazyBrowseNode` IDisposable and dispose the semaphore. Document the chosen lifetime contract. -**Resolution:** _(empty until closed)_ +**Resolution:** 2026-06-16 — **Won't Fix.** The finding's premise — that the undisposed semaphore leaks an OS handle until finalization — does not hold for this usage. `SemaphoreSlim` only allocates a kernel wait handle (`ManualResetEvent`) lazily, the first time its `AvailableWaitHandle` property is accessed; `LazyBrowseNode` uses the gate exclusively via `WaitAsync`/`Release` and never touches `AvailableWaitHandle` (verified by grep), so no unmanaged/OS handle is ever created. The semaphore is therefore pure managed memory whose lifetime is the node's and which is reclaimed by the GC with the node — `SemaphoreSlim.Dispose()` would have nothing to release. Making the type `IDisposable` (or restructuring to a `Lazy` gate) would change the public surface and push per-node disposal onto every tree consumer (thousands of nodes) for zero resource benefit, so it is not worth the over-engineering. Added an inline code comment at `LazyBrowseNode.cs:15` documenting this lifetime contract and the no-handle rationale so the design intent is explicit. No test added (no behavior change). ### Client.Dotnet-028 @@ -658,13 +658,13 @@ Re-review of the .NET client delta: `LazyBrowseNode` lazy paging + tests, the ne | Severity | Medium | | Category | Security | | Location | `clients/dotnet/.../MxGatewayClientCli.cs:156` | -| Status | Open | +| Status | Resolved | **Description:** Client.Dotnet-008 was recorded resolved by adding a `TryResolveApiKey` helper resolving both `--api-key` and the `--api-key-env` env-var path, wired into the error-redaction catch block. At HEAD the catch block reads `arguments.GetOptional("api-key")` only — the pre-008 code. When the key is sourced from the env var, `GetOptional("api-key")` returns null, `Redact(message, null)` is a no-op, and an exception message echoing the bearer key would print it raw to stderr. The existing regression test only covers the `--api-key` direct path, so it passes against the broken code. (Claimed regression — verify root cause before fixing.) **Recommendation:** Restore the `TryResolveApiKey` pattern (resolve `--api-key` then the `--api-key-env`-named env var, default `MXGATEWAY_API_KEY`) in the catch block, and add a regression test that sources the key from the env var and asserts it is redacted in stderr. -**Resolution:** _(empty until closed)_ +**Resolution:** 2026-06-16 — **Confirmed: real regression.** The `RunCoreAsync` catch block at line 156 resolved the redaction key via `arguments.GetOptional("api-key")` only, and no `TryResolveApiKey` helper existed anywhere in the CLI project (verified by grep) — the Client.Dotnet-008 helper had been lost from the history reaching HEAD, same as the 012/013/022/023 props/doc regressions. On the `--api-key-env` path `GetOptional("api-key")` is null, so `Redact(message, null)` was a no-op and a transport error echoing the bearer token would have reached stderr unredacted. Restored a non-throwing `TryResolveApiKey(CliArguments)` helper that resolves `--api-key` then the `--api-key-env`-named env var (default `MXGATEWAY_API_KEY`) and returns null when neither is set; refactored `ResolveApiKey` to call it (so the resolution order stays single-sourced) and changed the catch block to redact `TryResolveApiKey(arguments)` instead of `GetOptional("api-key")`. Regression test `MxGatewayClientCliTests.RunAsync_ErrorOutput_RedactsApiKey_WhenSourcedFromEnvironmentVariable` sets a dedicated env var (`MXGATEWAY_TEST_API_KEY_028`), runs `open-session --api-key-env ` (no `--api-key` flag) against a client factory that throws an `InvalidOperationException` whose message embeds the secret, and asserts exit 1, that the secret is absent from stderr, and that `[redacted]` is present. The pre-existing `--api-key`-path test (`RunAsync_ErrorOutput_RedactsApiKey`) is retained; the new test fails against the `GetOptional("api-key")`-only catch block (key printed raw) and passes after the fix. ### Client.Dotnet-029 @@ -673,10 +673,10 @@ Re-review of the .NET client delta: `LazyBrowseNode` lazy paging + tests, the ne | Severity | Low | | Category | Code organization & conventions | | Location | `clients/dotnet/.../IMxGatewayCliClient.cs:6` | -| Status | Open | +| Status | Resolved | **Description:** `IMxGatewayCliClient` is a public interface with no type-level `` XML doc. The Client.Dotnet-013 resolution recorded adding one; at HEAD it is absent. No CS1591 fires (GenerateDocumentationFile now scoped to the packable library only), but the public extension point should follow the public-surface doc convention. **Recommendation:** Add a one-line `` describing the interface and noting `MxGatewayCliClientAdapter` is the production binding. -**Resolution:** _(empty until closed)_ +**Resolution:** 2026-06-16 — Confirmed against source: the interface declaration at `IMxGatewayCliClient.cs:6` had no type-level `` (only the members were documented). Added a type-level `` describing the interface as the CLI's transport seam over the gateway and Galaxy Repository RPCs, naming `MxGatewayCliClientAdapter` (over a real `MxGatewayClient`) as the production binding and the in-memory fake as the test substitute. Pure documentation change — no test needed. diff --git a/code-reviews/Client.Go/findings.md b/code-reviews/Client.Go/findings.md index b83b4f0..e05d97d 100644 --- a/code-reviews/Client.Go/findings.md +++ b/code-reviews/Client.Go/findings.md @@ -7,7 +7,7 @@ | Review date | 2026-06-16 | | Commit reviewed | `8df5ab3` | | Status | Re-reviewed | -| Open findings | 5 | +| Open findings | 0 | ## Checklist coverage @@ -731,13 +731,13 @@ if ($dirty) { | Severity | Medium | | Category | Concurrency & thread safety | | Location | `clients/go/cmd/mxgw-go/main.go:1491-1494` | -| Status | Open | +| Status | Resolved | **Description:** `runGalaxyWatch`'s limit-reached branch calls `cancelStream()` and returns WITHOUT draining the buffered `events` channel, unlike the signal-cancel branch which drains. This is the shape Client.Go-013's resolution claimed to have fixed ("now drains via for range events"). The WatchDeployEvents goroutine may still be blocked sending into the 16-deep channel; it exits via ctx cancellation (not a permanent leak) but remains alive until that propagates, racing `defer client.Close()`. (Claimed regression — verify root cause.) **Recommendation:** After `cancelStream()` in the limit-reached branch, drain: `for range events {}`, mirroring the signal-cancel branch. -**Resolution:** _(empty until closed)_ +**Resolution:** 2026-06-16 — Confirmed real: the limit-reached branch returned right after `cancelStream()` while the signal-cancel branch drained `events`, so the buffered (16-deep) `WatchDeployEvents` producer could remain blocked on a send while `defer client.Close()` tore the stream down. Added the `for range events {}` drain to the limit-reached branch, mirroring the signal-cancel branch. Behaviour exercised by the existing `runGalaxyWatch` flow; verified via `go vet`/`go build`/`go test ./...`. ### Client.Go-031 @@ -746,13 +746,13 @@ if ($dirty) { | Severity | Low | | Category | Correctness & logic bugs | | Location | `clients/go/cmd/mxgw-go/main.go:1037-1046` | -| Status | Open | +| Status | Resolved | **Description:** `closeSmokeSession` registers `defer cancel()` twice on the same `cancel` variable across two `context.WithTimeout` calls when the deadline-shortening branch fires. Because `cancel` is reassigned, both defers end up calling the second context's cancel (idempotent, harmless today), while the first context is released by an explicit `cancel()`. The double-defer-on-reassigned-variable is fragile: removing the explicit `cancel()` in a future refactor would leak the first context's timer goroutine. **Recommendation:** Use a distinct variable for the second cancel, or compute the close timeout once before allocating a single context. -**Resolution:** _(empty until closed)_ +**Resolution:** 2026-06-16 — Confirmed real. Rewrote `closeSmokeSession` to compute the close timeout once (default 5s, shortened to the caller's remaining deadline when sooner) and then allocate a single `context.WithTimeout` with a single `defer cancel()`, removing the reassigned-variable double-defer entirely. ### Client.Go-032 @@ -761,13 +761,13 @@ if ($dirty) { | Severity | Low | | Category | Code organization & conventions | | Location | `clients/go/cmd/mxgw-go/main.go:839-841` | -| Status | Open | +| Status | Resolved | **Description:** `runStreamEvents` does not install a `signal.NotifyContext` handler, while `runStreamAlarms` and `runGalaxyWatch` do. Client.Go-020's resolution claimed this was added. Without a signal-aware parent context, Ctrl+C kills the process without running `defer subscription.Close()`/`client.Close()`, so the gateway sees a torn connection rather than a clean `codes.Canceled`. (Claimed regression — verify root cause.) **Recommendation:** Wrap `ctx` with `signal.NotifyContext(ctx, os.Interrupt, syscall.SIGTERM)` (defer the stop) before deriving `streamCtx`, matching the other two stream commands. -**Resolution:** _(empty until closed)_ +**Resolution:** 2026-06-16 — Confirmed real: `runStreamEvents` derived `streamCtx` directly from `ctx` with no signal handler (and `runStreamAlarms` even carried a "Mirror runStreamEvents" comment that no longer matched). Added `signal.NotifyContext(ctx, os.Interrupt, syscall.SIGTERM)` (with `defer stopSignals()`) before deriving `streamCtx`, so Ctrl+C/SIGTERM cancels the stream cleanly (gateway sees `codes.Canceled`) and the deferred `subscription.Close()`/`client.Close()` run. Imports already present. CLI guard covered by `TestRunStreamEventsRequiresSessionID`. ### Client.Go-033 @@ -776,13 +776,13 @@ if ($dirty) { | Severity | Low | | Category | Testing coverage | | Location | `clients/go/cmd/mxgw-go/main_test.go` | -| Status | Open | +| Status | Resolved | **Description:** Gaps vs prior coverage: (1) `TestRunBenchReadBulkRejectsNonPositiveDuration` (named in Client.Go-021's resolution) is absent — the `-duration-seconds`-positive guard at main.go:619 is untested; (2) `runStreamEvents` has no CLI-level test (session-id-required and limit paths untested); (3) `TestRunWriteBulkVariantRejectsMismatchedHandlesAndValues` (Client.Go-021 deliverable) is absent — the len-mismatch guard at main.go:508-510 is untested. **Recommendation:** Add the three missing tests; all run through `runWithIO` without a fake server (except the stream-events one which can reuse the ping test's fake-server pattern). -**Resolution:** _(empty until closed)_ +**Resolution:** 2026-06-16 — Confirmed all three tests absent. Added them to `cmd/mxgw-go/main_test.go`, each driving `runWithIO` and asserting the guard error before any dial: `TestRunBenchReadBulkRejectsNonPositiveDuration` (`-duration-seconds 0` → "duration-seconds must be positive"), `TestRunStreamEventsRequiresSessionID` (no `-session-id` → "session-id is required"), and `TestRunWriteBulkVariantRejectsMismatchedHandlesAndValues` (2 handles / 1 value → "does not match values count"). All three pass under `go test ./...`. ### Client.Go-034 @@ -791,10 +791,10 @@ if ($dirty) { | Severity | Low | | Category | Documentation & comments | | Location | `clients/go/README.md:245-263` | -| Status | Open | +| Status | Resolved | **Description:** The README CLI example table lists ~12 commands but the binary now exposes ~27 subcommands (per `writeUsage`). Absent: `ping`, `galaxy-browse`, `batch`, `read-bulk`, `write-bulk`, `write2-bulk`, `write-secured-bulk`, `write-secured2-bulk`, `bench-read-bulk`, `stream-alarms`, `acknowledge-alarm`, and more. `batch` (the cross-language harness interface with an EOR sentinel + 16 MiB line cap) is undocumented entirely. **Recommendation:** Add a complete subcommand reference, and document the `batch` EOR-sentinel protocol and line cap. -**Resolution:** _(empty until closed)_ +**Resolution:** 2026-06-16 — Expanded the README CLI section with a "Subcommand reference" table covering all 27 subcommands wired into `run` (incl. `ping`, `galaxy-browse`, `read-bulk`, the four bulk-write variants, `bench-read-bulk`, `stream-alarms`, `acknowledge-alarm`, `batch`), refreshed the example block, and added a "`batch` mode" subsection documenting the `__MXGW_BATCH_EOR__` end-of-result sentinel, the JSON error framing, blank-line skipping, and the 16 MiB scanner line cap. diff --git a/code-reviews/Client.Python/findings.md b/code-reviews/Client.Python/findings.md index e116a19..9bb1adf 100644 --- a/code-reviews/Client.Python/findings.md +++ b/code-reviews/Client.Python/findings.md @@ -7,7 +7,7 @@ | Review date | 2026-06-16 | | Commit reviewed | `8df5ab3` | | Status | Re-reviewed | -| Open findings | 5 | +| Open findings | 0 | ## Checklist coverage @@ -1463,13 +1463,13 @@ passed, 1 skipped, 0 warnings; previously 1 warning). The `tls`-marked | Severity | Low | | Category | Correctness & logic bugs | | Location | `clients/python/src/zb_mom_ww_mxgateway_cli/commands.py:1048,1065-1066` | -| Status | Open | +| Status | Resolved | **Description:** `_smoke` reintroduces the dead `closed = False` / `if not closed:` guard that Client.Python-004's resolution claimed to have removed via `async with session:`. `closed` is never reassigned, so the guard is always true. Behavior is correct (session always closed) but the dead variable misleads readers into expecting an early-close path. (Claimed regression — verify root cause.) **Recommendation:** Use `async with session:` or drop the `closed` variable and close unconditionally. -**Resolution:** _(empty until closed)_ +**Resolution:** 2026-06-16 — Confirmed regression: the dead `closed = False` / `if not closed:` guard had returned. Replaced the `try/finally` with `async with session:` (Session implements the async context-manager protocol). Test: `test_smoke_does_not_carry_dead_closed_guard` in `tests/test_review_findings_032_to_036.py`. ### Client.Python-033 @@ -1478,13 +1478,13 @@ passed, 1 skipped, 0 warnings; previously 1 warning). The `tls`-marked | Severity | Low | | Category | Correctness & logic bugs | | Location | `clients/python/src/zb_mom_ww_mxgateway_cli/commands.py:772,1490-1494` | -| Status | Open | +| Status | Resolved | **Description:** `_parse_string_list` always emits `param_hint="--items"`, but it is also called from `_build_write_bulk_entries` with `kwargs["values"]`. An empty `--values ""` on the write-bulk commands yields `Error: Invalid value for '--items': ...`, pointing at a flag that doesn't exist on those commands. **Recommendation:** Add an optional `param_hint` parameter (default `--items`) and pass `--values` from the write-bulk caller. -**Resolution:** _(empty until closed)_ +**Resolution:** 2026-06-16 — Added `param_hint="--items"` default param to `_parse_string_list`; `_build_write_bulk_entries` now passes `param_hint="--values"`. Tests: `test_parse_string_list_default_param_hint_is_items`, `test_parse_string_list_accepts_caller_supplied_param_hint` in `tests/test_review_findings_032_to_036.py`. ### Client.Python-034 @@ -1493,13 +1493,13 @@ passed, 1 skipped, 0 warnings; previously 1 warning). The `tls`-marked | Severity | Low | | Category | Correctness & logic bugs | | Location | `clients/python/src/zb_mom_ww_mxgateway_cli/commands.py:1497-1501` | -| Status | Open | +| Status | Resolved | **Description:** `_parse_int_list` does `int(item)` with no error handling. A non-numeric token (e.g. `--item-handles "10,abc"`) raises a raw `ValueError`, surfacing as an unformatted traceback interactively (other input errors raise `click.BadParameter`). **Recommendation:** Wrap the conversion and re-raise as `click.BadParameter(..., param_hint="--item-handles")`. -**Resolution:** _(empty until closed)_ +**Resolution:** 2026-06-16 — Wrapped the `int()` comprehension in `try/except ValueError` and re-raise as `click.BadParameter(..., param_hint="--item-handles")`. Tests: `test_parse_int_list_non_numeric_raises_bad_parameter`, `test_parse_int_list_happy_path` in `tests/test_review_findings_032_to_036.py`. ### Client.Python-035 @@ -1508,13 +1508,13 @@ passed, 1 skipped, 0 warnings; previously 1 warning). The `tls`-marked | Severity | Low | | Category | Code organization & conventions | | Location | `clients/python/src/zb_mom_ww_mxgateway/__init__.py`, `.../options.py:63-77`, `.../galaxy.py:293` | -| Status | Open | +| Status | Resolved | **Description:** Two new public types — `BrowseChildrenOptions` (options.py) and `LazyBrowseNode` (galaxy.py) — are absent from `__init__.py`/`__all__`, so callers can't `from zb_mom_ww_mxgateway import BrowseChildrenOptions`, breaking the package-root import contract that `ClientOptions`/`GatewayClient`/etc. follow. **Recommendation:** Re-export both from `__init__.py` and add them to `__all__`. -**Resolution:** _(empty until closed)_ +**Resolution:** 2026-06-16 — Re-exported `BrowseChildrenOptions` (from `.options`) and `LazyBrowseNode` (from `.galaxy`) in `__init__.py` and added both to `__all__`. Tests: `test_browse_children_options_is_exported_from_package_root`, `test_lazy_browse_node_is_exported_from_package_root` in `tests/test_review_findings_032_to_036.py`. ### Client.Python-036 @@ -1523,10 +1523,10 @@ passed, 1 skipped, 0 warnings; previously 1 warning). The `tls`-marked | Severity | Medium | | Category | Documentation & comments | | Location | `clients/python/README.md:143-158` | -| Status | Open | +| Status | Resolved | **Description:** The README "Browsing lazily" section's code example calls `galaxy.browse_children(...)`, a method that does not exist — the actual public low-level method is `browse_children_raw`. The example raises `AttributeError` at runtime. The README-parse test only covers shell CLI invocations, not Python code fragments, so it doesn't catch this. **Recommendation:** Update the example/prose to `browse_children_raw(...)` (and promote the high-level `browse()`/`LazyBrowseNode` path), or add a `browse_children` alias. Add a `hasattr` test to catch future renames. -**Resolution:** _(empty until closed)_ +**Resolution:** 2026-06-16 — Updated the README "Browsing lazily" prose and example to `browse_children_raw(...)` and added a pointer to the higher-level `browse()`/`LazyBrowseNode` walker. Tests: `test_galaxy_client_exposes_browse_children_raw` (hasattr guard) and `test_readme_browse_example_uses_existing_method` (parses every `galaxy.()` call in README against the client class) in `tests/test_review_findings_032_to_036.py`. diff --git a/code-reviews/Client.Rust/findings.md b/code-reviews/Client.Rust/findings.md index 21d10e7..cd67d68 100644 --- a/code-reviews/Client.Rust/findings.md +++ b/code-reviews/Client.Rust/findings.md @@ -7,7 +7,7 @@ | Review date | 2026-06-16 | | Commit reviewed | `8df5ab3` | | Status | Re-reviewed | -| Open findings | 6 | +| Open findings | 0 | ## Checklist coverage @@ -787,13 +787,13 @@ This is masked by the tests: `tls_with_require_certificate_validation_does_not_s | Severity | Medium | | Category | Correctness & logic bugs | | Location | `clients/rust/crates/mxgw-cli/src/main.rs:485` | -| Status | Open | +| Status | Resolved | **Description:** `ConnectionArgs::options()` computes plaintext as `!self.tls || self.plaintext`. With both `--tls` and `--plaintext` supplied, this is `true`, silently degrading to an unencrypted channel despite the explicit `--tls`. A security-sensitive footgun (e.g. a script auto-appending `--plaintext`). **Recommendation:** Add clap `conflicts_with = "tls"` on `--plaintext` (reject the combo), or prefer `--tls` and warn. -**Resolution:** _(empty until closed)_ +**Resolution:** 2026-06-16 — Added `conflicts_with = "tls"` to the `--plaintext` arg so supplying both is rejected at parse time, removing the silent downgrade. Tests: `connection_rejects_tls_and_plaintext_together`, `connection_tls_flag_disables_plaintext`, `connection_defaults_to_plaintext`, `connection_plaintext_flag_selects_plaintext`. ### Client.Rust-034 @@ -802,13 +802,13 @@ This is masked by the tests: `tls_with_require_certificate_validation_does_not_s | Severity | Low | | Category | Correctness & logic bugs | | Location | `clients/rust/crates/mxgw-cli/src/main.rs:48-51,548` | -| Status | Open | +| Status | Resolved | **Description:** `Command::Version` carries a `jsonl: bool` field that is never read; the dispatch arm matches `{ json, .. }` and discards `jsonl`. `mxgw version --jsonl` silently behaves as plain text. **Recommendation:** Handle `jsonl` in the Version arm (treat like `--json`) or remove the unused field. -**Resolution:** _(empty until closed)_ +**Resolution:** 2026-06-16 — Removed the unused `jsonl` field from `Command::Version` (version output is a single record, not a stream); the dispatch arm now matches `{ json }` exhaustively, so `mxgw version --jsonl` errors as an unknown flag instead of silently being ignored. No test (CLI surface change verified by build). ### Client.Rust-035 @@ -817,13 +817,13 @@ This is masked by the tests: `tls_with_require_certificate_validation_does_not_s | Severity | Low | | Category | Security | | Location | `clients/rust/crates/mxgw-cli/src/main.rs:489-495` | -| Status | Open | +| Status | Resolved | **Description:** `--api-key-env` (default `MXGATEWAY_API_KEY`) names an env var read into an `ApiKey` Bearer token, but its clap help has no description of the expected value format. A user pointing it at another credential's env var would silently forward that credential to the gateway as a Bearer token. Low risk (redacted Debug; bounded to user's own shell) but an implicit-trust gap. **Recommendation:** Add help text stating the variable must hold a value of the form `mxgw__`. -**Resolution:** _(empty until closed)_ +**Resolution:** 2026-06-16 — Added clap doc-comment help to `--api-key-env` stating the variable's value must be a full gateway key of the form `mxgw__` and is forwarded verbatim as the Bearer token. Doc/help-only change, no test. ### Client.Rust-036 @@ -832,13 +832,13 @@ This is masked by the tests: `tls_with_require_certificate_validation_does_not_s | Severity | Low | | Category | Design-document adherence | | Location | `clients/rust/RustClientDesign.md:351` | -| Status | Open | +| Status | Resolved | **Description:** The new `galaxy browse` subcommand (with its filter/depth/json flags) is not listed in the "Test CLI" command table in RustClientDesign.md, which still reads `galaxy {test-connection,last-deploy-time,discover-hierarchy,watch}`. **Recommendation:** Add `mxgw galaxy browse [...flags]` and note `--depth 0` = requested level only, `--depth N` eagerly expands, and `--parent-gobject-id` makes `--depth` a no-op. -**Resolution:** _(empty until closed)_ +**Resolution:** 2026-06-16 — Added the `mxgw galaxy browse` line (with all flags) to the CLI table and a paragraph documenting that `--depth 0` prints only the requested level, `--depth N` eagerly expands N further levels, and `--parent-gobject-id` makes `--depth` a no-op. Doc-only change, no test. ### Client.Rust-037 @@ -847,13 +847,13 @@ This is masked by the tests: `tls_with_require_certificate_validation_does_not_s | Severity | Low | | Category | Design-document adherence | | Location | `clients/rust/README.md:164-179` | -| Status | Open | +| Status | Resolved | **Description:** The README "Browsing lazily" example calls `galaxy.browse_children(...).await?.into_inner()`, but the public API is `GalaxyClient::browse_children_raw` (the bare `browse_children` is the generated proto-client method, not public; and `browse_children_raw` returns the reply struct directly, no `.into_inner()`). The example would not compile. **Recommendation:** Replace with `galaxy.browse_children_raw(BrowseChildrenRequest::default()).await?` (drop `.into_inner()`). -**Resolution:** _(empty until closed)_ +**Resolution:** 2026-06-16 — Verified `browse_children_raw` is the public method (galaxy.rs:302) and returns `BrowseChildrenReply` directly. Updated the README prose and example to call `browse_children_raw(...).await?` without `.into_inner()`. Doc-only change, no test. ### Client.Rust-038 @@ -862,10 +862,10 @@ This is masked by the tests: `tls_with_require_certificate_validation_does_not_s | Severity | Low | | Category | Testing coverage | | Location | `clients/rust/crates/mxgw-cli/src/main.rs:2336-2564` | -| Status | Open | +| Status | Resolved | **Description:** Three CLI test gaps: (1) `ConnectionArgs::options()` `--tls`/`--plaintext` resolution (incl. the both-set path of Client.Rust-033) is untested; (2) `browse_children_one_level`'s repeated-page-token guard is untested; (3) `parse_rfc3339_timestamp` has no error-path tests (trailing chars, day=0, month 13, out-of-range day). **Recommendation:** Add unit tests for all three (none need a network connection). -**Resolution:** _(empty until closed)_ +**Resolution:** 2026-06-16 — Added all three test groups. (1) `--tls`/`--plaintext` resolution: `connection_defaults_to_plaintext`, `connection_tls_flag_disables_plaintext`, `connection_plaintext_flag_selects_plaintext`, `connection_rejects_tls_and_plaintext_together`. (2) Extracted the page-token dedup guard into pure `register_page_token` and covered it with `register_page_token_accepts_distinct_tokens_and_rejects_repeats`. (3) RFC3339 error paths: `rfc3339_parser_rejects_trailing_characters`, `rfc3339_parser_rejects_day_zero`, `rfc3339_parser_rejects_month_thirteen`, `rfc3339_parser_rejects_day_out_of_range_for_month`. diff --git a/code-reviews/Contracts/findings.md b/code-reviews/Contracts/findings.md index 9d3bd14..378d849 100644 --- a/code-reviews/Contracts/findings.md +++ b/code-reviews/Contracts/findings.md @@ -7,7 +7,7 @@ | Review date | 2026-06-16 | | Commit reviewed | `8df5ab3` | | Status | Re-reviewed | -| Open findings | 3 | +| Open findings | 0 | ## Checklist coverage @@ -433,13 +433,13 @@ Re-review of the proto delta (`git diff 410acc9..8df5ab3 -- .../Protos/`): the n | Severity | Low | | Category | Design-document adherence | | Location | `gateway.md:1087,1101-1102` | -| Status | Open | +| Status | Resolved | **Description:** gateway.md still lists "no reconnectable sessions" under "Resolved for v1" and lists "reconnectable sessions" / "multi-subscriber event fan-out" as post-v1 revisit items. The shipped `ReplayGap` reconnect-replay contract and multi-subscriber fan-out (documented in docs/Sessions.md) contradict this. docs/Sessions.md was updated; gateway.md's scope summary was left stale. **Recommendation:** Update the gateway.md Resolved/Post-v1 lists to reflect that reconnectable sessions (via `after_worker_sequence` + `ReplayGap`) and multi-subscriber fan-out have shipped, cross-referencing docs/Sessions.md. -**Resolution:** _(empty until closed)_ +**Resolution:** _(2026-06-16)_ Updated `gateway.md` "Resolved for v1" list: replaced "no reconnectable sessions" / "one active event subscriber" with bullet points describing the shipped reconnect-replay (`after_worker_sequence` + `ReplayGap` sentinel, cross-referencing `docs/Sessions.md`) and multi-subscriber fan-out (single-subscriber fail-fast vs. multi-subscriber per-consumer disconnect, cross-referencing `docs/Sessions.md`). Removed "reconnectable sessions" and "multi-subscriber event fan-out" from the Post-v1 revisit list. Updated the backpressure bullet to mention both modes. ### Contracts-021 @@ -448,13 +448,13 @@ Re-review of the proto delta (`git diff 410acc9..8df5ab3 -- .../Protos/`): the n | Severity | Low | | Category | Documentation & comments | | Location | `src/ZB.MOM.WW.MxGateway.Contracts/Protos/mxaccess_gateway.proto:731-733` | -| Status | Open | +| Status | Resolved | **Description:** The `replay_gap` field comment ends with "(Reconnect/replay logic is Task 12; this is the contract surface only.)". That parenthetical is now stale — the reconnect/replay logic has shipped and is exercised by EventStreamServiceTests/SessionEventDistributorTests. A reader is misled into thinking only the contract exists. **Recommendation:** Drop the "Task 12 / contract surface only" parenthetical; the rest of the comment is accurate. -**Resolution:** _(empty until closed)_ +**Resolution:** _(2026-06-16)_ Removed the stale "(Reconnect/replay logic is Task 12; this is the contract surface only.)" parenthetical from the `replay_gap` field comment in `mxaccess_gateway.proto`. The "Additive (proto3):" sentence before it is retained. Comment-only change; no wire-format or generated-type impact. ### Contracts-022 @@ -463,10 +463,10 @@ Re-review of the proto delta (`git diff 410acc9..8df5ab3 -- .../Protos/`): the n | Severity | Low | | Category | Testing coverage | | Location | `src/ZB.MOM.WW.MxGateway.Tests/Contracts/ProtobufContractRoundTripTests.cs` | -| Status | Open | +| Status | Resolved | **Description:** No round-trip / descriptor pin exists for the new `ReplayGap` message or `MxEvent.replay_gap` (field 14). The field is exercised functionally end-to-end, but there is no contract-level pin to catch a future renumber/type-narrowing of `replay_gap = 14` or the two `ReplayGap` sequence-field numbers — the same gap class as Contracts-007/010/018. **Recommendation:** Add a round-trip test setting `MxEvent.ReplayGap` with both sequence fields, asserting `BodyCase == None`, plus a descriptor assertion pinning `ReplayGapFieldNumber == 14` and the `ReplayGap` field numbers (1, 2). -**Resolution:** _(empty until closed)_ +**Resolution:** _(2026-06-16)_ Added `ProtobufContractRoundTripTests.MxEvent_RoundTripsReplayGapSentinelAndPinsFieldNumbers` to `ProtobufContractRoundTripTests.cs`. The test pins `MxEvent.ReplayGapFieldNumber == 14` via the generated constant, pins `ReplayGap.RequestedAfterSequenceFieldNumber == 1` and `ReplayGap.OldestAvailableSequenceFieldNumber == 2` via `ReplayGap.Descriptor.Fields` (asserting both the number and the field name), builds a sentinel `MxEvent` with both sequence fields populated and no body oneof set, serializes and parses it, then asserts both sequence values survive and `BodyCase == None` (confirming `replay_gap` is orthogonal to the body oneof). diff --git a/code-reviews/IntegrationTests/findings.md b/code-reviews/IntegrationTests/findings.md index be92f47..2ca5085 100644 --- a/code-reviews/IntegrationTests/findings.md +++ b/code-reviews/IntegrationTests/findings.md @@ -7,7 +7,7 @@ | Review date | 2026-06-16 | | Commit reviewed | `8df5ab3` | | Status | Re-reviewed | -| Open findings | 4 | +| Open findings | 0 | ## Checklist coverage @@ -633,13 +633,13 @@ The prior `DashboardAuthenticator` ctor took `IOptions`, so the | Severity | Low | | Category | Documentation & comments | | Location | `docs/GatewayTesting.md:76`, `src/ZB.MOM.WW.MxGateway.IntegrationTests/WorkerLiveMxAccessSmokeTests.cs:576,728` | -| Status | Open | +| Status | Resolved | **Description:** `docs/GatewayTesting.md` says "All six tests are gated by MXGATEWAY_RUN_LIVE_MXACCESS_TESTS=1" and enumerates five parity paths. This diff adds two new `[LiveMxAccessFact]` tests (B8 new COM commands: AuthenticateUser/ArchestrAUserToId/Suspend/Activate; and the buffered-data path: AddBufferedItem/SetBufferedUpdateInterval), bringing the total to eight. The doc still says "six" and omits the two new parity surfaces. **Recommendation:** Update GatewayTesting.md to "eight" and add bullets for the B8 new-COM-commands and buffered-data parity surfaces. -**Resolution:** _(empty until closed)_ +**Resolution:** 2026-06-16: Updated `docs/GatewayTesting.md` — changed "five parity paths" to "seven", "All six tests" to "All eight tests", and added bullets for the B8 new-COM-commands surface (AuthenticateUser/ArchestrAUserToId/Suspend/Activate against an added-but-not-advised item) and the buffered-data surface (AddBufferedItem/SetBufferedUpdateInterval/Advise round-trip with at least one OnBufferedDataChange event, residual noted for multi-sample conversion). ### IntegrationTests-031 @@ -648,13 +648,13 @@ The prior `DashboardAuthenticator` ctor took `IOptions`, so the | Severity | Low | | Category | Documentation & comments | | Location | `src/ZB.MOM.WW.MxGateway.IntegrationTests/WorkerLiveMxAccessSmokeTests.cs:672` | -| Status | Open | +| Status | Resolved | **Description:** The inline comment at line 672 says "Suspend / Activate against the advised item", but no `Advise` call is made between `AddItem` (line 616) and `CreateSuspendRequest` (line 677) — the item is added but not advised. The comment mislabels the COM subscription state under test (the parity assertion only requires a real reply, not a successful one). **Recommendation:** Change "against the advised item" to "against the added-but-not-advised item" (or remove "advised"), and note that Suspend/Activate is exercised without a prior Advise. -**Resolution:** _(empty until closed)_ +**Resolution:** 2026-06-16: Rewrote the comment to "Suspend / Activate against the added-but-not-advised item (no Advise was issued between AddItem and this call)," making the COM subscription state explicit and noting that parity requires only a real reply, not a successful one. ### IntegrationTests-032 @@ -663,13 +663,13 @@ The prior `DashboardAuthenticator` ctor took `IOptions`, so the | Severity | Low | | Category | Testing coverage | | Location | `src/ZB.MOM.WW.MxGateway.IntegrationTests/WorkerLiveMxAccessSmokeTests.cs:823-865` | -| Status | Open | +| Status | Resolved | **Description:** In the buffered-item test, when no sample-bearing `OnBufferedDataChange` batch arrives, the sample-predicate `TimeoutException` is caught and discarded (line 831) before asserting `bootstrapBufferedEvents > 0`. The final failure message ("No OnBufferedDataChange event arrived at all") conflates two failure modes (NoData bootstrap not delivered vs. delivered-but-no-sample), reducing residual diagnostic quality. **Recommendation:** Before nulling the batch, log the caught timeout message (e.g. `output.WriteLine($"B8: sample-bearing batch predicate timed out: {ex.Message}")`) so the residual log distinguishes the two cases. -**Resolution:** _(empty until closed)_ +**Resolution:** 2026-06-16: Added `output.WriteLine($"B8: sample-bearing batch predicate timed out: {ex.Message}")` inside the `catch (TimeoutException ex)` block before nulling `bufferedBatch`, so the residual log clearly records the timeout detail and distinguishes "predicate timed out" from "no OnBufferedDataChange arrived at all". ### IntegrationTests-033 @@ -678,10 +678,10 @@ The prior `DashboardAuthenticator` ctor took `IOptions`, so the | Severity | Low | | Category | Testing coverage | | Location | `src/ZB.MOM.WW.MxGateway.IntegrationTests/WorkerLiveMxAccessSmokeTests.cs:577-709` | -| Status | Open | +| Status | Deferred | **Description:** The new-COM-commands live test covers AuthenticateUser/ArchestrAUserToId/Suspend/Activate but not `AddItem2`/`Write2` — the B8 extended commands with a second context parameter introduced in the same bundle. Only live COM tests can verify the COM call succeeds with the correct argument split; a parity regression short-circuiting AddItem2/Write2 to InvalidRequest would not be caught. **Recommendation:** Add AddItem2/Write2 to the parity test (or a dedicated test) asserting each produces a real reply (not InvalidRequest) against a valid handle and item-definition split. -**Resolution:** _(empty until closed)_ +**Resolution:** 2026-06-16: requires a live MXAccess rig + provider state not available on this dev box; add the AddItem2/Write2 parity assertions when running on the MXAccess host. diff --git a/code-reviews/README.md b/code-reviews/README.md index 8138034..13f7e5e 100644 --- a/code-reviews/README.md +++ b/code-reviews/README.md @@ -10,15 +10,15 @@ Each module's `findings.md` is the source of truth; this file is generated from | Module | Reviewer | Date | Commit | Status | Open | Total | |---|---|---|---|---|---|---| -| [Client.Dotnet](Client.Dotnet/findings.md) | Claude Code | 2026-06-16 | `8df5ab3` | Re-reviewed | 4 | 29 | -| [Client.Go](Client.Go/findings.md) | Claude Code | 2026-06-16 | `8df5ab3` | Re-reviewed | 5 | 34 | +| [Client.Dotnet](Client.Dotnet/findings.md) | Claude Code | 2026-06-16 | `8df5ab3` | Re-reviewed | 0 | 29 | +| [Client.Go](Client.Go/findings.md) | Claude Code | 2026-06-16 | `8df5ab3` | Re-reviewed | 0 | 34 | | [Client.Java](Client.Java/findings.md) | Claude Code | 2026-06-16 | `8df5ab3` | Re-reviewed | 9 | 48 | -| [Client.Python](Client.Python/findings.md) | Claude Code | 2026-06-16 | `8df5ab3` | Re-reviewed | 5 | 36 | -| [Client.Rust](Client.Rust/findings.md) | Claude Code | 2026-06-16 | `8df5ab3` | Re-reviewed | 6 | 38 | -| [Contracts](Contracts/findings.md) | Claude Code | 2026-06-16 | `8df5ab3` | Re-reviewed | 3 | 22 | -| [IntegrationTests](IntegrationTests/findings.md) | Claude Code | 2026-06-16 | `8df5ab3` | Re-reviewed | 4 | 33 | -| [Server](Server/findings.md) | Claude Code | 2026-06-16 | `8df5ab3` | Re-reviewed | 2 | 55 | -| [Tests](Tests/findings.md) | Claude Code | 2026-06-16 | `8df5ab3` | Re-reviewed | 3 | 38 | +| [Client.Python](Client.Python/findings.md) | Claude Code | 2026-06-16 | `8df5ab3` | Re-reviewed | 0 | 36 | +| [Client.Rust](Client.Rust/findings.md) | Claude Code | 2026-06-16 | `8df5ab3` | Re-reviewed | 0 | 38 | +| [Contracts](Contracts/findings.md) | Claude Code | 2026-06-16 | `8df5ab3` | Re-reviewed | 0 | 22 | +| [IntegrationTests](IntegrationTests/findings.md) | Claude Code | 2026-06-16 | `8df5ab3` | Re-reviewed | 0 | 33 | +| [Server](Server/findings.md) | Claude Code | 2026-06-16 | `8df5ab3` | Re-reviewed | 0 | 56 | +| [Tests](Tests/findings.md) | Claude Code | 2026-06-16 | `8df5ab3` | Re-reviewed | 0 | 39 | | [Worker](Worker/findings.md) | Claude Code | 2026-06-16 | `8df5ab3` | Re-reviewed | 0 | 28 | | [Worker.Tests](Worker.Tests/findings.md) | Claude Code | 2026-06-16 | `8df5ab3` | Re-reviewed | 3 | 36 | @@ -28,19 +28,7 @@ Findings with status `Open` or `In Progress`, ordered by severity. | ID | Severity | Category | Location | Description | |---|---|---|---|---| -| Client.Dotnet-028 | Medium | Security | `clients/dotnet/.../MxGatewayClientCli.cs:156` | Client.Dotnet-008 was recorded resolved by adding a `TryResolveApiKey` helper resolving both `--api-key` and the `--api-key-env` env-var path, wired into the error-redaction catch block. At HEAD the catch block reads `arguments.GetOptional… | -| Client.Go-030 | Medium | Concurrency & thread safety | `clients/go/cmd/mxgw-go/main.go:1491-1494` | `runGalaxyWatch`'s limit-reached branch calls `cancelStream()` and returns WITHOUT draining the buffered `events` channel, unlike the signal-cancel branch which drains. This is the shape Client.Go-013's resolution claimed to have fixed ("n… | | Client.Java-040 | Medium | Correctness & logic bugs | `clients/java/zb-mom-ww-mxgateway-cli/src/main/java/com/zb/mom/ww/mxgateway/cli/MxGatewayCli.java:1552-1561` | The `stream-alarms` overflow handler does `queue.clear()` then `offer(exception)` + `offer(ALARM_FEED_END)` non-atomically on an `ArrayBlockingQueue` shared with the gRPC delivery thread. In production gRPC (netty I/O thread), a concurrent… | -| Client.Python-036 | Medium | Documentation & comments | `clients/python/README.md:143-158` | The README "Browsing lazily" section's code example calls `galaxy.browse_children(...)`, a method that does not exist — the actual public low-level method is `browse_children_raw`. The example raises `AttributeError` at runtime. The README… | -| Client.Rust-033 | Medium | Correctness & logic bugs | `clients/rust/crates/mxgw-cli/src/main.rs:485` | `ConnectionArgs::options()` computes plaintext as `!self.tls \|\| self.plaintext`. With both `--tls` and `--plaintext` supplied, this is `true`, silently degrading to an unencrypted channel despite the explicit `--tls`. A security-sensitive… | -| Server-054 | Medium | Design-document adherence | `docs/DesignDecisions.md` (Session Reconnect / Event Subscribers / Later Revisit Items §470-471), `CLAUDE.md` (Repository-Specific Conventions) | The session-resilience epic shipped multi-subscriber fan-out (`SessionEventDistributor`), reconnectable sessions with replay (`AttachEventSubscriberWithReplay`/`ReplayGap`), and detach-grace retention — but `docs/DesignDecisions.md` still… | -| Client.Dotnet-026 | Low | Correctness & logic bugs | `clients/dotnet/.../MxGatewayClientCli.cs:306` (isLongRunning) | Client.Dotnet-015 extended `isLongRunning` to include the bench commands so they aren't silently cancelled by the default 30s CTS. The new `galaxy-browse` command is NOT in `isLongRunning`. A `galaxy-browse --depth N` tree walk on a large… | -| Client.Dotnet-027 | Low | Performance & resource management | `clients/dotnet/ZB.MOM.WW.MxGateway.Client/LazyBrowseNode.cs:15` | `LazyBrowseNode` allocates one `SemaphoreSlim _expandLock = new(1,1)` per node and never disposes it (the type is not IDisposable). For a large Galaxy browse tree (thousands of nodes), live SemaphoreSlim instances accumulate; OS handles ar… | -| Client.Dotnet-029 | Low | Code organization & conventions | `clients/dotnet/.../IMxGatewayCliClient.cs:6` | `IMxGatewayCliClient` is a public interface with no type-level `` XML doc. The Client.Dotnet-013 resolution recorded adding one; at HEAD it is absent. No CS1591 fires (GenerateDocumentationFile now scoped to the packable library o… | -| Client.Go-031 | Low | Correctness & logic bugs | `clients/go/cmd/mxgw-go/main.go:1037-1046` | `closeSmokeSession` registers `defer cancel()` twice on the same `cancel` variable across two `context.WithTimeout` calls when the deadline-shortening branch fires. Because `cancel` is reassigned, both defers end up calling the second cont… | -| Client.Go-032 | Low | Code organization & conventions | `clients/go/cmd/mxgw-go/main.go:839-841` | `runStreamEvents` does not install a `signal.NotifyContext` handler, while `runStreamAlarms` and `runGalaxyWatch` do. Client.Go-020's resolution claimed this was added. Without a signal-aware parent context, Ctrl+C kills the process withou… | -| Client.Go-033 | Low | Testing coverage | `clients/go/cmd/mxgw-go/main_test.go` | Gaps vs prior coverage: (1) `TestRunBenchReadBulkRejectsNonPositiveDuration` (named in Client.Go-021's resolution) is absent — the `-duration-seconds`-positive guard at main.go:619 is untested; (2) `runStreamEvents` has no CLI-level test (… | -| Client.Go-034 | Low | Documentation & comments | `clients/go/README.md:245-263` | The README CLI example table lists ~12 commands but the binary now exposes ~27 subcommands (per `writeUsage`). Absent: `ping`, `galaxy-browse`, `batch`, `read-bulk`, `write-bulk`, `write2-bulk`, `write-secured-bulk`, `write-secured2-bulk`,… | | Client.Java-041 | Low | Correctness & logic bugs | `clients/java/zb-mom-ww-mxgateway-cli/src/main/java/com/zb/mom/ww/mxgateway/cli/MxGatewayCli.java:2187-2194` | `jsonString` escapes only `\`, `"`, `\r`, `\n` — not `\t`, `\b`, `\f`, or U+0000–U+001F/U+007F. A tag address/message/reference containing a tab produces malformed JSON (RFC 8259). Affects the hand-rolled `jsonObject`/`jsonString`/`jsonVal… | | Client.Java-042 | Low | Error handling & resilience | `clients/java/zb-mom-ww-mxgateway-cli/src/main/java/com/zb/mom/ww/mxgateway/cli/MxGatewayCli.java:1565-1567` | `StreamAlarmsCommand.onError` calls `queue.offer(error)` without checking the return value. If the queue is full when a transport error arrives, the error is dropped and the drain loop blocks forever on `queue.take()`. Same class as Client… | | Client.Java-043 | Low | Code organization & conventions | `clients/java/zb-mom-ww-mxgateway-cli/src/test/java/com/zb/mom/ww/mxgateway/cli/MxGatewayCliTests.java:241-264` | `galaxyBrowseParentZeroEmitsWarningToStderr` calls `MxGatewayCli.execute(new FakeClientFactory(), ...)` for a galaxy-browse command, which wires the real `GrpcGalaxyClientFactory` and constructs a live Netty channel to localhost:5000 as a… | @@ -49,26 +37,6 @@ Findings with status `Open` or `In Progress`, ordered by severity. | Client.Java-046 | Low | Testing coverage | `clients/java/zb-mom-ww-mxgateway-cli/src/test/java/com/zb/mom/ww/mxgateway/cli/MxGatewayCliTests.java:680-696` | `streamAlarmsCommandFailsFastOnQueueOverflow` delivers all 2000 onNext synchronously from within `streamAlarms`, so `subscriptionRef` is still null when the overflow fires — the `sub.cancel()` branch is never exercised. The test also doesn… | | Client.Java-047 | Low | Documentation & comments | `clients/java/README.md` | README advertises the `0.1.1` artifact coordinate (Gitea Maven section) while the `version` command reports `0.1.0` — the user-visible symptom of Client.Java-044. Cross-ref `MxGatewayClientVersion.java:12`. | | Client.Java-048 | Low | Documentation & comments | `clients/java/zb-mom-ww-mxgateway-cli/src/main/java/com/zb/mom/ww/mxgateway/cli/MxGatewayCli.java:88-105` | The public `execute(PrintWriter, PrintWriter, String...)` Javadoc calls it "Test-friendly entry point", but it wires `GrpcMxGatewayCliClientFactory` with no injection — the actual test seam is the package-private `execute(MxGatewayCliClien… | -| Client.Python-032 | Low | Correctness & logic bugs | `clients/python/src/zb_mom_ww_mxgateway_cli/commands.py:1048,1065-1066` | `_smoke` reintroduces the dead `closed = False` / `if not closed:` guard that Client.Python-004's resolution claimed to have removed via `async with session:`. `closed` is never reassigned, so the guard is always true. Behavior is correct… | -| Client.Python-033 | Low | Correctness & logic bugs | `clients/python/src/zb_mom_ww_mxgateway_cli/commands.py:772,1490-1494` | `_parse_string_list` always emits `param_hint="--items"`, but it is also called from `_build_write_bulk_entries` with `kwargs["values"]`. An empty `--values ""` on the write-bulk commands yields `Error: Invalid value for '--items': ...`, p… | -| Client.Python-034 | Low | Correctness & logic bugs | `clients/python/src/zb_mom_ww_mxgateway_cli/commands.py:1497-1501` | `_parse_int_list` does `int(item)` with no error handling. A non-numeric token (e.g. `--item-handles "10,abc"`) raises a raw `ValueError`, surfacing as an unformatted traceback interactively (other input errors raise `click.BadParameter`). | -| Client.Python-035 | Low | Code organization & conventions | `clients/python/src/zb_mom_ww_mxgateway/__init__.py`, `.../options.py:63-77`, `.../galaxy.py:293` | Two new public types — `BrowseChildrenOptions` (options.py) and `LazyBrowseNode` (galaxy.py) — are absent from `__init__.py`/`__all__`, so callers can't `from zb_mom_ww_mxgateway import BrowseChildrenOptions`, breaking the package-root imp… | -| Client.Rust-034 | Low | Correctness & logic bugs | `clients/rust/crates/mxgw-cli/src/main.rs:48-51,548` | `Command::Version` carries a `jsonl: bool` field that is never read; the dispatch arm matches `{ json, .. }` and discards `jsonl`. `mxgw version --jsonl` silently behaves as plain text. | -| Client.Rust-035 | Low | Security | `clients/rust/crates/mxgw-cli/src/main.rs:489-495` | `--api-key-env` (default `MXGATEWAY_API_KEY`) names an env var read into an `ApiKey` Bearer token, but its clap help has no description of the expected value format. A user pointing it at another credential's env var would silently forward… | -| Client.Rust-036 | Low | Design-document adherence | `clients/rust/RustClientDesign.md:351` | The new `galaxy browse` subcommand (with its filter/depth/json flags) is not listed in the "Test CLI" command table in RustClientDesign.md, which still reads `galaxy {test-connection,last-deploy-time,discover-hierarchy,watch}`. | -| Client.Rust-037 | Low | Design-document adherence | `clients/rust/README.md:164-179` | The README "Browsing lazily" example calls `galaxy.browse_children(...).await?.into_inner()`, but the public API is `GalaxyClient::browse_children_raw` (the bare `browse_children` is the generated proto-client method, not public; and `brow… | -| Client.Rust-038 | Low | Testing coverage | `clients/rust/crates/mxgw-cli/src/main.rs:2336-2564` | Three CLI test gaps: (1) `ConnectionArgs::options()` `--tls`/`--plaintext` resolution (incl. the both-set path of Client.Rust-033) is untested; (2) `browse_children_one_level`'s repeated-page-token guard is untested; (3) `parse_rfc3339_tim… | -| Contracts-020 | Low | Design-document adherence | `gateway.md:1087,1101-1102` | gateway.md still lists "no reconnectable sessions" under "Resolved for v1" and lists "reconnectable sessions" / "multi-subscriber event fan-out" as post-v1 revisit items. The shipped `ReplayGap` reconnect-replay contract and multi-subscrib… | -| Contracts-021 | Low | Documentation & comments | `src/ZB.MOM.WW.MxGateway.Contracts/Protos/mxaccess_gateway.proto:731-733` | The `replay_gap` field comment ends with "(Reconnect/replay logic is Task 12; this is the contract surface only.)". That parenthetical is now stale — the reconnect/replay logic has shipped and is exercised by EventStreamServiceTests/Sessio… | -| Contracts-022 | Low | Testing coverage | `src/ZB.MOM.WW.MxGateway.Tests/Contracts/ProtobufContractRoundTripTests.cs` | No round-trip / descriptor pin exists for the new `ReplayGap` message or `MxEvent.replay_gap` (field 14). The field is exercised functionally end-to-end, but there is no contract-level pin to catch a future renumber/type-narrowing of `repl… | -| IntegrationTests-030 | Low | Documentation & comments | `docs/GatewayTesting.md:76`, `src/ZB.MOM.WW.MxGateway.IntegrationTests/WorkerLiveMxAccessSmokeTests.cs:576,728` | `docs/GatewayTesting.md` says "All six tests are gated by MXGATEWAY_RUN_LIVE_MXACCESS_TESTS=1" and enumerates five parity paths. This diff adds two new `[LiveMxAccessFact]` tests (B8 new COM commands: AuthenticateUser/ArchestrAUserToId/Sus… | -| IntegrationTests-031 | Low | Documentation & comments | `src/ZB.MOM.WW.MxGateway.IntegrationTests/WorkerLiveMxAccessSmokeTests.cs:672` | The inline comment at line 672 says "Suspend / Activate against the advised item", but no `Advise` call is made between `AddItem` (line 616) and `CreateSuspendRequest` (line 677) — the item is added but not advised. The comment mislabels t… | -| IntegrationTests-032 | Low | Testing coverage | `src/ZB.MOM.WW.MxGateway.IntegrationTests/WorkerLiveMxAccessSmokeTests.cs:823-865` | In the buffered-item test, when no sample-bearing `OnBufferedDataChange` batch arrives, the sample-predicate `TimeoutException` is caught and discarded (line 831) before asserting `bootstrapBufferedEvents > 0`. The final failure message ("… | -| IntegrationTests-033 | Low | Testing coverage | `src/ZB.MOM.WW.MxGateway.IntegrationTests/WorkerLiveMxAccessSmokeTests.cs:577-709` | The new-COM-commands live test covers AuthenticateUser/ArchestrAUserToId/Suspend/Activate but not `AddItem2`/`Write2` — the B8 extended commands with a second context parameter introduced in the same bundle. Only live COM tests can verify… | -| Server-055 | Low | Correctness & logic bugs | `src/ZB.MOM.WW.MxGateway.Server/Sessions/GatewaySession.cs:842-851,1841-1871` | When `AttachEventSubscriber`/`AttachEventSubscriberWithReplay` fails inside `StartDistributorAndRegister`, the catch calls `DetachEventSubscriber()`, which decrements the active count back to 0 and — because the session is still `Ready` an… | -| Tests-036 | Low | Testing coverage | `src/ZB.MOM.WW.MxGateway.Tests/Configuration/GatewayOptionsValidatorTests.cs` | Three new validator rules — `DetachGraceSeconds >= 0` (GatewayOptionsValidator.cs:185-186), `ReplayBufferCapacity >= 0` (:215-216), `ReplayRetentionSeconds >= 0` (:219-220) — have no tests, while the sibling new options (`MaxEventSubscribe… | -| Tests-037 | Low | Testing coverage | `src/ZB.MOM.WW.MxGateway.Tests/Contracts/ProtobufContractRoundTripTests.cs` | The reconnect/replay contract surface (`ReplayGap` message, `MxEvent.replay_gap = 14`, `StreamEventsRequest.after_worker_sequence`) has no protobuf serialize/parse round-trip test pinning the wire shape and the documented sentinel invarian… | -| Tests-038 | Low | Performance & resource management | `src/ZB.MOM.WW.MxGateway.Tests/Gateway/Sessions/SessionEventDistributorTests.cs:702-713` | `DrainUntilFaultAsync` relies on the channel completing WITH a fault so `WaitToReadAsync` re-throws. Correct for current callers, but if reused on a channel that completes gracefully, `WaitToReadAsync` returns false without throwing and th… | | Worker.Tests-034 | Low | Code organization & conventions | `src/ZB.MOM.WW.MxGateway.Worker.Tests/MxAccess/MxAccessCommandExecutorTests.cs:2233`, `src/ZB.MOM.WW.MxGateway.Worker.Tests/TestSupport/NoopMxAccessServer.cs:97` | `FakeMxStatus` is defined twice — file-scope in `TestSupport/NoopMxAccessServer.cs:97` and nested in `MxAccessCommandExecutorTests.FakeMxAccessComObject:2233` — both exposing the same four public fields that `MxStatusProxyConverter` reflec… | | Worker.Tests-035 | Low | Testing coverage | `src/ZB.MOM.WW.MxGateway.Worker.Tests/MxAccess/MxAccessCommandExecutorTests.cs`, `src/ZB.MOM.WW.MxGateway.Worker/MxAccess/MxAccessCommandExecutor.cs:99-136` | `MxAccessCommandExecutor.Execute` has a `_` discard arm returning `CreateInvalidRequestReply(... "Unsupported MXAccess command kind ...")` — the safety net for an unknown `MxCommandKind` (e.g. a future gateway enum value before the worker… | | Worker.Tests-036 | Low | Concurrency & thread safety | `src/ZB.MOM.WW.MxGateway.Worker.Tests/Ipc/WorkerPipeSessionTests.cs:983-996` | `RunAsync_SendsFirstHeartbeatImmediatelyOnEnteringLoop` carries a redundant wall-clock assertion `Assert.True(elapsed < TimeSpan.FromSeconds(5), ...)`. The existing `heartbeatWait` CTS (cancel-after 5s) already enforces the same bound — th… | @@ -111,11 +79,13 @@ Findings with status `Resolved`, `Won't Fix`, or `Deferred`. | Client.Dotnet-003 | Medium | Resolved | Concurrency & thread safety | `clients/dotnet/MxGateway.Client/MxGatewaySession.cs:659-663`, `clients/dotnet/MxGateway.Client/MxGatewayClient.cs:230-240` | | Client.Dotnet-018 | Medium | Resolved | Documentation & comments | `clients/dotnet/README.md:137-138` | | Client.Dotnet-022 | Medium | Resolved | mxaccessgw conventions | `clients/dotnet/Directory.Build.props:1-21` | +| Client.Dotnet-028 | Medium | Resolved | Security | `clients/dotnet/.../MxGatewayClientCli.cs:156` | | Client.Go-002 | Medium | Resolved | Error handling & resilience | `clients/go/mxgateway/session.go:440-516` | | Client.Go-003 | Medium | Resolved | Correctness & logic bugs | `clients/go/cmd/mxgw-go/main.go:517-532` | | Client.Go-022 | Medium | Resolved | Code organization & conventions | `clients/go/cmd/mxgw-go/main.go:398-412,417-519` | | Client.Go-023 | Medium | Resolved | Concurrency & thread safety | `clients/go/cmd/mxgw-go/main.go:604-606,616-632` | | Client.Go-028 | Medium | Resolved | Correctness & logic bugs | `scripts/tag-go-module.ps1:42-46` | +| Client.Go-030 | Medium | Resolved | Concurrency & thread safety | `clients/go/cmd/mxgw-go/main.go:1491-1494` | | Client.Java-001 | Medium | Resolved | Security | `clients/java/mxgateway-client/src/main/java/com/dohertylan/mxgateway/client/MxGatewaySecrets.java:30-32` | | Client.Java-002 | Medium | Resolved | Concurrency & thread safety | `clients/java/mxgateway-client/src/main/java/com/dohertylan/mxgateway/client/MxEventStream.java:31,66-92` | | Client.Java-003 | Medium | Resolved | mxaccessgw conventions | `clients/java/mxgateway-client/src/main/java/com/dohertylan/mxgateway/client/MxGatewayClient.java:119-140` | @@ -137,6 +107,7 @@ Findings with status `Resolved`, `Won't Fix`, or `Deferred`. | Client.Python-024 | Medium | Resolved | Code organization & conventions | `clients/python/src/zb_mom_ww_mxgateway_cli/commands.py:13,48-119` | | Client.Python-027 | Medium | Resolved | Security | `clients/python/src/zb_mom_ww_mxgateway/client.py:36-54`, `clients/python/src/zb_mom_ww_mxgateway/galaxy.py:47-66`, `clients/python/src/zb_mom_ww_mxgateway_cli/commands.py:165-172,918-930` | | Client.Python-028 | Medium | Resolved | Error handling & resilience | `clients/python/src/zb_mom_ww_mxgateway/options.py:120-130`, `clients/python/src/zb_mom_ww_mxgateway/client.py:59`, `clients/python/src/zb_mom_ww_mxgateway/galaxy.py:71` | +| Client.Python-036 | Medium | Resolved | Documentation & comments | `clients/python/README.md:143-158` | | Client.Rust-005 | Medium | Resolved | Correctness & logic bugs | `clients/rust/src/session.rs:489-520` | | Client.Rust-006 | Medium | Resolved | Error handling & resilience | `clients/rust/src/session.rs:531-555` | | Client.Rust-015 | Medium | Resolved | Error handling & resilience | `clients/rust/crates/mxgw-cli/src/main.rs:1053-1070` | @@ -145,6 +116,7 @@ Findings with status `Resolved`, `Won't Fix`, or `Deferred`. | Client.Rust-022 | Medium | Resolved | Correctness & logic bugs | `clients/rust/src/session.rs:369-391,403-420,427-444,452-469,476-493,631-696,706-724` | | Client.Rust-024 | Medium | Resolved | Testing coverage | `clients/rust/tests/client_behavior.rs:405-415`; `clients/rust/src/session.rs:369-493`; `clients/rust/src/client.rs:265-291`; `clients/rust/crates/mxgw-cli/src/main.rs:1310-1505` | | Client.Rust-031 | Medium | Resolved | Error handling & resilience | `clients/rust/src/options.rs:196-240` (`build_tls_config`); `clients/rust/Cargo.toml:40` (tonic features); docs: `clients/rust/src/options.rs:76-101`, `clients/rust/README.md` (TLS trust section), `clients/rust/crates/mxgw-cli/src/main.rs:429-431`, `clients/rust/RustClientDesign.md:202` | +| Client.Rust-033 | Medium | Resolved | Correctness & logic bugs | `clients/rust/crates/mxgw-cli/src/main.rs:485` | | Contracts-002 | Medium | Resolved | Error handling & resilience | `src/MxGateway.Contracts/Protos/mxaccess_gateway.proto:384-385`, `:95` | | Contracts-009 | Medium | Resolved | Design-document adherence | `docs/Contracts.md:13-24` | | IntegrationTests-003 | Medium | Resolved | Correctness & logic bugs | `src/MxGateway.IntegrationTests/WorkerLiveMxAccessSmokeTests.cs:89-97` | @@ -169,6 +141,8 @@ Findings with status `Resolved`, `Won't Fix`, or `Deferred`. | Server-038 | Medium | Resolved | Security | `src/ZB.MOM.WW.MxGateway.Server/Dashboard/Hubs/EventsHub.cs:23-44` | | Server-044 | Medium | Resolved | Correctness & logic bugs | `src/ZB.MOM.WW.MxGateway.Server/Sessions/SessionManager.cs:216-254` | | Server-051 | Medium | Resolved | Error handling & resilience | `src/ZB.MOM.WW.MxGateway.Server/Alarms/AlarmWatchListResolver.cs:64-78` | +| Server-054 | Medium | Resolved | Design-document adherence | `docs/DesignDecisions.md` (Session Reconnect / Event Subscribers / Later Revisit Items §470-471), `CLAUDE.md` (Repository-Specific Conventions) | +| Server-056 | Medium | Resolved | Concurrency & thread safety | `src/ZB.MOM.WW.MxGateway.Server/Sessions/SessionEventDistributor.cs:296-310,449-453,629-635` | | Tests-003 | Medium | Resolved | Performance & resource management | `src/MxGateway.Tests/Security/Authentication/SqliteAuthStoreTests.cs:170-176`, `src/MxGateway.Tests/Security/Authentication/ApiKeyAdminCliRunnerTests.cs:252-258` | | Tests-004 | Medium | Resolved | Testing coverage | `src/MxGateway.Tests/Security/Authorization/GatewayGrpcAuthorizationInterceptorTests.cs` | | Tests-005 | Medium | Resolved | Testing coverage | `src/MxGateway.Tests/Gateway/Grpc/EventStreamServiceTests.cs:239-261`, `src/MxGateway.Tests/Gateway/Sessions/SessionManagerTests.cs` | @@ -217,6 +191,9 @@ Findings with status `Resolved`, `Won't Fix`, or `Deferred`. | Client.Dotnet-023 | Low | Resolved | Code organization & conventions | `clients/dotnet/Directory.Build.props:17`, `clients/dotnet/ZB.MOM.WW.MxGateway.Client.Cli/IMxGatewayCliClient.cs:6`, `clients/dotnet/ZB.MOM.WW.MxGateway.Client.Tests/*.cs` | | Client.Dotnet-024 | Low | Resolved | Code organization & conventions | `clients/dotnet/Directory.Build.props:12`, `clients/dotnet/ZB.MOM.WW.MxGateway.Client/ZB.MOM.WW.MxGateway.Client.csproj:19-24` | | Client.Dotnet-025 | Low | Resolved | Concurrency & thread safety | `clients/dotnet/ZB.MOM.WW.MxGateway.Client/LazyBrowseNode.cs:38,41,54,82,94` | +| Client.Dotnet-026 | Low | Resolved | Correctness & logic bugs | `clients/dotnet/.../MxGatewayClientCli.cs:306` (isLongRunning) | +| Client.Dotnet-027 | Low | Won't Fix | Performance & resource management | `clients/dotnet/ZB.MOM.WW.MxGateway.Client/LazyBrowseNode.cs:15` | +| Client.Dotnet-029 | Low | Resolved | Code organization & conventions | `clients/dotnet/.../IMxGatewayCliClient.cs:6` | | Client.Go-004 | Low | Resolved | mxaccessgw conventions | `clients/go/mxgateway/alarms_test.go:153-154`, `clients/go/mxgateway/galaxy_test.go:58-59` | | Client.Go-005 | Low | Resolved | Design-document adherence | `clients/go/mxgateway/client.go:64,68`, `clients/go/mxgateway/galaxy.go:83,87` | | Client.Go-006 | Low | Resolved | Error handling & resilience | `clients/go/mxgateway/errors.go:9-130` | @@ -240,6 +217,10 @@ Findings with status `Resolved`, `Won't Fix`, or `Deferred`. | Client.Go-026 | Low | Resolved | Error handling & resilience | `clients/go/cmd/mxgw-go/main.go:1196-1222` | | Client.Go-027 | Low | Resolved | Code organization & conventions | `clients/go/cmd/mxgw-go/main.go:1195-1206` | | Client.Go-029 | Low | Resolved | Documentation & comments | `clients/go/README.md:300-303` | +| Client.Go-031 | Low | Resolved | Correctness & logic bugs | `clients/go/cmd/mxgw-go/main.go:1037-1046` | +| Client.Go-032 | Low | Resolved | Code organization & conventions | `clients/go/cmd/mxgw-go/main.go:839-841` | +| Client.Go-033 | Low | Resolved | Testing coverage | `clients/go/cmd/mxgw-go/main_test.go` | +| Client.Go-034 | Low | Resolved | Documentation & comments | `clients/go/README.md:245-263` | | Client.Java-006 | Low | Resolved | Performance & resource management | `clients/java/mxgateway-client/src/main/java/com/dohertylan/mxgateway/client/MxGatewayClient.java:323-328`, `clients/java/mxgateway-client/src/main/java/com/dohertylan/mxgateway/client/GalaxyRepositoryClient.java:279-284` | | Client.Java-007 | Low | Resolved | Testing coverage | `clients/java/mxgateway-client/src/test/java/com/dohertylan/mxgateway/client/` | | Client.Java-008 | Low | Resolved | Error handling & resilience | `clients/java/mxgateway-client/src/main/java/com/dohertylan/mxgateway/client/MxGatewayClient.java:298-304` | @@ -284,6 +265,10 @@ Findings with status `Resolved`, `Won't Fix`, or `Deferred`. | Client.Python-029 | Low | Resolved | Correctness & logic bugs | `clients/python/src/zb_mom_ww_mxgateway/options.py:78-90` | | Client.Python-030 | Low | Resolved | Code organization & conventions | `clients/python/pyproject.toml:17` | | Client.Python-031 | Low | Resolved | Testing coverage | `clients/python/tests/test_tls.py:34`, `clients/python/pyproject.toml:53-56` | +| Client.Python-032 | Low | Resolved | Correctness & logic bugs | `clients/python/src/zb_mom_ww_mxgateway_cli/commands.py:1048,1065-1066` | +| Client.Python-033 | Low | Resolved | Correctness & logic bugs | `clients/python/src/zb_mom_ww_mxgateway_cli/commands.py:772,1490-1494` | +| Client.Python-034 | Low | Resolved | Correctness & logic bugs | `clients/python/src/zb_mom_ww_mxgateway_cli/commands.py:1497-1501` | +| Client.Python-035 | Low | Resolved | Code organization & conventions | `clients/python/src/zb_mom_ww_mxgateway/__init__.py`, `.../options.py:63-77`, `.../galaxy.py:293` | | Client.Rust-004 | Low | Resolved | Documentation & comments | `clients/rust/src/version.rs:7` | | Client.Rust-007 | Low | Resolved | Design-document adherence | `clients/rust/RustClientDesign.md:14-55` | | Client.Rust-008 | Low | Resolved | Performance & resource management | `clients/rust/src/value.rs:161-261` | @@ -301,6 +286,11 @@ Findings with status `Resolved`, `Won't Fix`, or `Deferred`. | Client.Rust-027 | Low | Resolved | Documentation & comments | `clients/rust/.cargo/config.toml:1-9` | | Client.Rust-028 | Low | Resolved | mxaccessgw conventions | `clients/rust/crates/mxgw-cli/src/main.rs:1126-1166` | | Client.Rust-032 | Low | Resolved | Design-document adherence | `clients/rust/RustClientDesign.md`; surface in `clients/rust/src/galaxy.rs:281-379` | +| Client.Rust-034 | Low | Resolved | Correctness & logic bugs | `clients/rust/crates/mxgw-cli/src/main.rs:48-51,548` | +| Client.Rust-035 | Low | Resolved | Security | `clients/rust/crates/mxgw-cli/src/main.rs:489-495` | +| Client.Rust-036 | Low | Resolved | Design-document adherence | `clients/rust/RustClientDesign.md:351` | +| Client.Rust-037 | Low | Resolved | Design-document adherence | `clients/rust/README.md:164-179` | +| Client.Rust-038 | Low | Resolved | Testing coverage | `clients/rust/crates/mxgw-cli/src/main.rs:2336-2564` | | Contracts-001 | Low | Resolved | Design-document adherence | `docs/Grpc.md:13` (and `:3`, `:32`, `:39`) | | Contracts-003 | Low | Won't Fix | Code organization & conventions | `src/MxGateway.Contracts/MxGateway.Contracts.csproj:10` | | Contracts-004 | Low | Resolved | Documentation & comments | `src/MxGateway.Contracts/GatewayContractInfo.cs:3-6` | @@ -318,6 +308,9 @@ Findings with status `Resolved`, `Won't Fix`, or `Deferred`. | Contracts-017 | Low | Resolved | Documentation & comments | `src/ZB.MOM.WW.MxGateway.Contracts/Protos/mxaccess_gateway.proto:23-29` (the `rpc QueryActiveAlarms` block) | | Contracts-018 | Low | Resolved | Testing coverage | `src/ZB.MOM.WW.MxGateway.Tests/Contracts/ProtobufContractRoundTripTests.cs:396` (`ActiveAlarmSnapshot_RoundTripsAllFields`) | | Contracts-019 | Low | Resolved | Documentation & comments | `src/ZB.MOM.WW.MxGateway.Contracts/Protos/mxaccess_gateway.proto:850-851` (`ActiveAlarmSnapshot`), `:318-324` (`AlarmProviderMode`) | +| Contracts-020 | Low | Resolved | Design-document adherence | `gateway.md:1087,1101-1102` | +| Contracts-021 | Low | Resolved | Documentation & comments | `src/ZB.MOM.WW.MxGateway.Contracts/Protos/mxaccess_gateway.proto:731-733` | +| Contracts-022 | Low | Resolved | Testing coverage | `src/ZB.MOM.WW.MxGateway.Tests/Contracts/ProtobufContractRoundTripTests.cs` | | IntegrationTests-007 | Low | Resolved | Concurrency & thread safety | `src/MxGateway.IntegrationTests/WorkerLiveMxAccessSmokeTests.cs:20`, `src/MxGateway.IntegrationTests/Galaxy/GalaxyRepositoryLiveTests.cs:5`, `src/MxGateway.IntegrationTests/DashboardLdapLiveTests.cs:9` | | IntegrationTests-008 | Low | Resolved | Code organization & conventions | `src/MxGateway.IntegrationTests/LiveLdapFactAttribute.cs`, `src/MxGateway.IntegrationTests/Galaxy/LiveGalaxyRepositoryFactAttribute.cs`, `src/MxGateway.IntegrationTests/LiveMxAccessFactAttribute.cs` | | IntegrationTests-009 | Low | Resolved | Documentation & comments | `src/MxGateway.IntegrationTests/WorkerLiveMxAccessSmokeTests.cs:372-375` | @@ -336,6 +329,10 @@ Findings with status `Resolved`, `Won't Fix`, or `Deferred`. | IntegrationTests-027 | Low | Resolved | Code organization & conventions | `src/ZB.MOM.WW.MxGateway.IntegrationTests/ZB.MOM.WW.MxGateway.IntegrationTests.csproj`, `src/ZB.MOM.WW.MxGateway.IntegrationTests/DashboardLdapLiveTests.cs:4-5,134` | | IntegrationTests-028 | Low | Resolved | Design-document adherence | `src/ZB.MOM.WW.MxGateway.IntegrationTests/DashboardLdapLiveTests.cs:120-161`, `src/ZB.MOM.WW.MxGateway.Server/Dashboard/DashboardServiceCollectionExtensions.cs:35` | | IntegrationTests-029 | Low | Resolved | Documentation & comments | `docs/GatewayTesting.md:218-224` | +| IntegrationTests-030 | Low | Resolved | Documentation & comments | `docs/GatewayTesting.md:76`, `src/ZB.MOM.WW.MxGateway.IntegrationTests/WorkerLiveMxAccessSmokeTests.cs:576,728` | +| IntegrationTests-031 | Low | Resolved | Documentation & comments | `src/ZB.MOM.WW.MxGateway.IntegrationTests/WorkerLiveMxAccessSmokeTests.cs:672` | +| IntegrationTests-032 | Low | Resolved | Testing coverage | `src/ZB.MOM.WW.MxGateway.IntegrationTests/WorkerLiveMxAccessSmokeTests.cs:823-865` | +| IntegrationTests-033 | Low | Deferred | Testing coverage | `src/ZB.MOM.WW.MxGateway.IntegrationTests/WorkerLiveMxAccessSmokeTests.cs:577-709` | | Server-007 | Low | Resolved | Performance & resource management | `src/MxGateway.Server/Galaxy/GalaxyHierarchyProjector.cs:55-70` | | Server-008 | Low | Resolved | Performance & resource management | `src/MxGateway.Server/Grpc/GalaxyRepositoryGrpcService.cs:111-134,160-189` | | Server-009 | Low | Resolved | Error handling & resilience | `src/MxGateway.Server/Security/Authentication/AuthSqliteConnectionFactory.cs:15-32` | @@ -372,6 +369,7 @@ Findings with status `Resolved`, `Won't Fix`, or `Deferred`. | Server-050 | Low | Resolved | Error handling & resilience | `src/ZB.MOM.WW.MxGateway.Server/Dashboard/DashboardSessionAdminService.cs:42-75,92-125` | | Server-052 | Low | Resolved | Documentation & comments | `src/ZB.MOM.WW.MxGateway.Server/Alarms/IAlarmWatchListResolver.cs:24-30`, `src/ZB.MOM.WW.MxGateway.Server/Alarms/AlarmWatchListResolver.cs:101-114`, `docs/GatewayConfiguration.md:247` | | Server-053 | Low | Resolved | Testing coverage | `src/ZB.MOM.WW.MxGateway.Tests/Alarms/AlarmWatchListResolverTests.cs`, `src/ZB.MOM.WW.MxGateway.Tests/Alarms/GatewayAlarmMonitorProviderModeTests.cs` | +| Server-055 | Low | Resolved | Correctness & logic bugs | `src/ZB.MOM.WW.MxGateway.Server/Sessions/GatewaySession.cs:842-851,1841-1871` | | Tests-007 | Low | Resolved | Code organization & conventions | `src/MxGateway.Tests/Gateway/Grpc/MxAccessGatewayServiceTests.cs:682`, `src/MxGateway.Tests/Gateway/Grpc/GalaxyRepositoryGrpcServiceTests.cs:324`, `src/MxGateway.Tests/Gateway/GatewayEndToEndFakeWorkerSmokeTests.cs:460`, `src/MxGateway.Tests/Security/Authorization/GatewayGrpcAuthorizationInterceptorTests.cs:233` | | Tests-008 | Low | Resolved | mxaccessgw conventions | `src/MxGateway.Tests/Gateway/Sessions/WorkerAlarmRpcDispatcherTests.cs:1-9`, `src/MxGateway.Tests/Gateway/Sessions/NotWiredAlarmRpcDispatcherTests.cs:1-3`, `src/MxGateway.Tests/Gateway/Sessions/SessionManagerAlarmAutoSubscribeTests.cs:1` | | Tests-009 | Low | Resolved | Documentation & comments | `src/MxGateway.Tests/Gateway/Sessions/SessionManagerTests.cs:36-37,99,365` | @@ -395,6 +393,10 @@ Findings with status `Resolved`, `Won't Fix`, or `Deferred`. | Tests-033 | Low | Resolved | Testing coverage | `src/ZB.MOM.WW.MxGateway.Server/Dashboard/DashboardAlarmProviderStatus.cs`, `src/ZB.MOM.WW.MxGateway.Tests/Gateway/Dashboard/DashboardBrowseAndAlarmModelTests.cs:140-195` | | Tests-034 | Low | Resolved | mxaccessgw conventions | `src/ZB.MOM.WW.MxGateway.Tests/Diagnostics/GatewayLogRedactorSeamTests.cs:1-15` | | Tests-035 | Low | Resolved | Concurrency & thread safety | `src/ZB.MOM.WW.MxGateway.Tests/Alarms/AlarmFailoverEndToEndTests.cs:315-329` | +| Tests-036 | Low | Resolved | Testing coverage | `src/ZB.MOM.WW.MxGateway.Tests/Configuration/GatewayOptionsValidatorTests.cs` | +| Tests-037 | Low | Won't Fix | Testing coverage | `src/ZB.MOM.WW.MxGateway.Tests/Contracts/ProtobufContractRoundTripTests.cs` | +| Tests-038 | Low | Resolved | Performance & resource management | `src/ZB.MOM.WW.MxGateway.Tests/Gateway/Sessions/SessionEventDistributorTests.cs:702-713` | +| Tests-039 | Low | Resolved | Concurrency & thread safety | `src/ZB.MOM.WW.MxGateway.Tests/Gateway/Sessions/GatewaySessionDashboardMirrorTests.cs` (`DashboardMirror_AndGrpcSubscriber_BothReceiveEvents`) | | Worker-009 | Low | Resolved | Performance & resource management | `src/MxGateway.Worker/Ipc/WorkerFrameReader.cs:31,49`, `src/MxGateway.Worker/Ipc/WorkerFrameWriter.cs:57-58` | | Worker-010 | Low | Resolved | Correctness & logic bugs | `src/MxGateway.Worker/Conversion/VariantConverter.cs:204-226` | | Worker-011 | Low | Resolved | Correctness & logic bugs | `src/MxGateway.Worker/Ipc/WorkerPipeClient.cs:169-171` | diff --git a/code-reviews/Server/findings.md b/code-reviews/Server/findings.md index 0e46441..09ca63d 100644 --- a/code-reviews/Server/findings.md +++ b/code-reviews/Server/findings.md @@ -7,7 +7,7 @@ | Review date | 2026-06-16 | | Commit reviewed | `8df5ab3` | | Status | Re-reviewed | -| Open findings | 2 | +| Open findings | 0 | ## Checklist coverage @@ -1047,13 +1047,13 @@ Additionally, `GatewayAlarmMonitor.ApplyProviderModeChangeAsync` increments the | Severity | Medium | | Category | Design-document adherence | | Location | `docs/DesignDecisions.md` (Session Reconnect / Event Subscribers / Later Revisit Items §470-471), `CLAUDE.md` (Repository-Specific Conventions) | -| Status | Open | +| Status | Resolved | **Description:** The session-resilience epic shipped multi-subscriber fan-out (`SessionEventDistributor`), reconnectable sessions with replay (`AttachEventSubscriberWithReplay`/`ReplayGap`), and detach-grace retention — but `docs/DesignDecisions.md` still states "no reconnectable sessions for v1" and "one active StreamEvents subscriber per session for v1", and still files both as post-v1 "Later Revisit Items". `CLAUDE.md` likewise still says these are "explicitly out of scope". This is the stale-prose-vs-shipped-behavior drift the "update docs in the same change as the source" rule prohibits. **Recommendation:** Update both `DesignDecisions.md` sections and the revisit list to describe the shipped behavior (gated by `AllowMultipleEventSubscribers`, `DetachGraceSeconds`, replay options), and amend the CLAUDE.md convention bullet. -**Resolution:** _(empty until closed)_ +**Resolution:** 2026-06-16: updated `docs/DesignDecisions.md` (Session Reconnect section rewritten to describe the shipped detach-grace + replay-on-reconnect behavior with config references; Event Subscribers section rewritten to describe the config-gated multi-subscriber fan-out, mode-dependent `FailFast` semantics, and internal vs external subscriber distinction; Later Revisit Items list removes the two shipped items and records them as shipped with config cross-references) and the `CLAUDE.md` conventions bullet to describe the shipped config-gated multi-subscriber + reconnect-replay behavior while preserving the one-worker-per-session invariant. ### Server-055 @@ -1062,10 +1062,25 @@ Additionally, `GatewayAlarmMonitor.ApplyProviderModeChangeAsync` increments the | Severity | Low | | Category | Correctness & logic bugs | | Location | `src/ZB.MOM.WW.MxGateway.Server/Sessions/GatewaySession.cs:842-851,1841-1871` | -| Status | Open | +| Status | Resolved | **Description:** When `AttachEventSubscriber`/`AttachEventSubscriberWithReplay` fails inside `StartDistributorAndRegister`, the catch calls `DetachEventSubscriber()`, which decrements the active count back to 0 and — because the session is still `Ready` and detach-grace is enabled — stamps `_detachedAtUtc = now`. A freshly-`Ready` session that never had a successful subscriber thus enters the detach-grace window on a failed first attach, making it sweep-eligible after `DetachGraceSeconds` even though no client ever streamed. Impact is minor (the lease still protects it; a later successful attach clears the stamp) but the "last subscriber dropped" semantics are violated. **Recommendation:** Only stamp `_detachedAtUtc` on a detach that mirrors a prior successful attach — roll the failure path back without entering grace, or guard the stamp on "a subscriber had previously been registered." -**Resolution:** _(empty until closed)_ +**Resolution:** 2026-06-16: `GatewaySession` now tracks `_everHadEventSubscriber` (a `bool` field, set to `true` inside `MarkEventSubscriberAttached()` which is called only after `StartDistributorAndRegister` succeeds). `DetachEventSubscriber` gates the `_detachedAtUtc` stamp on `_everHadEventSubscriber`, so the catch-path rollback decrements the reserved slot but does not enter detach-grace. A regression `[Fact]` (`DetachGrace_FailedFirstAttach_DoesNotEnterGrace`) in `GatewaySessionTests.cs` verifies that after a failed first attach the session has `DetachedAtUtc == null`, `ActiveEventSubscriberCount == 0`, and `IsDetachGraceExpired` returns `false` regardless of clock advance. + +### Server-056 + +| Field | Value | +|---|---| +| Severity | Medium | +| Category | Concurrency & thread safety | +| Location | `src/ZB.MOM.WW.MxGateway.Server/Sessions/SessionEventDistributor.cs:296-310,449-453,629-635` | +| Status | Resolved | + +**Description:** `SessionEventDistributor` orphaned any subscriber that registered in the window AFTER the pump ran its final `CompleteAllSubscribers` sweep (the event source completed or faulted and the pump exited) but BEFORE `DisposeAsync`. `RegisterSubscriber`/`RegisterWithReplay` guarded only against `_disposed`, not against the pump having already completed, so a subscriber added in that window got a channel the now-exited pump would never complete — its reader (`ReadAllAsync`) waited forever. In production this is the edge case of a client calling `StreamEvents` after the worker's event stream has ended but before the session is torn down. Discovered while diagnosing an order-dependent hang in `GatewaySessionDashboardMirrorTests`, where a gRPC subscriber attached after a fast-completing worker stream had already drained (its `await foreach` has no timeout, so the orphaned channel surfaced as an infinite hang rather than a clean failure). + +**Recommendation:** Record terminal completion (a `_completed` flag plus the terminal error) under `_lifecycleLock` and have both register paths complete a late registrant's channel immediately with the same terminal state. + +**Resolution:** 2026-06-17: added `_completed` + `_completionError`, set inside `CompleteAllSubscribers` under `_lifecycleLock` — the same lock the register paths take, so completion and registration serialize (a subscriber added before the sweep is completed by the loop; one racing in after sees `_completed` and self-completes). `Register` and `RegisterWithReplay` now `TryComplete` a late registrant's channel with `_completionError` when `_completed`; a late resume still receives its retained replay batch, then a cleanly-completed empty live channel. No lock-ordering risk — `CompleteAllSubscribers` takes only `_lifecycleLock`, and the subscriber channels use `AllowSynchronousContinuations=false` so `TryComplete` under the lock runs no continuation inline. New regression `[Fact]` `Register_AfterSourceCompletes_CompletesLateSubscriberInsteadOfHanging` (`SessionEventDistributorTests.cs`) registers a subscriber after the pump completes and asserts its channel completes (bounded read); verified it fails without the fix (5 s timeout) and passes with it (12 ms). The racy `GatewaySessionDashboardMirrorTests.DashboardMirror_AndGrpcSubscriber_BothReceiveEvents` that exposed it was also made deterministic — see Tests-039. diff --git a/code-reviews/Tests/findings.md b/code-reviews/Tests/findings.md index 42068da..0bd4db3 100644 --- a/code-reviews/Tests/findings.md +++ b/code-reviews/Tests/findings.md @@ -7,7 +7,7 @@ | Review date | 2026-06-16 | | Commit reviewed | `8df5ab3` | | Status | Re-reviewed | -| Open findings | 3 | +| Open findings | 0 | ## Checklist coverage @@ -672,13 +672,13 @@ The cancellation tests for `WorkerClient` in `WorkerClientTests` *do* exercise t | Severity | Low | | Category | Testing coverage | | Location | `src/ZB.MOM.WW.MxGateway.Tests/Configuration/GatewayOptionsValidatorTests.cs` | -| Status | Open | +| Status | Resolved | **Description:** Three new validator rules — `DetachGraceSeconds >= 0` (GatewayOptionsValidator.cs:185-186), `ReplayBufferCapacity >= 0` (:215-216), `ReplayRetentionSeconds >= 0` (:219-220) — have no tests, while the sibling new options (`MaxEventSubscribersPerSession`, `WorkerReadyWaitTimeoutMs`) do. A regression dropping/inverting any of the three guards would pass with no failing test. **Recommendation:** Add boundary theories mirroring the `MaxEventSubscribersPerSession` pattern: a failing case (`-1`) asserting the message contains each config path, and a succeeding boundary case (`0`). -**Resolution:** _(empty until closed)_ +**Resolution:** 2026-06-16 — Added six tests to `GatewayOptionsValidatorTests.cs` covering all three guards: `Validate_Fails_WhenDetachGraceSecondsIsNegative` / `Validate_Succeeds_WhenDetachGraceSecondsIsZero` (via `CloneWithSessions`); `Validate_Fails_WhenReplayBufferCapacityIsNegative` / `Validate_Succeeds_WhenReplayBufferCapacityIsZero` and `Validate_Fails_WhenReplayRetentionSecondsIsNegative` / `Validate_Succeeds_WhenReplayRetentionSecondsIsZero` (via a new `CloneWithEvents` helper). Each failing case asserts the failure message contains the config path; each boundary case asserts `Succeeded`. Mirrors the `MaxEventSubscribersPerSession` / `WorkerReadyWaitTimeoutMs` pattern. ### Tests-037 @@ -687,13 +687,13 @@ The cancellation tests for `WorkerClient` in `WorkerClientTests` *do* exercise t | Severity | Low | | Category | Testing coverage | | Location | `src/ZB.MOM.WW.MxGateway.Tests/Contracts/ProtobufContractRoundTripTests.cs` | -| Status | Open | +| Status | Won't Fix | **Description:** The reconnect/replay contract surface (`ReplayGap` message, `MxEvent.replay_gap = 14`, `StreamEventsRequest.after_worker_sequence`) has no protobuf serialize/parse round-trip test pinning the wire shape and the documented sentinel invariant (family UNSPECIFIED, body oneof and per-item fields unset). Behavior is exercised in EventStreamServiceTests; this is a wire-contract gap. **Recommendation:** Add a round-trip test building an `MxEvent` with `ReplayGap` populated, asserting the two sequence fields survive and the sentinel invariants hold (field 14, `Family == Unspecified`, `BodyCase` unset). -**Resolution:** _(empty until closed)_ +**Resolution:** 2026-06-16: covered by the ReplayGap round-trip + descriptor-pin test added under Contracts-022 in ProtobufContractRoundTripTests.cs; a duplicate here would be redundant. ### Tests-038 @@ -702,10 +702,25 @@ The cancellation tests for `WorkerClient` in `WorkerClientTests` *do* exercise t | Severity | Low | | Category | Performance & resource management | | Location | `src/ZB.MOM.WW.MxGateway.Tests/Gateway/Sessions/SessionEventDistributorTests.cs:702-713` | -| Status | Open | +| Status | Resolved | **Description:** `DrainUntilFaultAsync` relies on the channel completing WITH a fault so `WaitToReadAsync` re-throws. Correct for current callers, but if reused on a channel that completes gracefully, `WaitToReadAsync` returns false without throwing and the helper spins in a tight CPU loop with no escape (ReadTimeout bounds only the individual wait). A maintenance hazard, not a current bug. **Recommendation:** When `WaitToReadAsync` returns false, await `reader.Completion` (surfaces the fault or completes cleanly) and `Assert.Fail` on graceful completion, so the helper fails fast instead of spinning. -**Resolution:** _(empty until closed)_ +**Resolution:** 2026-06-16 — When `WaitToReadAsync` returns `false` (graceful completion), the helper now awaits `reader.Completion` (propagating any stored fault) and then calls `Assert.Fail` so the helper fails fast rather than spinning; the fault-path behavior (re-throw from `WaitToReadAsync`) is preserved unchanged. + +### Tests-039 + +| Field | Value | +|---|---| +| Severity | Low | +| Category | Concurrency & thread safety | +| Location | `src/ZB.MOM.WW.MxGateway.Tests/Gateway/Sessions/GatewaySessionDashboardMirrorTests.cs` (`DashboardMirror_AndGrpcSubscriber_BothReceiveEvents`) | +| Status | Resolved | + +**Description:** `DashboardMirror_AndGrpcSubscriber_BothReceiveEvents` attached its gRPC subscriber via `StreamEventsAsync` AFTER `MarkReady()` had already started the pump draining a fast-completing 3-event fake worker — a register-vs-pump race. It passed alone but hung the whole `GatewaySessionDashboardMirrorTests` class when another test ran first (warm JIT let the pump drain and complete before the gRPC subscriber registered). Its `await foreach` over the gRPC stream uses `CancellationToken.None` with no timeout, so the race surfaced as an indefinite hang rather than a clean failure (unlike the sibling tests' `WaitUntilAsync`, which self-times-out at 5s). This exposed the production race fixed under Server-056. + +**Recommendation:** Make the test deterministic — hold the worker stream until both the dashboard mirror and the gRPC subscriber have attached, then release, so neither subscriber can miss an event regardless of scheduling. + +**Resolution:** 2026-06-17 — added a release-gate to the test's `FakeWorkerClient` (`HoldEventsUntilReleased()` / `ReleaseEvents()`; `ReadEventsAsync` awaits the gate before yielding, ungated by default so other tests are unaffected). The test now holds the stream, starts the gRPC reader on a background task, waits for `session.ActiveEventSubscriberCount == 1` (the internal dashboard mirror is excluded from the count, so this confirms the gRPC subscriber attached), then releases — both subscribers deterministically receive all three events. With the Server-056 production fix in place, the full `GatewaySessionDashboardMirrorTests` class now passes (5/5) instead of hanging. diff --git a/docs/DesignDecisions.md b/docs/DesignDecisions.md index 6a3c6d0..f3a9e54 100644 --- a/docs/DesignDecisions.md +++ b/docs/DesignDecisions.md @@ -62,37 +62,67 @@ Implementation guidance: ## Session Reconnect -Decision: no reconnectable sessions for v1. +Reconnectable sessions with event replay are shipped and config-gated. The +original "no reconnectable sessions" constraint is superseded. One `OpenSession` creates one gateway session and one worker process. The session ends on `CloseSession`, client disconnect policy, lease expiry, worker -fault, or gateway shutdown. +fault, gateway shutdown, or — when `DetachGraceSeconds > 0` — detach-grace +expiry after the last external event subscriber drops. -Rationale: reconnectable sessions require event replay, orphan ownership, -security checks, and more complicated worker lifetime rules. They are not needed -for the first parity slice. +`MxGateway:Sessions:DetachGraceSeconds` (default `30`) controls the retention +window. When positive, a session whose last external gRPC event-stream +subscriber drops stays `Ready` for that many seconds so a client can reconnect +to the same session instead of triggering a new `OpenSession` → worker spawn. +Setting it to `0` reverts to closing only on normal lease expiry. + +A reconnecting client issues `StreamEvents` with `after_worker_sequence` set to +the last sequence it observed; the gateway replays retained events newer than +that watermark (capped by `MxGateway:Events:ReplayBufferCapacity` and +`MxGateway:Events:ReplayRetentionSeconds`) then transitions seamlessly to live +delivery. If the requested position precedes the oldest retained event, a +`ReplayGap` sentinel signals the client to re-snapshot. The replay→live handoff +is atomic (no gap, no duplicate). See [Sessions](./Sessions.md) for the full +reconnect and replay protocol. ## Event Subscribers -Decision: one active `StreamEvents` subscriber per session for v1. +Multi-subscriber fan-out for data-side `StreamEvents` is shipped and +config-gated. The original "one active subscriber per session" constraint is +superseded for deployments that opt in. -A second subscriber should be rejected with a clear session error. Multi-client -fan-out may be added later with explicit backpressure semantics. +`MxGateway:Sessions:AllowMultipleEventSubscribers` (default `false`) controls +the mode. When `false` the session still rejects a second `StreamEvents` +subscriber with `EventSubscriberAlreadyActive`, preserving the original +single-subscriber behavior. When `true`, up to +`MxGateway:Sessions:MaxEventSubscribersPerSession` (default `8`) concurrent +external subscribers may attach; a new attach that would exceed the cap is +rejected with `EventSubscriberLimitReached`. The count-check-and-increment is +atomic under the session lock. -Rationale: one subscriber preserves simple event ordering and failure behavior -while parity is being proven. +Failure semantics differ by mode: in single-subscriber mode a slow consumer's +channel overflow faults the whole session (`FailFast` backpressure); in +multi-subscriber mode the same condition disconnects only that subscriber so one +slow consumer never faults a session shared by others. The mode is fixed at +session construction and is not changed by a live subscriber-count snapshot. -### Alarms — superseded for the alarm subsystem +The gateway-owned internal dashboard mirror subscribes directly on the +distributor with `isInternal: true` and is not counted toward the cap or the +detach-grace subscriber-count in either mode. -The single-subscriber rule above no longer applies to alarms. The gateway runs -an always-on central alarm monitor (`GatewayAlarmMonitor`) that owns one +See [Sessions](./Sessions.md) for the full event-distributor and backpressure +design. + +### Alarms — separate fan-out architecture + +The single-subscriber rule never applied to alarms. The gateway runs an +always-on central alarm monitor (`GatewayAlarmMonitor`) that owns one gateway-managed worker session, caches the active-alarm set, and fans it out to -any number of clients through the session-less `StreamAlarms` RPC. Per-session -alarm auto-subscribe is removed; `AcknowledgeAlarm` is session-less and routes -through the monitor. Data-side `StreamEvents` remains one subscriber per -session. Rationale: alarm state is gateway-wide, not session-scoped — every -client wants the same current set plus updates, and forcing each to own a -worker would multiply AVEVA polling load for no benefit. +any number of clients through the session-less `StreamAlarms` RPC. +`AcknowledgeAlarm` is session-less and routes through the monitor. Rationale: +alarm state is gateway-wide, not session-scoped — every client wants the same +current set plus updates, and forcing each to own a worker would multiply AVEVA +polling load for no benefit. ## Authentication @@ -467,12 +497,21 @@ against the live MXAccess attribute set. These are explicit post-v1 revisit items, not open blockers: -- reconnectable sessions, -- multiple event subscribers per session, - restricted worker service account, - production coalescing by item handle, - command batching for high-volume tag setup. +The following items were previously listed here and have since shipped: + +- **Reconnectable sessions with replay** — shipped, config-gated via + `MxGateway:Sessions:DetachGraceSeconds` and + `MxGateway:Events:ReplayBufferCapacity` / `ReplayRetentionSeconds`. + See [Session Reconnect](#session-reconnect) above and [Sessions](./Sessions.md). +- **Multiple event subscribers per session** — shipped, config-gated via + `MxGateway:Sessions:AllowMultipleEventSubscribers` and + `MxGateway:Sessions:MaxEventSubscribersPerSession`. + See [Event Subscribers](#event-subscribers) above and [Sessions](./Sessions.md). + ## Related Documentation - [Gateway Process Detailed Design](./GatewayProcessDesign.md) diff --git a/docs/GatewayTesting.md b/docs/GatewayTesting.md index 9a75102..bdb04ba 100644 --- a/docs/GatewayTesting.md +++ b/docs/GatewayTesting.md @@ -51,7 +51,7 @@ shutdown request even when a command or event assertion fails. Cleanup failures in that `finally` block are logged rather than thrown, so a real assertion failure is never masked by a shutdown timeout. -`WorkerLiveMxAccessSmokeTests` additionally covers five MXAccess parity paths the +`WorkerLiveMxAccessSmokeTests` additionally covers seven MXAccess parity paths the fake-worker tests cannot validate: - a `Write` round-trip against an advised item, asserting both that the reply is @@ -67,13 +67,21 @@ fake-worker tests cannot validate: - a `WriteSecured` round-trip after `AuthenticateUser`, asserting the reply carries `MxCommandKind.WriteSecured` and the credential password never appears in the diagnostic message (parity for both the secured-write - ordering rule and the "do not log secrets" contract), and + ordering rule and the "do not log secrets" contract), - an abnormal worker exit (the worker process is killed mid-session) where the gateway must transition the session to `SessionState.Faulted` with a non-empty fault description carrying a known worker-client classification - (pipe disconnected / worker faulted / end-of-stream / heartbeat expired). + (pipe disconnected / worker faulted / end-of-stream / heartbeat expired), +- the B8 new COM commands — `AuthenticateUser`, `ArchestrAUserToId`, `Suspend`, + and `Activate` — each asserting a real MXAccess reply (not `InvalidRequest`) + is returned against an added-but-not-advised item, and +- the buffered-data path — `AddBufferedItem` and `SetBufferedUpdateInterval` — + asserting the commands round-trip and that the worker delivers at least one + `OnBufferedDataChange` event (the empty NoData bootstrap) without crashing + or dropping frames; live §3.2 multi-sample conversion is noted as a residual + when the rig does not drive sample-bearing buffered batches on demand. -All six tests are gated by the same `MXGATEWAY_RUN_LIVE_MXACCESS_TESTS=1` +All eight tests are gated by the same `MXGATEWAY_RUN_LIVE_MXACCESS_TESTS=1` opt-in variable. Build the worker before running the smoke: diff --git a/gateway.md b/gateway.md index 087b77f..356a5ef 100644 --- a/gateway.md +++ b/gateway.md @@ -1084,12 +1084,19 @@ Resolved for v1: - MXAccess COM target is `ArchestrA.MxAccess.LMXProxyServerClass` / `LMXProxy.LMXProxyServer.1` from the installed 32-bit `LmxProxy.dll`. -- One `OpenSession` maps to one worker process; no reconnectable sessions. -- One active event subscriber per session. +- One `OpenSession` maps to one worker process. +- Reconnectable sessions: clients reconnect by re-issuing `StreamEvents` with + `after_worker_sequence`; the gateway replays the retained ring tail and emits + a `ReplayGap` sentinel when events were evicted. See `docs/Sessions.md`. +- Multi-subscriber event fan-out: multiple concurrent `StreamEvents` callers on + the same session are supported; single-subscriber mode uses fail-fast + backpressure, multi-subscriber mode disconnects only the slow consumer. See + `docs/Sessions.md`. - API key authentication with hashed keys in gateway-owned SQLite. - Basic Blazor Server dashboard with Bootstrap CSS/JS and real-time updates. - Workers run as the gateway service identity. -- Event backpressure is fail-fast with bounded queues. +- Event backpressure is fail-fast with bounded queues (single-subscriber) or + per-subscriber disconnect (multi-subscriber). - No public command batching. - `OperationComplete` is forwarded only when native MXAccess raises it. - `OnBufferedDataChange` is modeled now; multi-sample payload conversion remains @@ -1098,8 +1105,6 @@ Resolved for v1: Post-v1 revisit items: - production event-rate target and optional coalescing, -- reconnectable sessions, -- multi-subscriber event fan-out, - restricted worker process identity, - command batching for high-volume setup. diff --git a/src/ZB.MOM.WW.MxGateway.Contracts/Generated/MxaccessGateway.cs b/src/ZB.MOM.WW.MxGateway.Contracts/Generated/MxaccessGateway.cs index 48601e2..97b1d31 100644 --- a/src/ZB.MOM.WW.MxGateway.Contracts/Generated/MxaccessGateway.cs +++ b/src/ZB.MOM.WW.MxGateway.Contracts/Generated/MxaccessGateway.cs @@ -23596,8 +23596,7 @@ namespace ZB.MOM.WW.MxGateway.Contracts.Proto { /// stream; it is ALWAYS unset on events in DrainEventsReply (the diagnostic /// drain path never emits the sentinel). /// Additive (proto3): existing clients that ignore this field continue to - /// deserialize the stream unchanged. (Reconnect/replay logic is Task 12; this - /// is the contract surface only.) + /// deserialize the stream unchanged. /// [global::System.Diagnostics.DebuggerNonUserCodeAttribute] [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] diff --git a/src/ZB.MOM.WW.MxGateway.Contracts/Protos/mxaccess_gateway.proto b/src/ZB.MOM.WW.MxGateway.Contracts/Protos/mxaccess_gateway.proto index 3c9d587..00e5024 100644 --- a/src/ZB.MOM.WW.MxGateway.Contracts/Protos/mxaccess_gateway.proto +++ b/src/ZB.MOM.WW.MxGateway.Contracts/Protos/mxaccess_gateway.proto @@ -729,8 +729,7 @@ message MxEvent { // stream; it is ALWAYS unset on events in DrainEventsReply (the diagnostic // drain path never emits the sentinel). // Additive (proto3): existing clients that ignore this field continue to - // deserialize the stream unchanged. (Reconnect/replay logic is Task 12; this - // is the contract surface only.) + // deserialize the stream unchanged. optional ReplayGap replay_gap = 14; oneof body { diff --git a/src/ZB.MOM.WW.MxGateway.IntegrationTests/WorkerLiveMxAccessSmokeTests.cs b/src/ZB.MOM.WW.MxGateway.IntegrationTests/WorkerLiveMxAccessSmokeTests.cs index 49d4ffd..6f7b6a9 100644 --- a/src/ZB.MOM.WW.MxGateway.IntegrationTests/WorkerLiveMxAccessSmokeTests.cs +++ b/src/ZB.MOM.WW.MxGateway.IntegrationTests/WorkerLiveMxAccessSmokeTests.cs @@ -669,11 +669,12 @@ public sealed class WorkerLiveMxAccessSmokeTests(ITestOutputHelper output) Assert.NotEqual(0, userToIdReply.ArchestraUserToId.UserId); } - // Suspend / Activate against the advised item. The dev-rig TestInt item class - // may not be suspendable (MXAccess returns 0x80070057 / E_INVALIDARG for a - // wrong item class — see B8 notes). That is MXAccess parity: assert the reply - // kind and a non-INVALID_REQUEST status, surface the HResult and MxStatusProxy - // for the record, and do NOT treat a provider-side rejection as a test failure. + // Suspend / Activate against the added-but-not-advised item (no Advise was issued + // between AddItem and this call). The dev-rig TestInt item class may not be + // suspendable (MXAccess returns 0x80070057 / E_INVALIDARG for a wrong item class + // — see B8 notes). That is MXAccess parity: assert the reply kind and a + // non-INVALID_REQUEST status, surface the HResult and MxStatusProxy for the + // record, and do NOT treat a provider-side rejection as a test failure. MxCommandReply suspendReply = await fixture.Service.Invoke( CreateSuspendRequest(sessionId, serverHandle, itemHandle), new TestServerCallContext()).ConfigureAwait(false); @@ -827,8 +828,9 @@ public sealed class WorkerLiveMxAccessSmokeTests(ITestOutputHelper output) streamCancellation.Token) .ConfigureAwait(false); } - catch (TimeoutException) + catch (TimeoutException ex) { + output.WriteLine($"B8: sample-bearing batch predicate timed out: {ex.Message}"); bufferedBatch = null; } diff --git a/src/ZB.MOM.WW.MxGateway.Server/Sessions/GatewaySession.cs b/src/ZB.MOM.WW.MxGateway.Server/Sessions/GatewaySession.cs index e211ab9..a5bf1c6 100644 --- a/src/ZB.MOM.WW.MxGateway.Server/Sessions/GatewaySession.cs +++ b/src/ZB.MOM.WW.MxGateway.Server/Sessions/GatewaySession.cs @@ -25,6 +25,12 @@ public sealed class GatewaySession private readonly TimeSpan _detachGrace; private readonly TimeSpan _workerReadyWaitTimeout; private DateTimeOffset? _detachedAtUtc; + // True once at least one external subscriber attached SUCCESSFULLY. Detach-grace's + // "last subscriber dropped" stamp (see DetachEventSubscriber) is gated on this so a + // FAILED first attach — which still runs the rollback DetachEventSubscriber from the + // attach catch path — does not push a never-subscribed session into the grace window + // (Server-055). + private bool _everHadEventSubscriber; private SessionEventDistributor? _eventDistributor; private bool _eventDistributorStarted; private bool _dashboardMirrorStarted; @@ -842,6 +848,7 @@ public sealed class GatewaySession try { IEventSubscriberLease distributorLease = StartDistributorAndRegister(); + MarkEventSubscriberAttached(); return new EventSubscriberLease(this, distributorLease); } catch @@ -906,6 +913,7 @@ public sealed class GatewaySession out ulong oldestAvailableSequence, out ulong liveResumeSequence); + MarkEventSubscriberAttached(); return new EventSubscriberReplayAttachment( new EventSubscriberLease(this, distributorLease), replayedEvents, @@ -920,6 +928,17 @@ public sealed class GatewaySession } } + // Records that an external subscriber attached successfully. Gates the detach-grace + // "last subscriber dropped" stamp so a FAILED first attach (which still rolls back via + // DetachEventSubscriber) never pushes a never-subscribed session into grace (Server-055). + private void MarkEventSubscriberAttached() + { + lock (_syncRoot) + { + _everHadEventSubscriber = true; + } + } + /// /// Invokes a worker command synchronously and returns the reply. /// @@ -1862,7 +1881,12 @@ public sealed class GatewaySession // Closing/Closed/Faulted there is nothing to retain. This is the detach→grace-start // transition; it shares _syncRoot with the reattach→grace-cancel write above and the // sweeper's IsDetachGraceExpired read, so the three serialize. - if (_detachGrace > TimeSpan.Zero + // Only stamp a detach that mirrors a prior SUCCESSFUL attach. The attach catch path + // calls this same method to roll back a reserved slot when the FIRST attach failed + // before any subscriber registered; that never-subscribed session must not enter the + // grace window (Server-055). + if (_everHadEventSubscriber + && _detachGrace > TimeSpan.Zero && _activeEventSubscriberCount == 0 && _state is not (SessionState.Closing or SessionState.Closed or SessionState.Faulted)) { diff --git a/src/ZB.MOM.WW.MxGateway.Server/Sessions/SessionEventDistributor.cs b/src/ZB.MOM.WW.MxGateway.Server/Sessions/SessionEventDistributor.cs index 2b37ddf..c35bc63 100644 --- a/src/ZB.MOM.WW.MxGateway.Server/Sessions/SessionEventDistributor.cs +++ b/src/ZB.MOM.WW.MxGateway.Server/Sessions/SessionEventDistributor.cs @@ -116,6 +116,17 @@ public sealed class SessionEventDistributor : IAsyncDisposable private bool _started; private bool _disposed; + // Set once the pump has run its final CompleteAllSubscribers sweep — the event source + // completed or faulted and the pump exited. Guarded by _lifecycleLock together with the + // subscriber add. A subscriber that registers AFTER this point but BEFORE DisposeAsync + // (the source ended but the session is not yet torn down) would otherwise be added with a + // channel the now-exited pump never completes, hanging its reader forever. The register + // paths complete such a late registrant's channel immediately with the same terminal + // state. _completionError carries the terminal exception (source fault) or null (graceful + // source completion), mirroring what the final CompleteAllSubscribers passed. + private bool _completed; + private Exception? _completionError; + /// /// Initializes a per-session event distributor. /// @@ -304,6 +315,16 @@ public sealed class SessionEventDistributor : IAsyncDisposable { ObjectDisposedException.ThrowIf(_disposed, this); _subscribers[subscriber.Id] = subscriber; + + // Close the register-after-pump-completion window: if the pump already ran its + // final CompleteAllSubscribers (source completed/faulted) but the distributor is + // not yet disposed, no further completion sweep will run, so complete this late + // registrant's channel now with the same terminal state instead of leaving its + // reader hanging. + if (_completed) + { + subscriber.Channel.Writer.TryComplete(_completionError); + } } return new SubscriberLease(this, subscriber); @@ -450,6 +471,14 @@ public sealed class SessionEventDistributor : IAsyncDisposable { ObjectDisposedException.ThrowIf(_disposed, this); _subscribers[id] = subscriber; + + // Same register-after-pump-completion guard as Register: a resume that races in + // after the source already ended still gets its retained replay batch (snapshot + // above), but its live channel must be completed now since the pump is gone. + if (_completed) + { + subscriber.Channel.Writer.TryComplete(_completionError); + } } } @@ -628,9 +657,21 @@ public sealed class SessionEventDistributor : IAsyncDisposable private void CompleteAllSubscribers(Exception? error) { - foreach (Subscriber subscriber in _subscribers.Values) + // Record the terminal state AND complete the current subscribers under _lifecycleLock + // so this serializes with the subscriber-add in Register/RegisterWithReplay: a + // subscriber added before this runs is in the map and completed by the loop; one that + // races in afterward sees _completed and completes its own channel in the register + // path. Exactly one of the two completes each subscriber. TryComplete is non-blocking + // and (channels use AllowSynchronousContinuations=false) runs no continuation inline, + // so holding the lock across the loop cannot stall or re-enter. + lock (_lifecycleLock) { - subscriber.Channel.Writer.TryComplete(error); + _completed = true; + _completionError = error; + foreach (Subscriber subscriber in _subscribers.Values) + { + subscriber.Channel.Writer.TryComplete(error); + } } } diff --git a/src/ZB.MOM.WW.MxGateway.Tests/Configuration/GatewayOptionsValidatorTests.cs b/src/ZB.MOM.WW.MxGateway.Tests/Configuration/GatewayOptionsValidatorTests.cs index 29309ee..4d78263 100644 --- a/src/ZB.MOM.WW.MxGateway.Tests/Configuration/GatewayOptionsValidatorTests.cs +++ b/src/ZB.MOM.WW.MxGateway.Tests/Configuration/GatewayOptionsValidatorTests.cs @@ -393,4 +393,91 @@ public sealed class GatewayOptionsValidatorTests ValidateOptionsResult result = new GatewayOptionsValidator().Validate(null, options); Assert.True(result.Succeeded); } + + [Fact] + public void Validate_Fails_WhenDetachGraceSecondsIsNegative() + { + GatewayOptions options = CloneWithSessions( + ValidOptions(), + new SessionOptions { DetachGraceSeconds = -1 }); + ValidateOptionsResult result = new GatewayOptionsValidator().Validate(null, options); + Assert.True(result.Failed); + Assert.Contains( + result.Failures!, + f => f.Contains("MxGateway:Sessions:DetachGraceSeconds")); + } + + [Fact] + public void Validate_Succeeds_WhenDetachGraceSecondsIsZero() + { + GatewayOptions options = CloneWithSessions( + ValidOptions(), + new SessionOptions { DetachGraceSeconds = 0 }); + ValidateOptionsResult result = new GatewayOptionsValidator().Validate(null, options); + Assert.True(result.Succeeded); + } + + // ------------------------------------------------------------------------- + // ReplayBufferCapacity / ReplayRetentionSeconds validation + // ------------------------------------------------------------------------- + + private static GatewayOptions CloneWithEvents(GatewayOptions source, EventOptions events) + => new() + { + Authentication = source.Authentication, + Ldap = source.Ldap, + Worker = source.Worker, + Sessions = source.Sessions, + Events = events, + Dashboard = source.Dashboard, + Protocol = source.Protocol, + Alarms = source.Alarms, + Tls = source.Tls, + }; + + [Fact] + public void Validate_Fails_WhenReplayBufferCapacityIsNegative() + { + GatewayOptions options = CloneWithEvents( + ValidOptions(), + new EventOptions { ReplayBufferCapacity = -1 }); + ValidateOptionsResult result = new GatewayOptionsValidator().Validate(null, options); + Assert.True(result.Failed); + Assert.Contains( + result.Failures!, + f => f.Contains("MxGateway:Events:ReplayBufferCapacity")); + } + + [Fact] + public void Validate_Succeeds_WhenReplayBufferCapacityIsZero() + { + GatewayOptions options = CloneWithEvents( + ValidOptions(), + new EventOptions { ReplayBufferCapacity = 0 }); + ValidateOptionsResult result = new GatewayOptionsValidator().Validate(null, options); + Assert.True(result.Succeeded); + } + + [Fact] + public void Validate_Fails_WhenReplayRetentionSecondsIsNegative() + { + GatewayOptions options = CloneWithEvents( + ValidOptions(), + new EventOptions { ReplayRetentionSeconds = -1 }); + ValidateOptionsResult result = new GatewayOptionsValidator().Validate(null, options); + Assert.True(result.Failed); + Assert.Contains( + result.Failures!, + f => f.Contains("MxGateway:Events:ReplayRetentionSeconds")); + } + + [Fact] + public void Validate_Succeeds_WhenReplayRetentionSecondsIsZero() + { + GatewayOptions options = CloneWithEvents( + ValidOptions(), + new EventOptions { ReplayRetentionSeconds = 0 }); + ValidateOptionsResult result = new GatewayOptionsValidator().Validate(null, options); + Assert.True(result.Succeeded); + } } diff --git a/src/ZB.MOM.WW.MxGateway.Tests/Contracts/ProtobufContractRoundTripTests.cs b/src/ZB.MOM.WW.MxGateway.Tests/Contracts/ProtobufContractRoundTripTests.cs index 614c522..1bb9069 100644 --- a/src/ZB.MOM.WW.MxGateway.Tests/Contracts/ProtobufContractRoundTripTests.cs +++ b/src/ZB.MOM.WW.MxGateway.Tests/Contracts/ProtobufContractRoundTripTests.cs @@ -1543,4 +1543,49 @@ public sealed class ProtobufContractRoundTripTests Assert.Equal(AlarmProviderMode.Subtag, parsed.OnAlarmProviderModeChanged.Mode); Assert.Equal(unchecked((int)0x80004005), parsed.OnAlarmProviderModeChanged.Hresult); } + + /// + /// Verifies that an carrying a + /// (the optional replay_gap = 14 field) + /// round-trips with both sequence fields populated, that + /// remains + /// (replay_gap is not part of the body oneof), and pins the wire field + /// numbers for MxEvent.replay_gap (14), + /// ReplayGap.requested_after_sequence (1), and + /// ReplayGap.oldest_available_sequence (2) via the descriptor. + /// + [Fact] + public void MxEvent_RoundTripsReplayGapSentinelAndPinsFieldNumbers() + { + // ReplayGap field on MxEvent must be wire number 14. + Assert.Equal(14, MxEvent.ReplayGapFieldNumber); + + // ReplayGap sub-field numbers must be pinned. + var replayGapFields = ReplayGap.Descriptor.Fields; + Assert.Equal(1, replayGapFields[ReplayGap.RequestedAfterSequenceFieldNumber].FieldNumber); + Assert.Equal("requested_after_sequence", replayGapFields[ReplayGap.RequestedAfterSequenceFieldNumber].Name); + Assert.Equal(2, replayGapFields[ReplayGap.OldestAvailableSequenceFieldNumber].FieldNumber); + Assert.Equal("oldest_available_sequence", replayGapFields[ReplayGap.OldestAvailableSequenceFieldNumber].Name); + + // Build a sentinel MxEvent: replay_gap set, body oneof unset, family UNSPECIFIED. + var original = new MxEvent + { + SessionId = "session-1", + WorkerSequence = 0, + ReplayGap = new ReplayGap + { + RequestedAfterSequence = 150, + OldestAvailableSequence = 200, + }, + }; + + var parsed = MxEvent.Parser.ParseFrom(original.ToByteArray()); + + Assert.Equal(original, parsed); + // replay_gap is NOT part of the body oneof — BodyCase must remain None. + Assert.Equal(MxEvent.BodyOneofCase.None, parsed.BodyCase); + Assert.NotNull(parsed.ReplayGap); + Assert.Equal(150UL, parsed.ReplayGap.RequestedAfterSequence); + Assert.Equal(200UL, parsed.ReplayGap.OldestAvailableSequence); + } } diff --git a/src/ZB.MOM.WW.MxGateway.Tests/Gateway/Sessions/GatewaySessionDashboardMirrorTests.cs b/src/ZB.MOM.WW.MxGateway.Tests/Gateway/Sessions/GatewaySessionDashboardMirrorTests.cs index 2f2b186..037e12f 100644 --- a/src/ZB.MOM.WW.MxGateway.Tests/Gateway/Sessions/GatewaySessionDashboardMirrorTests.cs +++ b/src/ZB.MOM.WW.MxGateway.Tests/Gateway/Sessions/GatewaySessionDashboardMirrorTests.cs @@ -67,6 +67,12 @@ public sealed class GatewaySessionDashboardMirrorTests workerClient.Events.Add(CreateWorkerEvent(2, MxEventFamily.OnDataChange)); workerClient.Events.Add(CreateWorkerEvent(3, MxEventFamily.OnWriteComplete)); workerClient.CompleteAfterConfiguredEvents = true; + // Hold the worker stream until BOTH subscribers are attached so neither misses an event. + // MarkReady registers the internal dashboard subscriber and starts the pump, which then + // blocks on the gate; the gRPC subscriber attaches below; only then is the finite stream + // released. Without this gate the pump can drain all three events before the gRPC + // subscriber registers — a register-vs-pump race that otherwise makes this test flaky. + workerClient.HoldEventsUntilReleased(); RecordingDashboardEventBroadcaster broadcaster = new(); await using GatewaySession session = CreateSession(workerClient, broadcaster); @@ -79,13 +85,22 @@ public sealed class GatewaySessionDashboardMirrorTests new GatewayMetrics()); List grpcEvents = []; - await foreach (MxEvent mxEvent in service - .StreamEventsAsync(new StreamEventsRequest { SessionId = session.SessionId }, CancellationToken.None) - .WithCancellation(CancellationToken.None)) + Task grpcReader = Task.Run(async () => { - grpcEvents.Add(mxEvent); - } + await foreach (MxEvent mxEvent in service + .StreamEventsAsync(new StreamEventsRequest { SessionId = session.SessionId }, CancellationToken.None) + .WithCancellation(CancellationToken.None)) + { + grpcEvents.Add(mxEvent); + } + }); + // The gRPC subscriber counts against ActiveEventSubscriberCount (the internal dashboard + // mirror does not), so count == 1 confirms it has attached. Only then release the stream. + await WaitUntilAsync(() => session.ActiveEventSubscriberCount == 1); + workerClient.ReleaseEvents(); + + await grpcReader.WaitAsync(TestTimeout); await WaitUntilAsync(() => broadcaster.Captures.Count == 3); Assert.Equal([1UL, 2UL, 3UL], grpcEvents.Select(mxEvent => mxEvent.WorkerSequence).ToArray()); @@ -280,6 +295,24 @@ public sealed class GatewaySessionDashboardMirrorTests public bool CompleteAfterConfiguredEvents { get; set; } + // Gate that holds the event stream before it yields anything. Released by default, so + // ungated tests are unaffected. HoldEventsUntilReleased() makes ReadEventsAsync block + // until ReleaseEvents(), letting a test attach every subscriber before a finite, + // fast-completing stream drains (avoids a register-vs-pump race). + private TaskCompletionSource _releaseGate = CreateReleasedGate(); + + private static TaskCompletionSource CreateReleasedGate() + { + TaskCompletionSource gate = new(TaskCreationOptions.RunContinuationsAsynchronously); + gate.SetResult(); + return gate; + } + + public void HoldEventsUntilReleased() => + _releaseGate = new TaskCompletionSource(TaskCreationOptions.RunContinuationsAsynchronously); + + public void ReleaseEvents() => _releaseGate.TrySetResult(); + public string SessionId { get; } = "session-dashboard-mirror"; public int? ProcessId { get; } = 1234; @@ -298,6 +331,9 @@ public sealed class GatewaySessionDashboardMirrorTests public async IAsyncEnumerable ReadEventsAsync( [EnumeratorCancellation] CancellationToken cancellationToken) { + // Block before yielding any event until released (ungated by default). + await _releaseGate.Task.WaitAsync(cancellationToken).ConfigureAwait(false); + foreach (WorkerEvent workerEvent in Events) { cancellationToken.ThrowIfCancellationRequested(); diff --git a/src/ZB.MOM.WW.MxGateway.Tests/Gateway/Sessions/GatewaySessionTests.cs b/src/ZB.MOM.WW.MxGateway.Tests/Gateway/Sessions/GatewaySessionTests.cs index 8d4de39..27a8947 100644 --- a/src/ZB.MOM.WW.MxGateway.Tests/Gateway/Sessions/GatewaySessionTests.cs +++ b/src/ZB.MOM.WW.MxGateway.Tests/Gateway/Sessions/GatewaySessionTests.cs @@ -545,6 +545,43 @@ public sealed class GatewaySessionTests Assert.False(session.IsDetachGraceExpired(clock.GetUtcNow())); } + /// + /// Server-055 regression. A FAILED first attach (the distributor never registered a + /// subscriber) must NOT enter the detach-grace window: the catch path's + /// DetachEventSubscriber rolls the reserved slot back to 0 but must not stamp + /// DetachedAtUtc, because the "last subscriber dropped" semantics only apply once + /// a subscriber was successfully registered. A freshly-Ready session whose first attach + /// failed must therefore stay out of grace and never become sweep-eligible on that basis. + /// + [Fact] + public async Task DetachGrace_FailedFirstAttach_DoesNotEnterGrace() + { + FakeTimeProvider clock = new(DateTimeOffset.UtcNow); + FakeWorkerClient workerClient = new(); + + // QueueCapacity = 0 makes the distributor constructor throw ArgumentOutOfRangeException + // inside StartDistributorAndRegister, so the very first AttachEventSubscriber fails after + // it reserved a slot — exercising the catch → DetachEventSubscriber rollback path. + await using GatewaySession session = CreateReadySessionWithDetachGrace( + workerClient, + clock, + detachGrace: TimeSpan.FromSeconds(30), + queueCapacity: 0); + + Assert.ThrowsAny( + () => session.AttachEventSubscriber(maxSubscribers: 1)); + + // The reserved slot was rolled back, but no successful subscriber ever existed, so the + // session must NOT have entered detach-grace. + Assert.Equal(SessionState.Ready, session.State); + Assert.Equal(0, session.ActiveEventSubscriberCount); + Assert.Null(session.DetachedAtUtc); + + // And it must never become detach-grace-eligible no matter how far the clock advances. + clock.Advance(TimeSpan.FromHours(1)); + Assert.False(session.IsDetachGraceExpired(clock.GetUtcNow())); + } + /// /// Task 11. The gateway-owned internal dashboard subscriber must NOT keep a session out /// of detach-grace: with only the dashboard mirror attached (and no external gRPC @@ -618,7 +655,8 @@ public sealed class GatewaySessionTests IWorkerClient workerClient, TimeProvider timeProvider, TimeSpan detachGrace, - IDashboardEventBroadcaster? dashboardBroadcaster = null) + IDashboardEventBroadcaster? dashboardBroadcaster = null, + int queueCapacity = 8) { GatewaySession session = new( sessionId: "session-test-detach-grace", @@ -636,7 +674,7 @@ public sealed class GatewaySessionTests openedAt: timeProvider.GetUtcNow(), eventStreaming: new SessionEventStreaming( new MxAccessGrpcMapper(), - new EventOptions { QueueCapacity = 8 }, + new EventOptions { QueueCapacity = queueCapacity }, NullLogger.Instance, timeProvider, new GatewayMetrics(), diff --git a/src/ZB.MOM.WW.MxGateway.Tests/Gateway/Sessions/SessionEventDistributorTests.cs b/src/ZB.MOM.WW.MxGateway.Tests/Gateway/Sessions/SessionEventDistributorTests.cs index 2cb8152..e3a60a9 100644 --- a/src/ZB.MOM.WW.MxGateway.Tests/Gateway/Sessions/SessionEventDistributorTests.cs +++ b/src/ZB.MOM.WW.MxGateway.Tests/Gateway/Sessions/SessionEventDistributorTests.cs @@ -702,16 +702,71 @@ public sealed class SessionEventDistributorTests private static async Task DrainUntilFaultAsync(ChannelReader reader) { // Drains any buffered events, then surfaces the channel's completion fault (if any) - // by awaiting the final read past the buffered tail. + // by awaiting the final WaitToReadAsync past the buffered tail. + // If WaitToReadAsync returns false (graceful completion rather than a fault), + // await Completion to surface any fault stored there, then Assert.Fail so the + // helper does not spin forever on a channel that completes without an exception. while (true) { - await reader.WaitToReadAsync().AsTask().WaitAsync(ReadTimeout); + bool hasMore = await reader.WaitToReadAsync().AsTask().WaitAsync(ReadTimeout); + if (!hasMore) + { + // Graceful completion — propagate any stored exception, then fail. + await reader.Completion; + Assert.Fail("DrainUntilFaultAsync: channel completed gracefully (no fault)."); + return; + } + while (reader.TryRead(out _)) { } } } + /// + /// Regression: a subscriber that registers in the window AFTER the pump has completed + /// (its event source finished) but BEFORE the distributor is disposed must have its + /// channel completed immediately, not left open forever. The pump has already run its + /// final CompleteAllSubscribers sweep and exited, so without the + /// register-after-completion guard the late subscriber's reader hangs indefinitely. + /// This was observed as an order-dependent hang in + /// GatewaySessionDashboardMirrorTests, where a gRPC subscriber attached after a + /// fast-completing worker stream had already drained. + /// + [Fact] + public async Task Register_AfterSourceCompletes_CompletesLateSubscriberInsteadOfHanging() + { + Channel source = Channel.CreateUnbounded(); + await using SessionEventDistributor distributor = CreateDistributor(source.Reader); + await distributor.StartAsync(CancellationToken.None); + + // An early subscriber lets us observe when the pump's final completion sweep has run. + using IEventSubscriberLease early = distributor.Register(); + + // Complete the source: the pump drains it, runs CompleteAllSubscribers, and exits. + source.Writer.Complete(); + + // Draining the early subscriber to completion proves the pump finished its sweep — so + // a subscriber registering now is unambiguously in the register-after-completion window. + using (CancellationTokenSource earlyCts = new(ReadTimeout)) + { + await foreach (MxEvent _ in early.Reader.ReadAllAsync(earlyCts.Token)) + { + } + } + + // Register AFTER the pump has completed. The channel must be completed immediately; the + // bounded read below must end rather than hang (the ReadTimeout converts a regression + // into a fast OperationCanceledException failure instead of an indefinite hang). + using IEventSubscriberLease late = distributor.Register(); + using CancellationTokenSource lateCts = new(ReadTimeout); + await foreach (MxEvent _ in late.Reader.ReadAllAsync(lateCts.Token)) + { + } + + Assert.False(lateCts.IsCancellationRequested); + } + private static SessionEventDistributor CreateDistributor(ChannelReader source) => CreateDistributor(source, replayBufferCapacity: 1024, replayRetentionSeconds: 300);