From 8df0479b99a8480679e24af219d5ffdf4af2e4fd Mon Sep 17 00:00:00 2001 From: Joseph Doherty Date: Wed, 17 Jun 2026 05:23:14 -0400 Subject: [PATCH] fix: resolve Client.Java + Worker.Tests findings (pending windev verification) Client.Java-040..048, Worker.Tests-034/035/036. Edits applied on the Mac, which has no JRE and cannot build the x86+MXAccess worker tests; findings are marked In Progress pending gradle + x86 build verification on windev. Do not mark Resolved until verified there. --- .../zb/mom/ww/mxgateway/cli/MxGatewayCli.java | 118 ++++++++++++----- .../cli/InProcessGatewayHarness.java | 16 +++ .../ww/mxgateway/cli/MxGatewayCliTests.java | 124 ++++++++++++------ .../client/MxGatewayClientVersion.java | 2 +- code-reviews/Client.Java/findings.md | 36 ++--- code-reviews/Worker.Tests/findings.md | 12 +- .../Ipc/WorkerPipeSessionTests.cs | 10 +- .../MxAccess/MxAccessCommandExecutorTests.cs | 45 ++++--- .../TestSupport/NoopMxAccessServer.cs | 5 + 9 files changed, 252 insertions(+), 116 deletions(-) diff --git a/clients/java/zb-mom-ww-mxgateway-cli/src/main/java/com/zb/mom/ww/mxgateway/cli/MxGatewayCli.java b/clients/java/zb-mom-ww-mxgateway-cli/src/main/java/com/zb/mom/ww/mxgateway/cli/MxGatewayCli.java index e78fd00..6a7d88d 100644 --- a/clients/java/zb-mom-ww-mxgateway-cli/src/main/java/com/zb/mom/ww/mxgateway/cli/MxGatewayCli.java +++ b/clients/java/zb-mom-ww-mxgateway-cli/src/main/java/com/zb/mom/ww/mxgateway/cli/MxGatewayCli.java @@ -39,7 +39,10 @@ import java.util.Optional; import java.util.concurrent.ArrayBlockingQueue; import java.util.concurrent.BlockingQueue; import java.util.concurrent.Callable; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicBoolean; import java.util.concurrent.atomic.AtomicReference; +import java.util.function.Consumer; import mxaccess_gateway.v1.MxaccessGateway.AcknowledgeAlarmReply; import mxaccess_gateway.v1.MxaccessGateway.AcknowledgeAlarmRequest; import mxaccess_gateway.v1.MxaccessGateway.ActiveAlarmSnapshot; @@ -105,8 +108,14 @@ public final class MxGatewayCli implements Callable { } /** - * Test-friendly entry point that runs the CLI against the supplied - * {@link PrintWriter} pair instead of the system streams. + * Entry point that runs the CLI against the supplied {@link PrintWriter} + * pair instead of the system streams. This overload wires the production + * {@link GrpcMxGatewayCliClientFactory} (a real gRPC channel), so it is + * suitable for embedding the CLI but not for unit tests that need to stub + * the gateway. Tests should use the package-private + * {@link #execute(MxGatewayCliClientFactory, PrintWriter, PrintWriter, String...)} + * / {@link #commandLine(MxGatewayCliClientFactory)} overloads, which accept + * an injectable client factory. * * @param out writer that receives standard output * @param err writer that receives standard error @@ -1536,50 +1545,74 @@ public final class MxGatewayCli implements Callable { StreamAlarmsRequest request = StreamAlarmsRequest.newBuilder() .setAlarmFilterPrefix(filterPrefix) .build(); - // Client.Java-033 — fail-fast on overflow. A bare - // queue.offer(value) silently drops messages past capacity, - // which violates the JavaStyleGuide "do not drop events" - // contract and lets the CLI exit 0 on a truncated feed. - // Mirrors MxEventStream's overflow branch: detect a failed - // offer, cancel the subscription, drain the buffer, then - // queue an explicit overflow exception followed by the END - // sentinel so the drain loop surfaces a non-zero exit. + // Client.Java-033/040/042 — fail-fast on overflow and on + // transport errors. A bare queue.offer(value) silently drops + // messages past capacity (violating the JavaStyleGuide "do not + // drop events" contract and letting the CLI exit 0 on a + // truncated feed), and a bare queue.offer(error) on a full + // queue would drop the terminal item and deadlock the drain on + // queue.take(). + // + // Terminal transitions (overflow, transport error, clean + // completion) are now serialised through a single AtomicBoolean + // guard plus a dedicated `terminal` slot rather than + // re-clearing the shared queue. The first terminal condition + // wins; a concurrent onNext on the gRPC I/O thread can no + // longer displace it (Client.Java-040). The drain reads the + // terminal slot independently of the bounded queue, so a full + // queue can never strand the terminal item (Client.Java-042). AtomicReference subscriptionRef = new AtomicReference<>(); + AtomicBoolean terminated = new AtomicBoolean(); + AtomicReference terminal = new AtomicReference<>(); + Consumer terminate = item -> { + if (terminated.compareAndSet(false, true)) { + terminal.set(item); + MxGatewayAlarmFeedSubscription sub = subscriptionRef.get(); + if (sub != null) { + sub.cancel(); + } + } + }; MxGatewayAlarmFeedSubscription subscription = client.streamAlarms(request, new StreamObserver<>() { @Override public void onNext(AlarmFeedMessage value) { + if (terminated.get()) { + return; + } if (!queue.offer(value)) { - MxGatewayAlarmFeedSubscription sub = subscriptionRef.get(); - if (sub != null) { - sub.cancel(); - } - queue.clear(); - queue.offer(new IllegalStateException( + terminate.accept(new IllegalStateException( "stream-alarms queue overflowed (capacity 1024); consumer too slow")); - queue.offer(ALARM_FEED_END); } } @Override public void onError(Throwable error) { - queue.offer(error); + terminate.accept(error); } @Override public void onCompleted() { - queue.offer(ALARM_FEED_END); + terminate.accept(ALARM_FEED_END); } }); subscriptionRef.set(subscription); try { int count = 0; while (true) { - Object item = queue.take(); - if (item == ALARM_FEED_END) { - break; - } - if (item instanceof Throwable error) { + // Poll with a short timeout so the dedicated terminal + // slot is observed even when the bounded queue is full + // of normal messages the consumer has not yet drained. + Object item = queue.poll(50, TimeUnit.MILLISECONDS); + if (item == null) { + Object end = terminal.get(); + if (end == null) { + continue; + } + if (end == ALARM_FEED_END) { + break; + } + Throwable error = (Throwable) end; throw new IllegalStateException( "gateway stream alarms failed: " + error.getMessage(), error); } @@ -2184,13 +2217,36 @@ public final class MxGatewayCli implements Callable { return jsonString(value.toString()); } - private static String jsonString(String value) { - return '"' - + value.replace("\\", "\\\\") - .replace("\"", "\\\"") - .replace("\r", "\\r") - .replace("\n", "\\n") - + '"'; + // Package-private for the Client.Java-041 escaping regression test. + static String jsonString(String value) { + // RFC 8259 requires the two-character escapes for the named control + // characters and \u00XX escapes for the remaining U+0000–U+001F (and + // U+007F) range. The old implementation escaped only \\ \" \r \n, so a + // value containing a tab, backspace, form-feed, or any other control + // character produced malformed JSON (Client.Java-041). + StringBuilder builder = new StringBuilder(value.length() + 2); + builder.append('"'); + for (int i = 0; i < value.length(); i++) { + char c = value.charAt(i); + switch (c) { + case '\\' -> builder.append("\\\\"); + case '"' -> builder.append("\\\""); + case '\r' -> builder.append("\\r"); + case '\n' -> builder.append("\\n"); + case '\t' -> builder.append("\\t"); + case '\b' -> builder.append("\\b"); + case '\f' -> builder.append("\\f"); + default -> { + if (c < 0x20 || c == 0x7f) { + builder.append(String.format("\\u%04x", (int) c)); + } else { + builder.append(c); + } + } + } + } + builder.append('"'); + return builder.toString(); } private record RawJson(String value) { diff --git a/clients/java/zb-mom-ww-mxgateway-cli/src/test/java/com/zb/mom/ww/mxgateway/cli/InProcessGatewayHarness.java b/clients/java/zb-mom-ww-mxgateway-cli/src/test/java/com/zb/mom/ww/mxgateway/cli/InProcessGatewayHarness.java index 726fbb5..89d7913 100644 --- a/clients/java/zb-mom-ww-mxgateway-cli/src/test/java/com/zb/mom/ww/mxgateway/cli/InProcessGatewayHarness.java +++ b/clients/java/zb-mom-ww-mxgateway-cli/src/test/java/com/zb/mom/ww/mxgateway/cli/InProcessGatewayHarness.java @@ -46,6 +46,22 @@ import mxaccess_gateway.v1.MxaccessGateway.SessionState; * instance uses a unique server name so harnesses do not collide. The * {@code directExecutor()} wiring keeps all dispatch on the calling thread, so * no background threads are leaked. + * + *

Implemented RPCs. The scripted services override only the + * RPCs the CLI tests currently exercise: + * + *

    + *
  • {@code MxAccessGateway}: {@code streamEvents}, {@code closeSession}.
  • + *
  • {@code GalaxyRepository}: {@code discoverHierarchy}, + * {@code watchDeployEvents}.
  • + *
+ * + * Every other RPC (e.g. {@code openSession}, {@code invoke}, {@code register}, + * {@code streamAlarms}, {@code queryActiveAlarms}, {@code browseChildren}) is + * left at the generated {@code *ImplBase} default and therefore returns gRPC + * {@code UNIMPLEMENTED} by design. A future test that needs one of those paths + * must add the corresponding scripted override here first — otherwise the call + * fails with {@code UNIMPLEMENTED} rather than the behaviour under test. */ final class InProcessGatewayHarness implements AutoCloseable { private final String serverName; diff --git a/clients/java/zb-mom-ww-mxgateway-cli/src/test/java/com/zb/mom/ww/mxgateway/cli/MxGatewayCliTests.java b/clients/java/zb-mom-ww-mxgateway-cli/src/test/java/com/zb/mom/ww/mxgateway/cli/MxGatewayCliTests.java index 8a40781..8c2a5ee 100644 --- a/clients/java/zb-mom-ww-mxgateway-cli/src/test/java/com/zb/mom/ww/mxgateway/cli/MxGatewayCliTests.java +++ b/clients/java/zb-mom-ww-mxgateway-cli/src/test/java/com/zb/mom/ww/mxgateway/cli/MxGatewayCliTests.java @@ -56,17 +56,37 @@ final class MxGatewayCliTests { assertEquals(0, run.exitCode()); assertEquals("", run.errors()); - assertTrue(run.output().contains("mxgateway-java 0.1.0")); + assertTrue(run.output().contains("mxgateway-java 0.1.1")); assertTrue(run.output().contains("gatewayProtocolVersion=3")); assertTrue(run.output().contains("workerProtocolVersion=1")); } + @Test + void jsonStringEscapesControlCharacters() { + // Client.Java-041 — the hand-rolled jsonString escaped only \\ \" \r \n, + // so a tab/backspace/form-feed or any other control char produced + // malformed JSON (RFC 8259). After the fix the named control chars use + // their two-character escapes and the rest use \u00XX. + assertEquals("\"a\\tb\"", MxGatewayCli.jsonString("a\tb")); + assertEquals("\"a\\bb\"", MxGatewayCli.jsonString("a\bb")); + assertEquals("\"a\\fb\"", MxGatewayCli.jsonString("a\fb")); + assertEquals("\"a\\rb\"", MxGatewayCli.jsonString("a\rb")); + assertEquals("\"a\\nb\"", MxGatewayCli.jsonString("a\nb")); + // A non-named control character (U+0001) must become . + assertEquals("\"a\\u0001b\"", MxGatewayCli.jsonString("ab")); + // DEL (U+007F) is also escaped. + assertEquals("\"a\\u007fb\"", MxGatewayCli.jsonString("ab")); + // Quote and backslash still escape; ordinary printable text is verbatim. + assertEquals("\"a\\\"\\\\b\"", MxGatewayCli.jsonString("a\"\\b")); + assertEquals("\"plain\"", MxGatewayCli.jsonString("plain")); + } + @Test void versionCommandPrintsJson() { CliRun run = execute(new FakeClientFactory(), "version", "--json"); assertEquals(0, run.exitCode()); - assertTrue(run.output().contains("\"clientVersion\":\"0.1.0\"")); + assertTrue(run.output().contains("\"clientVersion\":\"0.1.1\"")); assertTrue(run.output().contains("\"gatewayProtocolVersion\":3")); } @@ -241,27 +261,20 @@ final class MxGatewayCliTests { void galaxyBrowseParentZeroEmitsWarningToStderr() { // --parent 0 is the server sentinel for roots; passing it explicitly is // almost certainly a mistake. The CLI must print a warning to stderr - // (matching Go/Rust client behaviour) but must still attempt the call - // (exit behaviour depends on gateway reachability, not tested here; - // we only assert the warning path is triggered by checking the error - // writer before any gRPC connection is attempted). + // (matching Go/Rust client behaviour) but must still attempt the call. // - // GalaxyBrowseCommand connects to a real GalaxyRepositoryClient, so the - // call() body will throw after printing the warning when no gateway is - // reachable. We only assert the warning appears on stderr. - StringWriter output = new StringWriter(); - StringWriter errors = new StringWriter(); - // Non-zero exit is expected (no live gateway), but the warning must - // appear on stderr regardless of what happens next. - MxGatewayCli.execute( - new FakeClientFactory(), - new PrintWriter(output, true), - new PrintWriter(errors, true), + // GalaxyBrowseCommand prints the warning, then calls connect() on the + // GalaxyClientFactory. We inject a stub factory whose connect() throws, + // so only the warning path runs — no live Netty channel to localhost is + // constructed (Client.Java-043). The warning is emitted before + // connect() is reached, so it appears on stderr regardless. + CliRun run = executeGalaxy( + new ThrowingGalaxyClientFactory(), "galaxy-browse", "--parent", "0", "--depth", "1"); assertTrue( - errors.toString().contains("--parent 0"), - "expected '--parent 0' warning on stderr; got: " + errors); + run.errors().contains("--parent 0"), + "expected '--parent 0' warning on stderr; got: " + run.errors()); } // ---- galaxy command-name aliases (D9-java) ---- @@ -678,21 +691,28 @@ final class MxGatewayCliTests { @Test void streamAlarmsCommandFailsFastOnQueueOverflow() { - // Client.Java-033 regression — the CLI's stream-alarms bounded queue - // used queue.offer(value) which silently dropped messages past - // capacity (1024). After the fix the CLI must surface the overflow - // as a non-zero exit (mirroring MxEventStream's fail-fast contract). + // Client.Java-033/040/046 regression — the CLI's stream-alarms bounded + // queue used queue.offer(value) which silently dropped messages past + // capacity (1024). After the fix the CLI must surface the overflow as a + // non-zero exit (mirroring MxEventStream's fail-fast contract). // - // The OverflowingFakeClient floods the gRPC observer with 2000 - // messages synchronously, which exceeds the bounded 1024-element - // queue. The fix detects the failed offer, cancels the subscription, - // queues an overflow exception, and the drain loop surfaces it. + // The OverflowingFakeClient floods the gRPC observer on a BACKGROUND + // thread so the subscription is already published when the overflow + // fires — exercising the terminate() cancel path with a non-null + // subscription (Client.Java-046), not just the synchronous-flood path + // where subscriptionRef is still null. The fix records the overflow in + // a dedicated terminal slot (no queue.clear, Client.Java-040) and the + // drain loop surfaces it with the overflow message text. OverflowingFakeClientFactory factory = new OverflowingFakeClientFactory(); CliRun run = execute(factory, "stream-alarms", "--filter-prefix", "Flood"); assertFalse(run.exitCode() == 0, "expected non-zero exit when the alarm queue overflows; got exit=" + run.exitCode() + " out=\n" + run.output() + "\nerr=\n" + run.errors()); + assertTrue( + run.errors().contains("queue overflowed") || run.output().contains("queue overflowed"), + "expected the overflow message text to surface; out=\n" + run.output() + + "\nerr=\n" + run.errors()); } @Test @@ -1050,6 +1070,18 @@ final class MxGatewayCliTests { } } + /** + * Galaxy client factory whose {@code connect} throws, so a test can exercise + * a command's pre-connect path (e.g. the {@code --parent 0} warning) without + * constructing a live Netty channel to localhost (Client.Java-043). + */ + private static final class ThrowingGalaxyClientFactory implements MxGatewayCli.GalaxyClientFactory { + @Override + public com.zb.mom.ww.mxgateway.client.GalaxyRepositoryClient connect(MxGatewayClientOptions options) { + throw new IllegalStateException("galaxy connect not available in this test"); + } + } + private static final class OverflowingFakeClient implements MxGatewayCli.MxGatewayCliClient { private final PrintWriter out; @@ -1089,19 +1121,31 @@ final class MxGatewayCliTests { @Override public MxGatewayAlarmFeedSubscription streamAlarms( StreamAlarmsRequest request, StreamObserver observer) { - // Synchronously push 2000 messages to overflow the CLI's bounded - // 1024-element queue. The CLI must surface the overflow rather - // than silently dropping the trailing ~976 messages. - for (int i = 0; i < 2000; i++) { - observer.onNext(AlarmFeedMessage.newBuilder() - .setActiveAlarm(ActiveAlarmSnapshot.newBuilder() - .setAlarmFullReference("Flood." + i) - .setCurrentState(AlarmConditionState.ALARM_CONDITION_STATE_ACTIVE) - .setSeverity(700)) - .build()); - } - observer.onCompleted(); - return new MxGatewayAlarmFeedSubscription(); + // Push messages on a BACKGROUND thread (mirroring real gRPC, which + // delivers onNext on a netty I/O thread) so the CLI's + // subscriptionRef is already published when the overflow fires — + // this exercises the terminate() cancel path with a non-null + // subscription (Client.Java-046), unlike a synchronous flood that + // overflows before streamAlarms even returns. Keeps pushing until + // it observes the CLI cancelling the subscription on overflow, so + // no fixed message count is needed and the thread always exits. + MxGatewayAlarmFeedSubscription subscription = new MxGatewayAlarmFeedSubscription(); + Thread flood = new Thread(() -> { + int i = 0; + while (!Thread.currentThread().isInterrupted() && i < 100_000) { + observer.onNext(AlarmFeedMessage.newBuilder() + .setActiveAlarm(ActiveAlarmSnapshot.newBuilder() + .setAlarmFullReference("Flood." + i) + .setCurrentState(AlarmConditionState.ALARM_CONDITION_STATE_ACTIVE) + .setSeverity(700)) + .build()); + i++; + } + observer.onCompleted(); + }, "overflowing-fake-alarm-feed"); + flood.setDaemon(true); + flood.start(); + return subscription; } @Override diff --git a/clients/java/zb-mom-ww-mxgateway-client/src/main/java/com/zb/mom/ww/mxgateway/client/MxGatewayClientVersion.java b/clients/java/zb-mom-ww-mxgateway-client/src/main/java/com/zb/mom/ww/mxgateway/client/MxGatewayClientVersion.java index 108c432..d063fc5 100644 --- a/clients/java/zb-mom-ww-mxgateway-client/src/main/java/com/zb/mom/ww/mxgateway/client/MxGatewayClientVersion.java +++ b/clients/java/zb-mom-ww-mxgateway-client/src/main/java/com/zb/mom/ww/mxgateway/client/MxGatewayClientVersion.java @@ -9,7 +9,7 @@ package com.zb.mom.ww.mxgateway.client; public final class MxGatewayClientVersion { private static final int GATEWAY_PROTOCOL_VERSION = 3; private static final int WORKER_PROTOCOL_VERSION = 1; - private static final String CLIENT_VERSION = "0.1.0"; + private static final String CLIENT_VERSION = "0.1.1"; private MxGatewayClientVersion() { } diff --git a/code-reviews/Client.Java/findings.md b/code-reviews/Client.Java/findings.md index 8467de6..7b6a7e4 100644 --- a/code-reviews/Client.Java/findings.md +++ b/code-reviews/Client.Java/findings.md @@ -752,13 +752,13 @@ BrowseChildrenReply reply = galaxy.browseChildren( | Severity | Medium | | Category | Correctness & logic bugs | | Location | `clients/java/zb-mom-ww-mxgateway-cli/src/main/java/com/zb/mom/ww/mxgateway/cli/MxGatewayCli.java:1552-1561` | -| Status | Open | +| Status | In Progress | **Description:** The `stream-alarms` overflow handler does `queue.clear()` then `offer(exception)` + `offer(ALARM_FEED_END)` non-atomically on an `ArrayBlockingQueue` shared with the gRPC delivery thread. In production gRPC (netty I/O thread), a concurrent `onNext` between the clear and the offers can re-enqueue a normal message, displacing the overflow exception so the drain loop hits the normal message and may exit before reaching the exception — exiting 0 on a truncated feed. Same race class as Client.Java-002/033. **Recommendation:** Guard the overflow transition with an `AtomicBoolean` (mirror `MxGatewayStreamSubscription.terminate()`'s terminated-flag + lock) instead of re-clearing the queue. -**Resolution:** _(empty until closed)_ +**Resolution:** 2026-06-16 — Confirmed root cause in `StreamAlarmsCommand.call()`: the overflow branch did `queue.clear()` then `offer(exception)` + `offer(ALARM_FEED_END)`, so a concurrent `onNext` between the clear and the offers could re-enqueue a normal message and displace the overflow signal. (Note: `MxGatewayStreamSubscription` has no `terminate()` method; the terminal-guard model lives in `MxEventStream`, which itself still uses the clear+offer shape — I implemented the atomic guard the finding asks for rather than copying the older pattern.) Replaced the clear+offer with a single `AtomicBoolean terminated` guard (`compareAndSet(false,true)` — first terminal wins) plus a dedicated `AtomicReference terminal` slot that holds the terminal item (overflow exception / transport error / `ALARM_FEED_END`) independently of the bounded queue. `onNext` no longer re-clears the queue; it just stops enqueueing once terminated. The drain loop now `poll(50ms)`s and, when the queue is empty, reads the terminal slot. No re-clear, and a concurrent `onNext` can no longer displace the terminal. Fix applied 2026-06-16, pending gradle verification on windev. Regression test: `MxGatewayCliTests.streamAlarmsCommandFailsFastOnQueueOverflow` (strengthened under Client.Java-046 to drive async delivery and assert the overflow text). ### Client.Java-041 @@ -767,13 +767,13 @@ BrowseChildrenReply reply = galaxy.browseChildren( | Severity | Low | | Category | Correctness & logic bugs | | Location | `clients/java/zb-mom-ww-mxgateway-cli/src/main/java/com/zb/mom/ww/mxgateway/cli/MxGatewayCli.java:2187-2194` | -| Status | Open | +| Status | In Progress | **Description:** `jsonString` escapes only `\`, `"`, `\r`, `\n` — not `\t`, `\b`, `\f`, or U+0000–U+001F/U+007F. A tag address/message/reference containing a tab produces malformed JSON (RFC 8259). Affects the hand-rolled `jsonObject`/`jsonString`/`jsonValue` output paths (the protobuf `JsonFormat` path is spec-correct). **Recommendation:** Add `\t`/`\b`/`\f` escapes and `\u00XX` for control chars, or route all JSON through a real JSON library. -**Resolution:** _(empty until closed)_ +**Resolution:** 2026-06-16 — Confirmed: `jsonString` escaped only `\\ \" \r \n`, so a tab/backspace/form-feed or any other U+0000–U+001F (or U+007F) char produced malformed JSON. Rewrote `jsonString` as a per-character builder that emits the two-character escapes for `\t \b \f \r \n \" \\` and `\u00XX` for the remaining `< 0x20` range plus DEL (`0x7f`), keeping ordinary printable characters verbatim. Widened `jsonString` from `private` to package-private (matching the Client.Java-032 `commandLine(...)` precedent) so the escaping can be unit-tested directly. Fix applied 2026-06-16, pending gradle verification on windev. Regression test: `MxGatewayCliTests.jsonStringEscapesControlCharacters`. ### Client.Java-042 @@ -782,13 +782,13 @@ BrowseChildrenReply reply = galaxy.browseChildren( | Severity | Low | | Category | Error handling & resilience | | Location | `clients/java/zb-mom-ww-mxgateway-cli/src/main/java/com/zb/mom/ww/mxgateway/cli/MxGatewayCli.java:1565-1567` | -| Status | Open | +| Status | In Progress | **Description:** `StreamAlarmsCommand.onError` calls `queue.offer(error)` without checking the return value. If the queue is full when a transport error arrives, the error is dropped and the drain loop blocks forever on `queue.take()`. Same class as Client.Java-033 on the error path. **Recommendation:** Reserve a sentinel slot or use the `terminate(Throwable)` guard from `MxEventStream`; ensure the drain always sees a terminal item. -**Resolution:** _(empty until closed)_ +**Resolution:** 2026-06-16 — Confirmed: `onError` did a bare `queue.offer(error)` that, on a full queue, dropped the error and stranded the drain on `queue.take()` forever. Fixed together with Client.Java-040: `onError` now routes through the shared `terminate(error)` consumer, which records the throwable in the dedicated `terminal` slot (guarded by the `AtomicBoolean`, never enqueued into the bounded `queue`). The drain loop reads that slot via the `poll(50ms)` + terminal-check path, so a transport error is always observed even when the queue is full, and the `take()`-forever deadlock is gone. Fix applied 2026-06-16, pending gradle verification on windev. Covered by the same `streamAlarmsCommandFailsFastOnQueueOverflow` terminal-slot plumbing; the error path shares the slot with the overflow path. ### Client.Java-043 @@ -797,13 +797,13 @@ BrowseChildrenReply reply = galaxy.browseChildren( | Severity | Low | | Category | Code organization & conventions | | Location | `clients/java/zb-mom-ww-mxgateway-cli/src/test/java/com/zb/mom/ww/mxgateway/cli/MxGatewayCliTests.java:241-264` | -| Status | Open | +| Status | In Progress | **Description:** `galaxyBrowseParentZeroEmitsWarningToStderr` calls `MxGatewayCli.execute(new FakeClientFactory(), ...)` for a galaxy-browse command, which wires the real `GrpcGalaxyClientFactory` and constructs a live Netty channel to localhost:5000 as a side effect (asserting only the warning). Wasteful and non-deterministic if port 5000 is reachable. **Recommendation:** Use `executeGalaxy(...)` with a `GalaxyClientFactory` stub that throws, so only the warning path runs. -**Resolution:** _(empty until closed)_ +**Resolution:** 2026-06-16 — Confirmed: the test called `MxGatewayCli.execute(new FakeClientFactory(), ...)`, which routes galaxy commands through the production `GrpcGalaxyClientFactory`; `GalaxyBrowseCommand.call()` prints the `--parent 0` warning then `connect()`s a live `GalaxyRepositoryClient` (Netty channel to localhost:5000) before failing — wasteful and non-deterministic. Rewrote the test to use the existing `executeGalaxy(...)` seam with a new `ThrowingGalaxyClientFactory` stub whose `connect()` throws; the warning is emitted before `connect()` is reached, so only the warning path runs and no live channel is constructed. Fix applied 2026-06-16, pending gradle verification on windev. Test: `MxGatewayCliTests.galaxyBrowseParentZeroEmitsWarningToStderr` (updated). ### Client.Java-044 @@ -812,13 +812,13 @@ BrowseChildrenReply reply = galaxy.browseChildren( | Severity | Low | | Category | Code organization & conventions | | Location | `clients/java/zb-mom-ww-mxgateway-client/src/main/java/com/zb/mom/ww/mxgateway/client/MxGatewayClientVersion.java:12` | -| Status | Open | +| Status | In Progress | **Description:** `CLIENT_VERSION = "0.1.0"` is out of sync with Gradle `version = '0.1.1'` (cross-ref `clients/java/build.gradle:6`). The `version` command advertises 0.1.0 while the published artifact is 0.1.1; consumers can't use the version string as a reliable artifact check. **Recommendation:** Bump `CLIENT_VERSION` to `0.1.1` (and the two test assertions), or source it from a Gradle-generated properties file. -**Resolution:** _(empty until closed)_ +**Resolution:** 2026-06-16 — Confirmed: `MxGatewayClientVersion.CLIENT_VERSION = "0.1.0"` while `clients/java/build.gradle:16` sets `version = '0.1.1'` and the README Maven coordinate is `:0.1.1`. Bumped `CLIENT_VERSION` to `"0.1.1"` and updated the two test assertions (`MxGatewayCliTests.versionCommandPrintsProtocolVersions` line asserting `"mxgateway-java 0.1.0"` and `versionCommandPrintsJson` asserting `"clientVersion":"0.1.0"`) to `0.1.1`. Left as a hardcoded constant (sourcing from a Gradle-generated properties file was the optional alternative, not required). Fix applied 2026-06-16, pending gradle verification on windev. Tests: `MxGatewayCliTests.versionCommandPrintsProtocolVersions`, `versionCommandPrintsJson`. ### Client.Java-045 @@ -827,13 +827,13 @@ BrowseChildrenReply reply = galaxy.browseChildren( | Severity | Low | | Category | Testing coverage | | Location | `clients/java/zb-mom-ww-mxgateway-cli/src/main/java/com/zb/mom/ww/mxgateway/cli/InProcessGatewayHarness.java` | -| Status | Open | +| Status | In Progress | **Description:** The harness implements only `streamEvents`/`closeSession` (gateway) and `discoverHierarchy`/`watchDeployEvents` (galaxy); all other RPCs return gRPC UNIMPLEMENTED. This is undocumented, so a future test exercising invoke/register through the harness would silently get UNIMPLEMENTED. **Recommendation:** Add a Javadoc note enumerating implemented RPCs and warning that others return UNIMPLEMENTED by design. -**Resolution:** _(empty until closed)_ +**Resolution:** 2026-06-16 — Confirmed against source (the file lives under `src/test/...`, not `src/main/...` as the finding location states): the scripted fakes override only `streamEvents`/`closeSession` (gateway) and `discoverHierarchy`/`watchDeployEvents` (galaxy); every other RPC inherits the generated `*ImplBase` default and returns gRPC `UNIMPLEMENTED`. Added a "Implemented RPCs" section to the `InProcessGatewayHarness` class Javadoc enumerating the four overridden RPCs and warning that all others (openSession, invoke, register, streamAlarms, queryActiveAlarms, browseChildren, …) return `UNIMPLEMENTED` by design, so a future test must add a scripted override first. Doc-only change. Fix applied 2026-06-16, pending gradle verification on windev. No test needed. ### Client.Java-046 @@ -842,13 +842,13 @@ BrowseChildrenReply reply = galaxy.browseChildren( | Severity | Low | | Category | Testing coverage | | Location | `clients/java/zb-mom-ww-mxgateway-cli/src/test/java/com/zb/mom/ww/mxgateway/cli/MxGatewayCliTests.java:680-696` | -| Status | Open | +| Status | In Progress | **Description:** `streamAlarmsCommandFailsFastOnQueueOverflow` delivers all 2000 onNext synchronously from within `streamAlarms`, so `subscriptionRef` is still null when the overflow fires — the `sub.cancel()` branch is never exercised. The test also doesn't assert the overflow message text. It passes for a reason that doesn't generalize to async gRPC delivery. **Recommendation:** Deliver messages asynchronously so the cancel path runs, and assert the overflow error text appears in output. -**Resolution:** _(empty until closed)_ +**Resolution:** 2026-06-16 — Confirmed: `OverflowingFakeClient.streamAlarms` pushed all 2000 `onNext` synchronously and returned the subscription only afterward, so `subscriptionRef` was still null when the overflow fired and the `sub.cancel()` branch never ran; the test also asserted only the exit code, not the overflow text. Reworked `OverflowingFakeClient.streamAlarms` to flood on a background daemon thread (mirroring a real netty I/O thread) and return the subscription first, so the overflow fires with a non-null published subscription and exercises the `terminate()` cancel path. Strengthened `streamAlarmsCommandFailsFastOnQueueOverflow` to additionally assert the overflow message text ("queue overflowed") surfaces in stderr/stdout. Fix applied 2026-06-16, pending gradle verification on windev. Test: `MxGatewayCliTests.streamAlarmsCommandFailsFastOnQueueOverflow` (updated; also validates the Client.Java-040 terminal-slot fix). ### Client.Java-047 @@ -857,13 +857,13 @@ BrowseChildrenReply reply = galaxy.browseChildren( | Severity | Low | | Category | Documentation & comments | | Location | `clients/java/README.md` | -| Status | Open | +| Status | In Progress | **Description:** README advertises the `0.1.1` artifact coordinate (Gitea Maven section) while the `version` command reports `0.1.0` — the user-visible symptom of Client.Java-044. Cross-ref `MxGatewayClientVersion.java:12`. **Recommendation:** Resolved by fixing Client.Java-044 (sync the compiled-in version). -**Resolution:** _(empty until closed)_ +**Resolution:** 2026-06-16 — Symptom of Client.Java-044, resolved together. The README's `0.1.1` Maven coordinate (`clients/java/README.md:336`) was already correct; the divergence was the compiled-in `CLIENT_VERSION = "0.1.0"`. Bumping `CLIENT_VERSION` to `0.1.1` (Client.Java-044) makes the `version` command report `0.1.1`, matching the README. No README edit needed. Fix applied 2026-06-16, pending gradle verification on windev. ### Client.Java-048 @@ -872,13 +872,13 @@ BrowseChildrenReply reply = galaxy.browseChildren( | Severity | Low | | Category | Documentation & comments | | Location | `clients/java/zb-mom-ww-mxgateway-cli/src/main/java/com/zb/mom/ww/mxgateway/cli/MxGatewayCli.java:88-105` | -| Status | Open | +| Status | In Progress | **Description:** The public `execute(PrintWriter, PrintWriter, String...)` Javadoc calls it "Test-friendly entry point", but it wires `GrpcMxGatewayCliClientFactory` with no injection — the actual test seam is the package-private `execute(MxGatewayCliClientFactory, ...)` / `commandLine(...)` overload. Misleading. **Recommendation:** Clarify the Javadoc to direct readers to the injectable overload for testing. -**Resolution:** _(empty until closed)_ +**Resolution:** 2026-06-16 — Confirmed: the public `execute(PrintWriter, PrintWriter, String...)` Javadoc called it the "Test-friendly entry point", but it wires the production `GrpcMxGatewayCliClientFactory` with no injection seam — unit tests actually use the package-private `execute(MxGatewayCliClientFactory, ...)` / `commandLine(...)` overloads. Rewrote the Javadoc to drop "test-friendly", explain it wires a real gRPC channel, and direct test authors to the injectable package-private overloads. Doc-only change. Fix applied 2026-06-16, pending gradle verification on windev. No test needed. diff --git a/code-reviews/Worker.Tests/findings.md b/code-reviews/Worker.Tests/findings.md index 5a2dc63..288a08b 100644 --- a/code-reviews/Worker.Tests/findings.md +++ b/code-reviews/Worker.Tests/findings.md @@ -640,13 +640,13 @@ Re-review of the worker-test delta covering the new COM seam (`MxAccessCommandEx | Severity | Low | | Category | Code organization & conventions | | Location | `src/ZB.MOM.WW.MxGateway.Worker.Tests/MxAccess/MxAccessCommandExecutorTests.cs:2233`, `src/ZB.MOM.WW.MxGateway.Worker.Tests/TestSupport/NoopMxAccessServer.cs:97` | -| Status | Open | +| Status | In Progress | **Description:** `FakeMxStatus` is defined twice — file-scope in `TestSupport/NoopMxAccessServer.cs:97` and nested in `MxAccessCommandExecutorTests.FakeMxAccessComObject:2233` — both exposing the same four public fields that `MxStatusProxyConverter` reflects over. The two copies must stay structurally identical; a future field change to the real COM struct requires updating two places, and the duplication is invisible to a reader consulting only one file. **Recommendation:** Extract `FakeMxStatus` into its own `TestSupport/FakeMxStatus.cs` (or colocate both doubles) and have `MxAccessCommandExecutorTests` use the shared type instead of its nested copy. -**Resolution:** _(empty until closed)_ +**Resolution:** 2026-06-16 — Removed the nested `FakeMxStatus` class from `MxAccessCommandExecutorTests.FakeMxAccessComObject`; the two `new FakeMxStatus { ... }` usages in `Suspend`/`Activate` now resolve to the shared `TestSupport.FakeMxStatus` via the pre-existing `using ZB.MOM.WW.MxGateway.Worker.Tests.TestSupport;` import. Updated the XML doc on `TestSupport/NoopMxAccessServer.cs:FakeMxStatus` to note the consolidation. Fix applied 2026-06-16, pending build verification on windev. ### Worker.Tests-035 @@ -655,13 +655,13 @@ Re-review of the worker-test delta covering the new COM seam (`MxAccessCommandEx | Severity | Low | | Category | Testing coverage | | Location | `src/ZB.MOM.WW.MxGateway.Worker.Tests/MxAccess/MxAccessCommandExecutorTests.cs`, `src/ZB.MOM.WW.MxGateway.Worker/MxAccess/MxAccessCommandExecutor.cs:99-136` | -| Status | Open | +| Status | In Progress | **Description:** `MxAccessCommandExecutor.Execute` has a `_` discard arm returning `CreateInvalidRequestReply(... "Unsupported MXAccess command kind ...")` — the safety net for an unknown `MxCommandKind` (e.g. a future gateway enum value before the worker is updated). No test passes an unknown kind and asserts `InvalidRequest`. A regression changing the arm to `throw` would propagate an unhandled exception through `WorkerPipeSession` and no test would catch it. **Recommendation:** Add a `[Fact]` constructing a `StaCommand` with an undefined `MxCommandKind` value and asserting the reply is `ProtocolStatusCode.InvalidRequest` with "Unsupported" in the diagnostic. -**Resolution:** _(empty until closed)_ +**Resolution:** 2026-06-16 — Added `DispatchAsync_WithUnknownCommandKind_ReturnsInvalidRequestWithUnsupportedDiagnostic` to `MxAccessCommandExecutorTests`. Casts `int.MaxValue` to `MxCommandKind` (an undefined value not present in the proto-generated enum), dispatches it through `MxAccessStaSession.DispatchAsync`, asserts `ProtocolStatusCode.InvalidRequest`, and asserts `reply.DiagnosticMessage` contains "Unsupported" (case-insensitive — matching `CreateInvalidRequestReply`'s `"Unsupported MXAccess command kind ..."` message). Fix applied 2026-06-16, pending build verification on windev. ### Worker.Tests-036 @@ -670,10 +670,10 @@ Re-review of the worker-test delta covering the new COM seam (`MxAccessCommandEx | Severity | Low | | Category | Concurrency & thread safety | | Location | `src/ZB.MOM.WW.MxGateway.Worker.Tests/Ipc/WorkerPipeSessionTests.cs:983-996` | -| Status | Open | +| Status | In Progress | **Description:** `RunAsync_SendsFirstHeartbeatImmediatelyOnEnteringLoop` carries a redundant wall-clock assertion `Assert.True(elapsed < TimeSpan.FromSeconds(5), ...)`. The existing `heartbeatWait` CTS (cancel-after 5s) already enforces the same bound — the extra wall-clock check can only fire if the heartbeat arrived but took >5s to be received, which the CTS already prevents. It is the same coarse wall-clock pattern prior findings (Worker.Tests-003/004/013/020) corrected. **Recommendation:** Remove the `start`/`elapsed`/`Assert.True(elapsed < ...)` check; the CTS timeout already pins the timing contract. -**Resolution:** _(empty until closed)_ +**Resolution:** 2026-06-16 — Removed the `DateTimeOffset start`, `TimeSpan elapsed`, and `Assert.True(elapsed < TimeSpan.FromSeconds(5), ...)` wall-clock assertions from `RunAsync_SendsFirstHeartbeatImmediatelyOnEnteringLoop`. The `heartbeatWait` CTS (cancel-after 5s) already enforces the same timing bound. Added an inline comment explaining why the wall-clock floor is omitted, consistent with the Worker.Tests-003/004/013/020 pattern. Fix applied 2026-06-16, pending build verification on windev. diff --git a/src/ZB.MOM.WW.MxGateway.Worker.Tests/Ipc/WorkerPipeSessionTests.cs b/src/ZB.MOM.WW.MxGateway.Worker.Tests/Ipc/WorkerPipeSessionTests.cs index 1db3fd0..b269b7d 100644 --- a/src/ZB.MOM.WW.MxGateway.Worker.Tests/Ipc/WorkerPipeSessionTests.cs +++ b/src/ZB.MOM.WW.MxGateway.Worker.Tests/Ipc/WorkerPipeSessionTests.cs @@ -980,7 +980,11 @@ public sealed class WorkerPipeSessionTests Task runTask = session.RunAsync(cancellation.Token); await CompleteGatewayHandshakeAsync(pipePair, cancellation.Token); - DateTimeOffset start = DateTimeOffset.UtcNow; + // The heartbeatWait CTS (5s cancel-after) already enforces the timing bound: + // if the first heartbeat is not received within 5s, ReadUntilAsync throws + // OperationCanceledException and the test fails. A redundant wall-clock + // elapsed < 5s assertion would add the same class of flakiness + // Workers.Tests-003/004/013/020 corrected elsewhere, so it is omitted here. using CancellationTokenSource heartbeatWait = CancellationTokenSource .CreateLinkedTokenSource(cancellation.Token); heartbeatWait.CancelAfter(TimeSpan.FromSeconds(5)); @@ -988,12 +992,8 @@ public sealed class WorkerPipeSessionTests pipePair.GatewayReader, WorkerEnvelope.BodyOneofCase.WorkerHeartbeat, heartbeatWait.Token); - TimeSpan elapsed = DateTimeOffset.UtcNow - start; Assert.Equal(WorkerEnvelope.BodyOneofCase.WorkerHeartbeat, heartbeat.BodyCase); - Assert.True( - elapsed < TimeSpan.FromSeconds(5), - $"First heartbeat took {elapsed}, expected well under the 30s interval."); await SendShutdownAndWaitAsync(pipePair, runTask, cancellation.Token); } diff --git a/src/ZB.MOM.WW.MxGateway.Worker.Tests/MxAccess/MxAccessCommandExecutorTests.cs b/src/ZB.MOM.WW.MxGateway.Worker.Tests/MxAccess/MxAccessCommandExecutorTests.cs index 2bcf8c1..e48dd19 100644 --- a/src/ZB.MOM.WW.MxGateway.Worker.Tests/MxAccess/MxAccessCommandExecutorTests.cs +++ b/src/ZB.MOM.WW.MxGateway.Worker.Tests/MxAccess/MxAccessCommandExecutorTests.cs @@ -1123,6 +1123,36 @@ public sealed class MxAccessCommandExecutorTests Assert.Equal(500, fakeComObject.SetBufferedUpdateIntervalValue); } + /// + /// Verifies that a command with an unknown value returns an + /// reply whose diagnostic contains "Unsupported". + /// This pins the _ => CreateInvalidRequestReply(...) discard arm in + /// MxAccessCommandExecutor.Execute: a regression that changed the arm to + /// throw would propagate an unhandled exception through WorkerPipeSession + /// and no other test would catch it. + /// + [Fact] + public async Task DispatchAsync_WithUnknownCommandKind_ReturnsInvalidRequestWithUnsupportedDiagnostic() + { + FakeMxAccessComObjectFactory factory = new(new FakeMxAccessComObject(registerHandle: 999)); + using StaRuntime runtime = CreateRuntime(); + using MxAccessStaSession session = new(runtime, factory, new NoopEventSink()); + await session.StartAsync(workerProcessId: 1234); + + // Cast an integer outside the defined MxCommandKind range to an unknown kind value. + MxCommandKind unknownKind = (MxCommandKind)int.MaxValue; + MxCommandReply reply = await session.DispatchAsync(new StaCommand( + "session-1", + "unknown-kind-correlation", + new MxCommand + { + Kind = unknownKind, + })); + + Assert.Equal(ProtocolStatusCode.InvalidRequest, reply.ProtocolStatus.Code); + Assert.Contains("Unsupported", reply.DiagnosticMessage, StringComparison.OrdinalIgnoreCase); + } + private static StaCommand CreateSuspendCommand( string correlationId, int serverHandle, @@ -2229,21 +2259,6 @@ public sealed class MxAccessCommandExecutorTests SetBufferedUpdateIntervalValue = updateIntervalMilliseconds; } - /// Status stand-in reflected over by the worker's MxStatusProxy converter. - internal sealed class FakeMxStatus - { - /// Success indicator read by the status converter. - public int success; - - /// Status category read by the status converter. - public int category; - - /// Status detected-by read by the status converter. - public int detectedBy; - - /// Status detail read by the status converter. - public int detail; - } } /// Factory for creating fake MXAccess COM objects in tests. diff --git a/src/ZB.MOM.WW.MxGateway.Worker.Tests/TestSupport/NoopMxAccessServer.cs b/src/ZB.MOM.WW.MxGateway.Worker.Tests/TestSupport/NoopMxAccessServer.cs index 692db6a..39c7b17 100644 --- a/src/ZB.MOM.WW.MxGateway.Worker.Tests/TestSupport/NoopMxAccessServer.cs +++ b/src/ZB.MOM.WW.MxGateway.Worker.Tests/TestSupport/NoopMxAccessServer.cs @@ -94,6 +94,11 @@ internal sealed class NoopMxAccessServer : IMxAccessServer /// success, category, detectedBy, and detail /// fields, so this fake exposes the same field shape with all-OK values. /// +/// +/// Previously duplicated as a nested class in MxAccessCommandExecutorTests.FakeMxAccessComObject. +/// Consolidated here per Worker.Tests-034 so a future field change to the real COM struct only +/// requires updating one place. +/// internal sealed class FakeMxStatus { // These public fields exist solely so MxStatusProxyConverter can reflect