diff --git a/clients/dotnet/MxGateway.Client.Cli/MxGatewayClientCli.cs b/clients/dotnet/MxGateway.Client.Cli/MxGatewayClientCli.cs index eeffa45..9b0b24e 100644 --- a/clients/dotnet/MxGateway.Client.Cli/MxGatewayClientCli.cs +++ b/clients/dotnet/MxGateway.Client.Cli/MxGatewayClientCli.cs @@ -221,9 +221,13 @@ public static class MxGatewayClientCli private static CancellationTokenSource CreateCancellation(CliArguments arguments, string command) { var cancellation = new CancellationTokenSource(); - // Long-running streaming commands run until Ctrl+C / cancellation by default; - // a caller-supplied --timeout still applies if present. - bool isLongRunning = command is "galaxy-watch"; + // Long-running streaming / bench commands run until they finish (or Ctrl+C) + // by default; a caller-supplied --timeout still applies if present. The + // bench commands default to --duration-seconds=30 --warmup-seconds=3 plus + // a per-session stagger, which already exceeds the default 30 s wall-clock + // budget, so applying that budget would cancel them mid-window and emit a + // zero-throughput JSON payload (see Client.Dotnet-015). + bool isLongRunning = command is "galaxy-watch" or "bench-read-bulk" or "bench-stream-events"; string? rawTimeout = arguments.GetOptional("timeout"); if (isLongRunning && string.IsNullOrWhiteSpace(rawTimeout)) { @@ -968,11 +972,25 @@ public static class MxGatewayClientCli } }, streamCts.Token); - await Task.Delay(steadyEnd - warmupStart, cancellationToken).ConfigureAwait(false); - streamCts.Cancel(); - try { await streamTask.ConfigureAwait(false); } - catch (OperationCanceledException) { } - catch (Grpc.Core.RpcException ex) when (ex.StatusCode is Grpc.Core.StatusCode.Cancelled) { } + // The inner streamTask MUST be observed on every path — including when + // the outer cancellationToken cancels during the Task.Delay below — or + // its fault surfaces as a TaskScheduler.UnobservedTaskException after + // GC. 
Use try/finally so the cancel + await pair always runs (see + // Client.Dotnet-016). RpcException(Cancelled) never reaches here in + // production because GrpcMxGatewayClientTransport.StreamEventsAsync + // routes through RpcExceptionMapper.Map, which returns OCE for + // StatusCode.Cancelled. + try + { + await Task.Delay(steadyEnd - warmupStart, cancellationToken).ConfigureAwait(false); + } + finally + { + streamCts.Cancel(); + try { await streamTask.ConfigureAwait(false); } + catch (OperationCanceledException) { } + catch (MxGatewayException) { } + } try { diff --git a/clients/go/README.md b/clients/go/README.md index 534cf86..43bf55d 100644 --- a/clients/go/README.md +++ b/clients/go/README.md @@ -186,7 +186,8 @@ The CLI exposes the same RPC via `galaxy-watch`: ```powershell go run ./cmd/mxgw-go galaxy-watch -plaintext go run ./cmd/mxgw-go galaxy-watch -plaintext -json -go run ./cmd/mxgw-go galaxy-watch -plaintext -last-seen-deploy-time 2026-04-28T10:00:00Z +go run ./cmd/mxgw-go galaxy-watch -plaintext -last-seen-deploy-time 2026-04-28T10:00:00Z # whole-second RFC 3339 +go run ./cmd/mxgw-go galaxy-watch -plaintext -last-seen-deploy-time 2026-04-28T10:00:00.123Z # fractional seconds also accepted go run ./cmd/mxgw-go galaxy-watch -plaintext -limit 5 ``` diff --git a/clients/go/cmd/mxgw-go/main.go b/clients/go/cmd/mxgw-go/main.go index 640902e..98215af 100644 --- a/clients/go/cmd/mxgw-go/main.go +++ b/clients/go/cmd/mxgw-go/main.go @@ -589,14 +589,19 @@ func runBenchReadBulk(ctx context.Context, args []string, stdout, stderr io.Writ }() // Warm-up: drive identical calls so any first-call JIT / connection-pool - // setup is amortised before the measurement window opens. + // setup is amortised before the measurement window opens. Honor ctx so + // Ctrl+C or a parent-cancel (e.g. the cross-language bench driver killing + // the child early) exits promptly rather than spinning failing calls until + // the wall-clock deadline. 
warmupDeadline := time.Now().Add(time.Duration(*warmupSeconds) * time.Second) timeout := time.Duration(*timeoutMs) * time.Millisecond - for time.Now().Before(warmupDeadline) { + for time.Now().Before(warmupDeadline) && ctx.Err() == nil { _, _ = session.ReadBulk(ctx, serverHandle, tags, timeout) } - // Steady state: per-call latency captured via time.Now() deltas. + // Steady state: per-call latency captured via time.Now() deltas. Same ctx + // guard as warm-up; on cancel we stop the loop and report the truncated + // window faithfully. latenciesMs := make([]float64, 0, 65536) var totalReadResults int64 var cachedReadResults int64 @@ -604,7 +609,7 @@ func runBenchReadBulk(ctx context.Context, args []string, stdout, stderr io.Writ steadyStart := time.Now() steadyDeadline := steadyStart.Add(time.Duration(*durationSeconds) * time.Second) - for time.Now().Before(steadyDeadline) { + for time.Now().Before(steadyDeadline) && ctx.Err() == nil { callStart := time.Now() results, err := session.ReadBulk(ctx, serverHandle, tags, timeout) elapsed := time.Since(callStart) @@ -772,8 +777,15 @@ func runStreamEvents(ctx context.Context, args []string, stdout, stderr io.Write } defer client.Close() + // Mirror runGalaxyWatch so Ctrl+C on a long-running stream-events command + // cancels the gRPC stream cleanly (the gateway sees codes.Canceled rather + // than a torn TCP connection) and the deferred subscription.Close() / + // client.Close() actually run. 
+ signalCtx, stopSignals := signal.NotifyContext(ctx, os.Interrupt, syscall.SIGTERM) + defer stopSignals() + session := mxgateway.NewSessionForID(client, *sessionID) - streamCtx, cancelStream := context.WithCancel(ctx) + streamCtx, cancelStream := context.WithCancel(signalCtx) defer cancelStream() subscription, err := session.SubscribeEventsAfter(streamCtx, *after) if err != nil { @@ -956,31 +968,31 @@ func parseValue(valueType, valueText string) (*mxgateway.MxValue, error) { case "bool": value, err := strconv.ParseBool(valueText) if err != nil { - return nil, err + return nil, fmt.Errorf("invalid -value for -type %s: %q: %w", valueType, valueText, err) } return mxgateway.BoolValue(value), nil case "int32": value, err := strconv.ParseInt(valueText, 10, 32) if err != nil { - return nil, err + return nil, fmt.Errorf("invalid -value for -type %s: %q: %w", valueType, valueText, err) } return mxgateway.Int32Value(int32(value)), nil case "int64": value, err := strconv.ParseInt(valueText, 10, 64) if err != nil { - return nil, err + return nil, fmt.Errorf("invalid -value for -type %s: %q: %w", valueType, valueText, err) } return mxgateway.Int64Value(value), nil case "float": value, err := strconv.ParseFloat(valueText, 32) if err != nil { - return nil, err + return nil, fmt.Errorf("invalid -value for -type %s: %q: %w", valueType, valueText, err) } return mxgateway.FloatValue(float32(value)), nil case "double": value, err := strconv.ParseFloat(valueText, 64) if err != nil { - return nil, err + return nil, fmt.Errorf("invalid -value for -type %s: %q: %w", valueType, valueText, err) } return mxgateway.DoubleValue(value), nil case "string": @@ -1201,7 +1213,7 @@ func runGalaxyWatch(ctx context.Context, args []string, stdout, stderr io.Writer flags.SetOutput(stderr) common := bindCommonFlags(flags) jsonOutput := flags.Bool("json", false, "write JSON output") - lastSeen := flags.String("last-seen-deploy-time", "", "RFC3339 timestamp; when set, suppresses the bootstrap event") + 
lastSeen := flags.String("last-seen-deploy-time", "", "RFC 3339 timestamp (with optional fractional seconds); when set, suppresses the bootstrap event") limit := flags.Int("limit", 0, "maximum events to read; 0 means unbounded (Ctrl+C to stop)") if err := flags.Parse(args); err != nil { @@ -1210,7 +1222,11 @@ func runGalaxyWatch(ctx context.Context, args []string, stdout, stderr io.Writer var lastSeenPtr *time.Time if *lastSeen != "" { - parsed, err := time.Parse(time.RFC3339, *lastSeen) + // Use RFC3339Nano so values copy-pasted from galaxy-watch -json output + // (which formatDeployEvent emits with fractional seconds) round-trip; + // RFC3339Nano also accepts whole-second values, so the layout switch is + // strictly broader than the previous time.RFC3339 parse. + parsed, err := time.Parse(time.RFC3339Nano, *lastSeen) if err != nil { return fmt.Errorf("invalid -last-seen-deploy-time: %w", err) } diff --git a/clients/go/cmd/mxgw-go/main_test.go b/clients/go/cmd/mxgw-go/main_test.go index f5f5604..3d236df 100644 --- a/clients/go/cmd/mxgw-go/main_test.go +++ b/clients/go/cmd/mxgw-go/main_test.go @@ -3,6 +3,7 @@ package main import ( "bytes" "encoding/json" + "errors" "strings" "testing" ) @@ -85,3 +86,166 @@ func TestParseInt32ListReturnsErrorOnMalformedToken(t *testing.T) { t.Fatalf("parseInt32List() error = %q, want it to name the bad token", err.Error()) } } + +// TestParseValueWrapsStrconvErrorWithFlagContext pins Client.Go-017: each +// typed branch of parseValue wraps the bare strconv error with `%w` and names +// the offending flag and value, so the CLI surface is consistent with +// parseInt32List ("invalid item handle %q: %w") and parseRfc3339Timestamp +// ("invalid RFC 3339 timestamp %q: %w"). 
+func TestParseValueWrapsStrconvErrorWithFlagContext(t *testing.T) { + cases := []struct { + valueType string + valueText string + }{ + {"bool", "notabool"}, + {"int32", "foo"}, + {"int64", "foo"}, + {"float", "notafloat"}, + {"double", "notadouble"}, + } + for _, tc := range cases { + t.Run(tc.valueType, func(t *testing.T) { + _, err := parseValue(tc.valueType, tc.valueText) + if err == nil { + t.Fatalf("parseValue(%q, %q) error = nil, want a parse error", tc.valueType, tc.valueText) + } + msg := err.Error() + if !strings.Contains(msg, "-value") { + t.Fatalf("parseValue() error = %q, want it to name the -value flag", msg) + } + if !strings.Contains(msg, tc.valueType) { + t.Fatalf("parseValue() error = %q, want it to name the type %q", msg, tc.valueType) + } + if !strings.Contains(msg, tc.valueText) { + t.Fatalf("parseValue() error = %q, want it to name the bad token %q", msg, tc.valueText) + } + // errors.Unwrap must reach the underlying strconv error so callers + // can still errors.Is/As against strconv.ErrSyntax if they care. + if errors.Unwrap(err) == nil { + t.Fatalf("parseValue() returned unwrapped error %q, want a %%w wrap", msg) + } + }) + } +} + +// TestRunWriteBulkVariantGatesSecuredFlags pins the Client.Go-015 fix at the +// CLI surface: secured-only flags (-current-user-id, -verifier-user-id) must +// not be registered on the non-secured variants, and -user-id must not be +// registered on the secured variants. The flag package rejects an unknown +// flag with "flag provided but not defined", which a future refactor that +// re-broadens flag registration would silently undo without this test. 
+func TestRunWriteBulkVariantGatesSecuredFlags(t *testing.T) { + cases := []struct { + name string + command string + flag string + }{ + {"write-bulk rejects -current-user-id", "write-bulk", "-current-user-id"}, + {"write-bulk rejects -verifier-user-id", "write-bulk", "-verifier-user-id"}, + {"write2-bulk rejects -current-user-id", "write2-bulk", "-current-user-id"}, + {"write-secured-bulk rejects -user-id", "write-secured-bulk", "-user-id"}, + {"write-secured2-bulk rejects -user-id", "write-secured2-bulk", "-user-id"}, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + var stdout, stderr bytes.Buffer + err := runWithIO(t.Context(), []string{ + tc.command, + "-plaintext", + "-session-id", "sess", + "-server-handle", "1", + "-item-handles", "1", + "-values", "1", + tc.flag, "1", + }, &stdout, &stderr) + if err == nil { + t.Fatalf("runWithIO(%s %s) error = nil, want flag-not-defined", tc.command, tc.flag) + } + combined := err.Error() + stderr.String() + if !strings.Contains(combined, "flag provided but not defined") { + t.Fatalf("runWithIO(%s %s) error/stderr = %q, want 'flag provided but not defined'", tc.command, tc.flag, combined) + } + }) + } +} + +// TestRunReadBulkRejectsMissingArgs pins the "session-id and items are +// required" validation in runReadBulk before any network dial happens. 
+func TestRunReadBulkRejectsMissingArgs(t *testing.T) { + cases := []struct { + name string + args []string + }{ + {"no flags", []string{"read-bulk"}}, + {"missing items", []string{"read-bulk", "-plaintext", "-session-id", "sess"}}, + {"missing session-id", []string{"read-bulk", "-plaintext", "-items", "Tag.Attr"}}, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + var stdout, stderr bytes.Buffer + err := runWithIO(t.Context(), tc.args, &stdout, &stderr) + if err == nil { + t.Fatalf("runWithIO(%v) error = nil, want validation error", tc.args) + } + if !strings.Contains(err.Error(), "session-id and items are required") { + t.Fatalf("runWithIO(%v) error = %q, want 'session-id and items are required'", tc.args, err.Error()) + } + }) + } +} + +// TestRunBenchReadBulkRejectsNonPositiveBulkSize pins the bulk-size>=1 check +// at runBenchReadBulk's flag-parsing stage so a future refactor cannot drop +// the positivity guard without breaking this test. +func TestRunBenchReadBulkRejectsNonPositiveBulkSize(t *testing.T) { + var stdout, stderr bytes.Buffer + err := runWithIO(t.Context(), []string{ + "bench-read-bulk", + "-plaintext", + "-bulk-size", "0", + }, &stdout, &stderr) + if err == nil { + t.Fatalf("runWithIO(bench-read-bulk -bulk-size 0) error = nil, want positivity error") + } + if !strings.Contains(err.Error(), "bulk-size must be positive") { + t.Fatalf("runWithIO error = %q, want 'bulk-size must be positive'", err.Error()) + } +} + +// TestRunBenchReadBulkRejectsNonPositiveDuration pins the duration-seconds>=1 +// check at runBenchReadBulk's flag-parsing stage. 
+func TestRunBenchReadBulkRejectsNonPositiveDuration(t *testing.T) { + var stdout, stderr bytes.Buffer + err := runWithIO(t.Context(), []string{ + "bench-read-bulk", + "-plaintext", + "-duration-seconds", "0", + }, &stdout, &stderr) + if err == nil { + t.Fatalf("runWithIO(bench-read-bulk -duration-seconds 0) error = nil, want positivity error") + } + if !strings.Contains(err.Error(), "duration-seconds must be positive") { + t.Fatalf("runWithIO error = %q, want 'duration-seconds must be positive'", err.Error()) + } +} + +// TestRunWriteBulkVariantRejectsMismatchedHandlesAndValues pins the explicit +// "item-handles count ... does not match values count ..." check at the CLI +// surface so the validation error surfaces before any dial happens. +func TestRunWriteBulkVariantRejectsMismatchedHandlesAndValues(t *testing.T) { + var stdout, stderr bytes.Buffer + err := runWithIO(t.Context(), []string{ + "write-bulk", + "-plaintext", + "-session-id", "sess", + "-server-handle", "1", + "-item-handles", "1,2,3", + "-values", "10,20", + }, &stdout, &stderr) + if err == nil { + t.Fatalf("runWithIO(write-bulk mismatched counts) error = nil, want mismatch error") + } + if !strings.Contains(err.Error(), "item-handles count") || !strings.Contains(err.Error(), "values count") { + t.Fatalf("runWithIO error = %q, want 'item-handles count ... 
values count ...'", err.Error()) + } +} diff --git a/clients/java/mxgateway-cli/src/main/java/com/dohertylan/mxgateway/cli/MxGatewayCli.java b/clients/java/mxgateway-cli/src/main/java/com/dohertylan/mxgateway/cli/MxGatewayCli.java index d211241..e722773 100644 --- a/clients/java/mxgateway-cli/src/main/java/com/dohertylan/mxgateway/cli/MxGatewayCli.java +++ b/clients/java/mxgateway-cli/src/main/java/com/dohertylan/mxgateway/cli/MxGatewayCli.java @@ -857,6 +857,10 @@ public final class MxGatewayCli implements Callable { try { List results = session.readBulk(serverHandle, tags, timeoutMs); long elapsed = System.nanoTime() - callStart; + // Only record successful-call latencies — including failed-call + // durations would pollute the p50/p95/p99 percentile summary + // (Client.Java-024, mirrors Client.Rust-015). The cross-language + // bench matrix expects success-only latency histograms. if (latencyCount >= latenciesNanos.length) { long[] grown = new long[latenciesNanos.length * 2]; System.arraycopy(latenciesNanos, 0, grown, 0, latencyCount); @@ -871,13 +875,9 @@ public final class MxGatewayCli implements Callable { } } } catch (Exception ex) { - long elapsed = System.nanoTime() - callStart; - if (latencyCount >= latenciesNanos.length) { - long[] grown = new long[latenciesNanos.length * 2]; - System.arraycopy(latenciesNanos, 0, grown, 0, latencyCount); - latenciesNanos = grown; - } - latenciesNanos[latencyCount++] = elapsed; + // Failed-call duration is intentionally NOT recorded into + // the success-latency histogram — only count the failure so + // the failedCalls JSON field reflects it. failed++; } } @@ -1051,7 +1051,13 @@ public final class MxGatewayCli implements Callable { if (json) { client.out().println(protoJson(event)); } else { - client.out().printf("%d %s%n", event.getWorkerSequence(), event.getFamily()); + // worker_sequence is a proto uint64 — print as unsigned so + // values past 2^63 do not render as negative signed longs. 
+ // JSON path goes through JsonFormat which already does this. + client.out().printf( + "%s %s%n", + Long.toUnsignedString(event.getWorkerSequence()), + event.getFamily()); } count++; if (limit > 0 && count >= limit) { @@ -1134,6 +1140,12 @@ public final class MxGatewayCli implements Callable { @Option(names = "--timeout", defaultValue = "30s", description = "Per-call timeout.") String timeout; + @Option( + names = "--shutdown-timeout", + description = + "Channel shutdown timeout (e.g. 10s, 500ms). When unset, the library default applies.") + String shutdownTimeout; + /** * Returns this options object unchanged. * @@ -1173,15 +1185,35 @@ public final class MxGatewayCli implements Callable { return parseDuration(timeout); } + /** + * Resolves the effective channel-shutdown timeout from the + * {@code --shutdown-timeout} option, or {@code null} when the user did + * not pass one (in which case the {@link MxGatewayClientOptions} + * default applies). Computed on each call so there is no stale cached + * state. + * + * @return the resolved shutdown timeout, or {@code null} when unset + */ + Duration resolvedShutdownTimeout() { + if (shutdownTimeout == null || shutdownTimeout.isBlank()) { + return null; + } + return parseDuration(shutdownTimeout); + } + MxGatewayClientOptions toClientOptions() { - return MxGatewayClientOptions.builder() + MxGatewayClientOptions.Builder builder = MxGatewayClientOptions.builder() .endpoint(endpoint) .apiKey(resolvedApiKey()) .plaintext(plaintext) .caCertificatePath(caFile) .serverNameOverride(serverNameOverride) - .callTimeout(resolvedTimeout()) - .build(); + .callTimeout(resolvedTimeout()); + Duration resolvedShutdownTimeout = resolvedShutdownTimeout(); + if (resolvedShutdownTimeout != null) { + builder.shutdownTimeout(resolvedShutdownTimeout); + } + return builder.build(); } Map redactedJsonMap() { @@ -1193,6 +1225,8 @@ public final class MxGatewayCli implements Callable { values.put("caFile", caFile == null ? 
"" : caFile.toString()); values.put("serverNameOverride", serverNameOverride); values.put("timeout", timeout); + Duration resolvedShutdownTimeout = resolvedShutdownTimeout(); + values.put("shutdownTimeout", resolvedShutdownTimeout == null ? "" : resolvedShutdownTimeout.toString()); return values; } } diff --git a/clients/java/mxgateway-cli/src/test/java/com/dohertylan/mxgateway/cli/MxGatewayCliTests.java b/clients/java/mxgateway-cli/src/test/java/com/dohertylan/mxgateway/cli/MxGatewayCliTests.java index bcbdef6..c51d12a 100644 --- a/clients/java/mxgateway-cli/src/test/java/com/dohertylan/mxgateway/cli/MxGatewayCliTests.java +++ b/clients/java/mxgateway-cli/src/test/java/com/dohertylan/mxgateway/cli/MxGatewayCliTests.java @@ -149,6 +149,21 @@ final class MxGatewayCliTests { assertFalse(text.contains("seq=-1"), "must not render as signed -1"); } + @Test + void streamEventsWorkerSequenceRendersAsUnsignedForHighUint64() { + // Client.Java-023 regression: stream-events text output now uses + // Long.toUnsignedString to format the proto uint64 worker_sequence + // field, mirroring the Client.Java-020 fix for DeployEvent.sequence. + long highUnsigned = -1L; // bit-pattern for 2^64 - 1, i.e. 
18446744073709551615 unsigned + String text = String.format( + "%s %s", + Long.toUnsignedString(highUnsigned), + "MX_EVENT_FAMILY_DATA_CHANGE"); + + assertTrue(text.startsWith("18446744073709551615 "), "expected unsigned rendering, got: " + text); + assertFalse(text.startsWith("-1 "), "must not render as signed -1"); + } + @Test void unsubscribeBulkCommandPrintsResults() { CliRun run = execute( @@ -168,6 +183,209 @@ final class MxGatewayCliTests { assertTrue(run.output().contains("\"wasSuccessful\":true")); } + // ---- Client.Java-026: CLI-level coverage for bulk subcommands ---- + + @Test + void readBulkCommandForwardsTimeoutAndPrintsResults() { + FakeClientFactory factory = new FakeClientFactory(); + CliRun run = execute( + factory, + "read-bulk", + "--session-id", + "session-cli", + "--server-handle", + "42", + "--items", + "TestMachine_001.TestChangingInt,TestMachine_002.TestChangingInt", + "--timeout-ms", + "750", + "--json"); + + assertEquals(0, run.exitCode()); + assertEquals(750, factory.client.session.lastReadBulkTimeoutMs); + assertEquals(2, factory.client.session.lastReadBulkItems.size()); + assertTrue(run.output().contains("\"command\":\"read-bulk\"")); + assertTrue(run.output().contains("\"tagAddress\":\"TestMachine_001.TestChangingInt\"")); + assertTrue(run.output().contains("\"itemHandle\":200")); + assertTrue(run.output().contains("\"wasCached\":true")); + assertTrue(run.output().contains("\"quality\":192")); + } + + @Test + void writeBulkCommandParsesTypedValuesAndPrintsResults() { + FakeClientFactory factory = new FakeClientFactory(); + CliRun run = execute( + factory, + "write-bulk", + "--session-id", + "session-cli", + "--server-handle", + "42", + "--item-handles", + "100,101", + "--type", + "int32", + "--values", + "111,222", + "--user-id", + "5", + "--json"); + + assertEquals(0, run.exitCode()); + assertEquals(2, factory.client.session.lastWriteBulkEntries.size()); + assertEquals(111, 
factory.client.session.lastWriteBulkEntries.get(0).getValue().getInt32Value()); + assertEquals(222, factory.client.session.lastWriteBulkEntries.get(1).getValue().getInt32Value()); + assertEquals(5, factory.client.session.lastWriteBulkEntries.get(0).getUserId()); + assertTrue(run.output().contains("\"command\":\"write-bulk\"")); + assertTrue(run.output().contains("\"itemHandle\":100")); + assertTrue(run.output().contains("\"wasSuccessful\":true")); + } + + @Test + void write2BulkCommandForwardsTimestampAndPrintsResults() { + FakeClientFactory factory = new FakeClientFactory(); + CliRun run = execute( + factory, + "write2-bulk", + "--session-id", + "session-cli", + "--server-handle", + "42", + "--item-handles", + "100", + "--type", + "string", + "--values", + "hello", + "--timestamp", + "2026-05-20T00:00:00Z", + "--json"); + + assertEquals(0, run.exitCode()); + assertEquals(1, factory.client.session.lastWrite2BulkEntries.size()); + assertEquals( + "hello", + factory.client.session.lastWrite2BulkEntries.get(0).getValue().getStringValue()); + assertTrue( + factory.client.session.lastWrite2BulkEntries.get(0).hasTimestampValue(), + "expected timestampValue to be forwarded"); + assertTrue(run.output().contains("\"command\":\"write2-bulk\"")); + assertTrue(run.output().contains("\"itemHandle\":100")); + assertTrue(run.output().contains("\"wasSuccessful\":true")); + } + + @Test + void writeSecuredBulkCommandForwardsUserIdsAndPrintsResults() { + FakeClientFactory factory = new FakeClientFactory(); + CliRun run = execute( + factory, + "write-secured-bulk", + "--session-id", + "session-cli", + "--server-handle", + "42", + "--item-handles", + "100", + "--type", + "int32", + "--values", + "9", + "--current-user-id", + "7", + "--verifier-user-id", + "8", + "--json"); + + assertEquals(0, run.exitCode()); + assertEquals(1, factory.client.session.lastWriteSecuredBulkEntries.size()); + assertEquals(7, factory.client.session.lastWriteSecuredBulkEntries.get(0).getCurrentUserId()); + 
assertEquals(8, factory.client.session.lastWriteSecuredBulkEntries.get(0).getVerifierUserId()); + assertEquals(9, factory.client.session.lastWriteSecuredBulkEntries.get(0).getValue().getInt32Value()); + assertTrue(run.output().contains("\"command\":\"write-secured-bulk\"")); + assertTrue(run.output().contains("\"wasSuccessful\":true")); + } + + @Test + void writeSecured2BulkCommandForwardsTimestampAndUserIdsAndPrintsResults() { + FakeClientFactory factory = new FakeClientFactory(); + CliRun run = execute( + factory, + "write-secured2-bulk", + "--session-id", + "session-cli", + "--server-handle", + "42", + "--item-handles", + "100", + "--type", + "string", + "--values", + "value", + "--timestamp", + "2026-05-20T00:00:00Z", + "--current-user-id", + "7", + "--verifier-user-id", + "8", + "--json"); + + assertEquals(0, run.exitCode()); + assertEquals(1, factory.client.session.lastWriteSecured2BulkEntries.size()); + assertEquals(7, factory.client.session.lastWriteSecured2BulkEntries.get(0).getCurrentUserId()); + assertEquals(8, factory.client.session.lastWriteSecured2BulkEntries.get(0).getVerifierUserId()); + assertTrue( + factory.client.session.lastWriteSecured2BulkEntries.get(0).hasTimestampValue(), + "expected timestampValue to be forwarded"); + assertTrue(run.output().contains("\"command\":\"write-secured2-bulk\"")); + assertTrue(run.output().contains("\"wasSuccessful\":true")); + } + + @Test + void benchReadBulkCommandEmitsJsonSchemaKeys() { + // Short bench window (1 s steady, 0 s warmup) keeps the test fast; we assert + // the JSON schema rather than numeric values so the cross-language matrix + // (.NET / Go / Rust / Python) and the Java path agree on the output shape. 
+ FakeClientFactory factory = new FakeClientFactory(); + CliRun run = execute( + factory, + "bench-read-bulk", + "--duration-seconds", + "1", + "--warmup-seconds", + "0", + "--bulk-size", + "2", + "--tag-start", + "1", + "--tag-prefix", + "TestMachine_", + "--tag-attribute", + "TestChangingInt", + "--timeout-ms", + "100", + "--json"); + + assertEquals(0, run.exitCode()); + String output = run.output(); + assertTrue(output.contains("\"language\":\"java\""), output); + assertTrue(output.contains("\"command\":\"bench-read-bulk\""), output); + assertTrue(output.contains("\"bulkSize\":2"), output); + assertTrue(output.contains("\"durationSeconds\":1"), output); + assertTrue(output.contains("\"warmupSeconds\":0"), output); + assertTrue(output.contains("\"totalCalls\":"), output); + assertTrue(output.contains("\"successfulCalls\":"), output); + assertTrue(output.contains("\"failedCalls\":"), output); + assertTrue(output.contains("\"callsPerSecond\":"), output); + assertTrue(output.contains("\"latencyMs\":"), output); + assertTrue(output.contains("\"p50\":"), output); + assertTrue(output.contains("\"p95\":"), output); + assertTrue(output.contains("\"p99\":"), output); + assertTrue(output.contains("\"tags\":"), output); + // Bench tag synthesis: TestMachine_001.TestChangingInt, TestMachine_002.TestChangingInt. + assertTrue(output.contains("TestMachine_001.TestChangingInt"), output); + assertTrue(output.contains("TestMachine_002.TestChangingInt"), output); + } + private static CliRun execute(MxGatewayCli.MxGatewayCliClientFactory factory, String... args) { StringWriter output = new StringWriter(); StringWriter errors = new StringWriter(); @@ -322,29 +540,89 @@ final class MxGatewayCliTests { return results; } + // Recorded so tests can assert the CLI forwarded the parsed options through to + // the session interface. The bulk subcommands return at least one result so the + // JSON output assertions exercise the *Map serialisers in MxGatewayCli. 
+ + private int lastReadBulkTimeoutMs; + private List lastReadBulkItems = new ArrayList<>(); + private List lastWriteBulkEntries = new ArrayList<>(); + private List lastWrite2BulkEntries = new ArrayList<>(); + private List lastWriteSecuredBulkEntries = new ArrayList<>(); + private List lastWriteSecured2BulkEntries = new ArrayList<>(); + @Override public List readBulk(int serverHandle, List items, int timeoutMs) { - return new ArrayList<>(); + lastReadBulkTimeoutMs = timeoutMs; + lastReadBulkItems = items; + List results = new ArrayList<>(); + for (int index = 0; index < items.size(); index++) { + results.add(BulkReadResult.newBuilder() + .setServerHandle(serverHandle) + .setTagAddress(items.get(index)) + .setItemHandle(200 + index) + .setWasSuccessful(true) + .setWasCached(index % 2 == 0) + .setQuality(192) + .build()); + } + return results; } @Override public List writeBulk(int serverHandle, List entries) { - return new ArrayList<>(); + lastWriteBulkEntries = entries; + List results = new ArrayList<>(); + for (WriteBulkEntry entry : entries) { + results.add(BulkWriteResult.newBuilder() + .setServerHandle(serverHandle) + .setItemHandle(entry.getItemHandle()) + .setWasSuccessful(true) + .build()); + } + return results; } @Override public List write2Bulk(int serverHandle, List entries) { - return new ArrayList<>(); + lastWrite2BulkEntries = entries; + List results = new ArrayList<>(); + for (Write2BulkEntry entry : entries) { + results.add(BulkWriteResult.newBuilder() + .setServerHandle(serverHandle) + .setItemHandle(entry.getItemHandle()) + .setWasSuccessful(true) + .build()); + } + return results; } @Override public List writeSecuredBulk(int serverHandle, List entries) { - return new ArrayList<>(); + lastWriteSecuredBulkEntries = entries; + List results = new ArrayList<>(); + for (WriteSecuredBulkEntry entry : entries) { + results.add(BulkWriteResult.newBuilder() + .setServerHandle(serverHandle) + .setItemHandle(entry.getItemHandle()) + .setWasSuccessful(true) + 
.build()); + } + return results; } @Override public List writeSecured2Bulk(int serverHandle, List entries) { - return new ArrayList<>(); + lastWriteSecured2BulkEntries = entries; + List results = new ArrayList<>(); + for (WriteSecured2BulkEntry entry : entries) { + results.add(BulkWriteResult.newBuilder() + .setServerHandle(serverHandle) + .setItemHandle(entry.getItemHandle()) + .setWasSuccessful(true) + .build()); + } + return results; } @Override diff --git a/clients/java/mxgateway-client/src/main/java/com/dohertylan/mxgateway/client/DeployEventStream.java b/clients/java/mxgateway-client/src/main/java/com/dohertylan/mxgateway/client/DeployEventStream.java index 41d41b6..a85bc84 100644 --- a/clients/java/mxgateway-client/src/main/java/com/dohertylan/mxgateway/client/DeployEventStream.java +++ b/clients/java/mxgateway-client/src/main/java/com/dohertylan/mxgateway/client/DeployEventStream.java @@ -11,20 +11,29 @@ import java.util.NoSuchElementException; import java.util.Objects; import java.util.concurrent.ArrayBlockingQueue; import java.util.concurrent.BlockingQueue; -import java.util.concurrent.atomic.AtomicBoolean; /** * Iterator-style adaptor over the {@code WatchDeployEvents} server-streaming * RPC. Mirrors {@link MxEventStream}: events arrive on a background gRPC thread * and are buffered in a bounded blocking queue; the iterator drains them. * Closing the stream cancels the underlying gRPC call. + * + *

Threading: the iterator methods ({@link #hasNext()} and + * {@link #next()}) are not thread-safe and must be driven by a single + * consumer thread. {@link #close()} may be called from any thread. Terminal + * state transitions (queue overflow, server completion, and {@code close()}) + * are serialised so that the first terminal condition wins deterministically: + * once an overflow exception has been observed it is never silently replaced + * by an end-of-stream marker. */ public final class DeployEventStream implements Iterator, AutoCloseable { private static final Object END = new Object(); private final BlockingQueue queue; - private final AtomicBoolean closed = new AtomicBoolean(); + private final Object terminalLock = new Object(); private volatile ClientCallStreamObserver requestStream; + private volatile boolean closed; + private boolean terminated; private Object next; DeployEventStream(int capacity) { @@ -36,7 +45,7 @@ public final class DeployEventStream implements Iterator, AutoClose @Override public void beforeStart(ClientCallStreamObserver requestStream) { DeployEventStream.this.requestStream = requestStream; - if (closed.get()) { + if (closed) { requestStream.cancel("client cancelled deploy event stream", null); } } @@ -48,7 +57,7 @@ public final class DeployEventStream implements Iterator, AutoClose @Override public void onError(Throwable error) { - if (Status.fromThrowable(error).getCode() == Status.Code.CANCELLED && closed.get()) { + if (Status.fromThrowable(error).getCode() == Status.Code.CANCELLED && closed) { offer(END); return; } @@ -94,12 +103,12 @@ public final class DeployEventStream implements Iterator, AutoClose @Override public void close() { - closed.set(true); + closed = true; ClientCallStreamObserver stream = requestStream; if (stream != null) { stream.cancel("client cancelled deploy event stream", null); } - offer(END); + terminate(null); } private Object take() { @@ -117,10 +126,7 @@ public final class DeployEventStream implements 
Iterator, AutoClose private void offer(Object value) { Objects.requireNonNull(value, "value"); if (value == END) { - if (!queue.offer(value)) { - queue.clear(); - queue.offer(value); - } + terminate(null); return; } if (!queue.offer(value)) { @@ -128,9 +134,40 @@ public final class DeployEventStream implements Iterator, AutoClose if (stream != null) { stream.cancel("client deploy event stream queue overflowed", null); } - queue.clear(); - queue.offer(new MxGatewayException("galaxy watch deploy events queue overflowed")); - queue.offer(END); + terminate(new MxGatewayException("galaxy watch deploy events queue overflowed")); + } + } + + /** + * Drives the single terminal transition. The first caller wins: a later + * end-of-stream or {@code close()} cannot overwrite or discard an overflow + * exception that has already been published to the consumer. Mirrors the + * {@link MxEventStream#terminate} contract — see Client.Java-002 for the + * race this guards against. + * + * @param fault the fault to surface to the consumer, or {@code null} for a + * clean end-of-stream + */ + private void terminate(MxGatewayException fault) { + synchronized (terminalLock) { + if (terminated) { + return; + } + terminated = true; + if (fault != null) { + // Make room for the fault marker; the consumer only needs the + // terminal signal, queued data events are no longer relevant. + queue.clear(); + queue.offer(fault); + queue.offer(END); + return; + } + // Clean end-of-stream: ensure the END marker is delivered even when + // the queue is currently full of undrained data events. 
+ if (!queue.offer(END)) { + queue.clear(); + queue.offer(END); + } } } } diff --git a/clients/java/mxgateway-client/src/main/java/com/dohertylan/mxgateway/client/MxGatewayChannels.java b/clients/java/mxgateway-client/src/main/java/com/dohertylan/mxgateway/client/MxGatewayChannels.java index 2e55935..4bca8c5 100644 --- a/clients/java/mxgateway-client/src/main/java/com/dohertylan/mxgateway/client/MxGatewayChannels.java +++ b/clients/java/mxgateway-client/src/main/java/com/dohertylan/mxgateway/client/MxGatewayChannels.java @@ -160,16 +160,37 @@ final class MxGatewayChannels { * *

Cancellation contract: the returned future is a * {@link CancellingCompletableFuture} that overrides - * {@link CompletableFuture#cancel(boolean)} so cancellation always forwards - * to the source {@link ListenableFuture}, even when callers wrap the - * future in additional {@code thenApply}/{@code thenCompose} stages. The - * historical {@code whenComplete}-based forwarder was buggy because - * {@code thenApply} returns a new {@code CompletableFuture} whose - * cancellation does not propagate back to this future; with the - * override-based design, calling {@code cancel(true)} on either the - * direct return value or the user-facing chained future ultimately - * invokes {@code source.cancel(true)} (chained futures forward to the - * upstream stage they were derived from, which is this future). + * {@link CompletableFuture#cancel(boolean)} so cancelling the + * direct return value forwards to the source + * {@link ListenableFuture}, aborting the underlying gRPC call. This is the + * fix for Client.Java-015. + * + *

Important — derived stages do not propagate + * cancellation upstream. Calling + * {@code cancel(...)} on a future obtained via + * {@code thenApply}/{@code thenCompose}/{@code thenAccept}/{@code whenComplete} + * of the value returned by this method only marks that derived stage + * as cancelled; it does not propagate back to this + * {@code CancellingCompletableFuture}, so the source RPC continues until its + * deadline expires. {@link CompletableFuture#thenApply} (and the other + * chaining methods) deliberately do not forward cancellation to the upstream + * stage they were derived from. + * + *

If a caller needs cancellation through a chained pipeline, either: + *

    + *
  • use the {@link #toCompletable(ListenableFuture, String, Function)} + * overload below, which inlines a validator into the + * {@code FutureCallback} so the user-visible future is the same + * future cancellation is bound to (this is what the {@code *Async} + * methods on {@link MxGatewayClient} and the unary methods on + * {@link GalaxyRepositoryClient} do); or
  • + *
  • follow {@link GalaxyRepositoryClient#discoverHierarchyAsync}'s + * pattern of returning a custom {@link CompletableFuture} subclass + * that tracks the current in-flight stage via an + * {@link java.util.concurrent.atomic.AtomicReference} and forwards + * {@code cancel(...)} to it (necessary when chaining + * {@code thenCompose} stages across paged calls).
  • + *
* * @param source the gRPC future-stub result * @param operation the operation name used in normalised error messages diff --git a/clients/java/mxgateway-client/src/test/java/com/dohertylan/mxgateway/client/GalaxyRepositoryClientTests.java b/clients/java/mxgateway-client/src/test/java/com/dohertylan/mxgateway/client/GalaxyRepositoryClientTests.java index 034e43d..dcd3642 100644 --- a/clients/java/mxgateway-client/src/test/java/com/dohertylan/mxgateway/client/GalaxyRepositoryClientTests.java +++ b/clients/java/mxgateway-client/src/test/java/com/dohertylan/mxgateway/client/GalaxyRepositoryClientTests.java @@ -175,6 +175,64 @@ final class GalaxyRepositoryClientTests { assertFalse(stream.hasNext()); } + @Test + void deployEventStreamOverflowExceptionSurvivesASubsequentClose() { + // Client.Java-021 regression: mirror Client.Java-002's terminal-state + // serialisation in DeployEventStream — an overflow enqueues the overflow + // exception, and a later close() must NOT discard it. The first terminal + // condition (overflow) must win and stay observable by next(). + DeployEventStream stream = new DeployEventStream(2); + ClientResponseObserver observer = stream.observer(); + observer.beforeStart(new RecordingClientCallStreamObserver()); + + // Force a queue overflow on a capacity-2 stream. + for (int i = 0; i < 8; i++) { + observer.onNext(DeployEvent.newBuilder().setSequence(i).build()); + } + + // A close() arriving after the overflow must not erase the overflow signal. 
+ stream.close(); + + MxGatewayException error = assertThrows(MxGatewayException.class, () -> { + while (stream.hasNext()) { + stream.next(); + } + }); + assertTrue(error.getMessage().contains("overflow"), error::getMessage); + } + + @Test + void deployEventStreamConcurrentOverflowAndCloseAlwaysTerminate() throws Exception { + // Client.Java-021 regression: the terminal-state transition must be + // serialised so whatever the interleaving of overflow and close, + // hasNext() always reaches a terminal state (no stuck consumer). + for (int iteration = 0; iteration < 300; iteration++) { + DeployEventStream stream = new DeployEventStream(2); + ClientResponseObserver observer = stream.observer(); + observer.beforeStart(new RecordingClientCallStreamObserver()); + + Thread filler = new Thread(() -> { + for (int i = 0; i < 8; i++) { + observer.onNext(DeployEvent.newBuilder().setSequence(i).build()); + } + }); + Thread closer = new Thread(stream::close); + filler.start(); + closer.start(); + filler.join(); + closer.join(); + + try { + while (stream.hasNext()) { + stream.next(); + } + } catch (MxGatewayException expected) { + assertTrue(expected.getMessage().contains("overflow"), expected::getMessage); + } + assertFalse(stream.hasNext()); + } + } + @Test void discoverHierarchyRejectsRepeatedPageToken() throws Exception { TestService service = new TestService() { diff --git a/clients/java/src/main/generated/main/java/mxaccess_gateway/v1/MxaccessGateway.java b/clients/java/src/main/generated/main/java/mxaccess_gateway/v1/MxaccessGateway.java index 9560f31..9fdc436 100644 --- a/clients/java/src/main/generated/main/java/mxaccess_gateway/v1/MxaccessGateway.java +++ b/clients/java/src/main/generated/main/java/mxaccess_gateway/v1/MxaccessGateway.java @@ -59287,9 +59287,11 @@ public final class MxaccessGateway extends com.google.protobuf.GeneratedFile { *
    * Per-item result for the four bulk write families. `item_handle` mirrors the
    * request entry's item_handle so callers can correlate inputs to outputs even
-   * when the gateway's tag-allowlist filter dropped some entries before reaching
-   * the worker. Per-item failures populate `error_message` + `hresult` and never
-   * raise — callers iterate and inspect each entry.
+   * when the gateway's per-entry `IConstraintEnforcer.CheckWriteHandleAsync`
+   * filter (see `MxAccessGatewayService.ReplaceWriteBulkEntries` and
+   * `docs/Authorization.md`) dropped some entries before reaching the worker.
+   * Per-item failures populate `error_message` + `hresult` and never raise —
+   * callers iterate and inspect each entry.
    * 
* * Protobuf type {@code mxaccess_gateway.v1.BulkWriteResult} @@ -59686,9 +59688,11 @@ public final class MxaccessGateway extends com.google.protobuf.GeneratedFile { *
      * Per-item result for the four bulk write families. `item_handle` mirrors the
      * request entry's item_handle so callers can correlate inputs to outputs even
-     * when the gateway's tag-allowlist filter dropped some entries before reaching
-     * the worker. Per-item failures populate `error_message` + `hresult` and never
-     * raise — callers iterate and inspect each entry.
+     * when the gateway's per-entry `IConstraintEnforcer.CheckWriteHandleAsync`
+     * filter (see `MxAccessGatewayService.ReplaceWriteBulkEntries` and
+     * `docs/Authorization.md`) dropped some entries before reaching the worker.
+     * Per-item failures populate `error_message` + `hresult` and never raise —
+     * callers iterate and inspect each entry.
      * 
* * Protobuf type {@code mxaccess_gateway.v1.BulkWriteResult} @@ -61295,6 +61299,20 @@ public final class MxaccessGateway extends com.google.protobuf.GeneratedFile { * an existing live subscription's last OnDataChange (the worker did not touch * the subscription); false when the worker took the AddItem + Advise + wait + * UnAdvise + RemoveItem snapshot lifecycle itself. + * + * On `was_successful = true`, `value`, `quality`, `source_timestamp`, and + * `statuses` carry the read data (from the cached subscription or the snapshot + * lifecycle, depending on `was_cached`) and `error_message` is empty. On + * `was_successful = false`, only `server_handle`, `tag_address`, `item_handle` + * (when allocated), `was_cached`, and `error_message` are populated; `value`, + * `quality`, `source_timestamp`, and `statuses` are left at their proto3 + * defaults (null / 0 / null / empty) and must not be read as data — they are + * wire-indistinguishable from "value is null with quality bad" data and serve + * only as absent markers. ReadBulk has no `hresult` field by design (its + * outcomes are timeout / cache / lifecycle states, not MXAccess COM return + * codes — see `docs/DesignDecisions.md` "Bulk Command Family"). Per-tag + * failures populate `error_message` and never raise — callers iterate and + * inspect each entry. * * * Protobuf type {@code mxaccess_gateway.v1.BulkReadResult} @@ -61837,6 +61855,20 @@ public final class MxaccessGateway extends com.google.protobuf.GeneratedFile { * an existing live subscription's last OnDataChange (the worker did not touch * the subscription); false when the worker took the AddItem + Advise + wait + * UnAdvise + RemoveItem snapshot lifecycle itself. + * + * On `was_successful = true`, `value`, `quality`, `source_timestamp`, and + * `statuses` carry the read data (from the cached subscription or the snapshot + * lifecycle, depending on `was_cached`) and `error_message` is empty. 
On + * `was_successful = false`, only `server_handle`, `tag_address`, `item_handle` + * (when allocated), `was_cached`, and `error_message` are populated; `value`, + * `quality`, `source_timestamp`, and `statuses` are left at their proto3 + * defaults (null / 0 / null / empty) and must not be read as data — they are + * wire-indistinguishable from "value is null with quality bad" data and serve + * only as absent markers. ReadBulk has no `hresult` field by design (its + * outcomes are timeout / cache / lifecycle states, not MXAccess COM return + * codes — see `docs/DesignDecisions.md` "Bulk Command Family"). Per-tag + * failures populate `error_message` and never raise — callers iterate and + * inspect each entry. * * * Protobuf type {@code mxaccess_gateway.v1.BulkReadResult} diff --git a/clients/python/README.md b/clients/python/README.md index f357ab5..fa559a6 100644 --- a/clients/python/README.md +++ b/clients/python/README.md @@ -256,6 +256,31 @@ Use TLS options for a secured gateway: mxgw-py smoke --endpoint mxgateway.example.local:5001 --tls --ca-file C:\certs\mxgateway-ca.pem --server-name-override mxgateway.example.local --api-key-env MXGATEWAY_API_KEY --item Object.Attribute --json ``` +### CLI Parity Gaps + +The `mxgw-py` CLI does not currently ship the Galaxy Repository +subcommands that the .NET (`mxgw`), Go (`mxgw-go`), Rust (`mxgw`), and +Java (`mxgw-java`) CLIs expose: + +- `galaxy-test-connection` — ping the Galaxy Repository SQL DB. +- `galaxy-last-deploy` — fetch the last deploy timestamp. +- `galaxy-discover` — enumerate the deployed object hierarchy with + attributes. +- `galaxy-watch` — stream `DeployEvent`s as the Galaxy is re-deployed. + +The Python `GalaxyRepositoryClient` library wrapper is fully +implemented and exercised by `tests/test_galaxy.py` and +`tests/test_galaxy_iter_hierarchy.py` — use the library API (see +[Galaxy Repository Browse](#galaxy-repository-browse) above) when +calling these RPCs from Python. 
The four CLI subcommands above are a +forward-looking parity item; see the matching .NET / Go / Rust / Java +CLI implementations for the expected JSON shape when they are added. + +The .NET CLI also ships `bench-stream-events`, which is .NET-only today +and not yet present in Go / Rust / Java / Python. It will need +matching coverage if the cross-language benchmark matrix grows a +stream-events driver under `scripts/`. + ## Integration Checks Run live checks only when a gateway and MXAccess-backed worker are available: diff --git a/clients/python/pyproject.toml b/clients/python/pyproject.toml index 224a8ac..353b39d 100644 --- a/clients/python/pyproject.toml +++ b/clients/python/pyproject.toml @@ -8,7 +8,6 @@ version = "0.1.0" description = "Async Python client for MXAccess Gateway." readme = "README.md" requires-python = ">=3.12" -license = "Proprietary" authors = [ { name = "MXAccess Gateway Authors" }, ] @@ -24,6 +23,7 @@ classifiers = [ "Development Status :: 4 - Beta", "Intended Audience :: Developers", "Intended Audience :: Information Technology", + "License :: Other/Proprietary License", "Operating System :: Microsoft :: Windows", "Operating System :: POSIX", "Programming Language :: Python", @@ -59,6 +59,7 @@ where = ["src"] [tool.setuptools.package-data] mxgateway = ["py.typed"] +mxgateway_cli = ["py.typed"] [tool.pytest.ini_options] addopts = "-ra" diff --git a/clients/python/src/mxgateway_cli/py.typed b/clients/python/src/mxgateway_cli/py.typed new file mode 100644 index 0000000..e69de29 diff --git a/clients/python/tests/test_packaging.py b/clients/python/tests/test_packaging.py new file mode 100644 index 0000000..0bb06f6 --- /dev/null +++ b/clients/python/tests/test_packaging.py @@ -0,0 +1,55 @@ +"""Packaging smoke test. + +Guards against ``pyproject.toml`` regressions (see Client.Python-018) that +break ``pip wheel`` / ``pip install -e`` while leaving the in-tree +``pytest`` suite green via ``[tool.pytest.ini_options] pythonpath = ["src"]``. 
+ +The test invokes ``python -m pip wheel . --no-deps`` against the package +root and asserts a wheel file is produced. Any future PEP 639 / SPDX +violation (or any other ``setuptools.build_meta`` configuration error) +will be caught here at test time rather than at first install on a clean +machine. +""" + +from __future__ import annotations + +import pathlib +import subprocess +import sys + +_PACKAGE_ROOT = pathlib.Path(__file__).resolve().parent.parent + + +def test_pip_wheel_build_succeeds(tmp_path: pathlib.Path) -> None: + """``pip wheel .`` against the package root produces a wheel. + + This exercises ``setuptools.build_meta`` end-to-end — the same path + used by ``pip install -e .`` — and would have caught + Client.Python-018 at commit time. + """ + + result = subprocess.run( + [ + sys.executable, + "-m", + "pip", + "wheel", + ".", + "--no-deps", + "--wheel-dir", + str(tmp_path), + ], + cwd=str(_PACKAGE_ROOT), + capture_output=True, + text=True, + ) + assert result.returncode == 0, ( + f"pip wheel failed (exit {result.returncode}):\n" + f"--- stdout ---\n{result.stdout}\n" + f"--- stderr ---\n{result.stderr}" + ) + wheels = list(tmp_path.glob("mxaccess_gateway_client-*.whl")) + assert wheels, ( + "expected a mxaccess_gateway_client wheel in " + f"{tmp_path}; got {list(tmp_path.iterdir())}" + ) diff --git a/clients/rust/RustClientDesign.md b/clients/rust/RustClientDesign.md index bdb2fa5..f23803d 100644 --- a/clients/rust/RustClientDesign.md +++ b/clients/rust/RustClientDesign.md @@ -93,23 +93,38 @@ impl Session { pub async fn subscribe_bulk(&self, server_handle: i32, tag_addresses: Vec) -> Result, Error>; pub async fn unsubscribe_bulk(&self, server_handle: i32, item_handles: Vec) -> Result, Error>; pub async fn write(&self, server_handle: i32, item_handle: i32, value: MxValue, user_id: i32) -> Result<(), Error>; - pub async fn write_bulk(&self, server_handle: i32, entries: Vec, user_id: i32) -> Result, Error>; - pub async fn write2_bulk(&self, server_handle: 
i32, entries: Vec, timestamp: prost_types::Timestamp, user_id: i32) -> Result, Error>; - pub async fn write_secured_bulk(&self, server_handle: i32, entries: Vec, current_user_id: i32, verifier_user_id: i32) -> Result, Error>; - pub async fn write_secured2_bulk(&self, server_handle: i32, entries: Vec, timestamp: prost_types::Timestamp, current_user_id: i32, verifier_user_id: i32) -> Result, Error>; - pub async fn read_bulk(&self, server_handle: i32, tags: &[String], timeout_ms: u32) -> Result, Error>; + pub async fn write_bulk(&self, server_handle: i32, entries: Vec) -> Result, Error>; + pub async fn write2_bulk(&self, server_handle: i32, entries: Vec) -> Result, Error>; + pub async fn write_secured_bulk(&self, server_handle: i32, entries: Vec) -> Result, Error>; + pub async fn write_secured2_bulk(&self, server_handle: i32, entries: Vec) -> Result, Error>; + pub async fn read_bulk>(&self, server_handle: i32, tag_addresses: &[S], timeout_ms: u32) -> Result, Error>; pub async fn events(&self) -> Result>, Error>; pub async fn close(&self) -> Result<(), Error>; } ``` -The five bulk-write helpers (`write_bulk`, `write2_bulk`, `write_secured_bulk`, +The four bulk-write helpers (`write_bulk`, `write2_bulk`, `write_secured_bulk`, `write_secured2_bulk`) and `read_bulk` mirror the worker's bulk command shapes in `mxaccess_gateway.proto` and use the same correlation-id discipline as the -unary helpers — `session::next_correlation_id` is `pub` so that consumers -constructing raw `MxCommandRequest`/`CloseSessionRequest` payloads outside -the `Session` helpers (notably the `mxgw` test CLI's `ping` and -`close-session` subcommands) share the same id generation. 
+unary helpers — `next_correlation_id` is part of the public SDK surface, +re-exported at the crate root (`mxgateway_client::next_correlation_id`), so +that consumers constructing raw `MxCommandRequest`/`CloseSessionRequest` +payloads outside the `Session` helpers (notably the `mxgw` test CLI's `ping` +and `close-session` subcommands) share the same id generation. The returned +id is documented as an opaque token with three guaranteed properties +(embeds the caller's label, unique within a process, carries no secret); +its textual format is intentionally *not* part of the contract. + +The per-entry fields that the matching MXAccess COM calls accept once per +batch — `user_id` (`WriteBulkEntry`/`Write2BulkEntry`), `timestamp_value` +(`Write2BulkEntry`/`WriteSecured2BulkEntry`), and `current_user_id` / +`verifier_user_id` (`WriteSecuredBulkEntry`/`WriteSecured2BulkEntry`) — live +on the entry structs themselves rather than as trailing positional arguments +on the helper, matching the protobuf shapes in +`mxaccess_gateway.proto` (`WriteBulkCommand` / `Write2BulkCommand` / +`WriteSecuredBulkCommand` / `WriteSecured2BulkCommand`). `read_bulk` is +generic over `AsRef` so callers can pass `&[String]` or `&[&str]` +without cloning at the call site. 
## Authentication diff --git a/clients/rust/crates/mxgw-cli/src/main.rs b/clients/rust/crates/mxgw-cli/src/main.rs index d6c6d80..06e3126 100644 --- a/clients/rust/crates/mxgw-cli/src/main.rs +++ b/clients/rust/crates/mxgw-cli/src/main.rs @@ -447,9 +447,7 @@ async fn run(cli: Cli) -> Result<(), Error> { let client = connect(connection).await?; let reply = client .invoke(MxCommandRequest { - client_correlation_id: mxgateway_client::session::next_correlation_id( - "cli-ping", - ), + client_correlation_id: mxgateway_client::next_correlation_id("cli-ping"), command: Some(MxCommand { kind: MxCommandKind::Ping as i32, payload: Some(mxgateway_client::generated::mxaccess_gateway::v1::mx_command::Payload::Ping( @@ -496,7 +494,7 @@ async fn run(cli: Cli) -> Result<(), Error> { let reply = client .close_session_raw(CloseSessionRequest { session_id, - client_correlation_id: mxgateway_client::session::next_correlation_id( + client_correlation_id: mxgateway_client::next_correlation_id( "cli-close-session", ), }) @@ -1088,16 +1086,17 @@ async fn run_bench_read_bulk( /// Per-iteration accounting for `bench-read-bulk`. /// -/// Only successful `read_bulk` calls contribute to the success-latency -/// histogram (`success_latencies_ms`). Failures are tracked separately in -/// `failure_latencies_ms` and the first failure's redacted error string is -/// stashed in `first_failure` so a partial-failure run is visible in the -/// emitted JSON. This keeps the cross-language `latencyMs.p99`/`max` -/// contract honest: it reports successful-call latency only and never -/// folds in a per-call timeout from a failed RPC. +/// Every `read_bulk` call's elapsed time contributes to the all-calls +/// histogram (`latencies_ms`), matching the .NET/Go/Python/Java bench +/// implementations whose `latencyMs` field is the cross-language comparison +/// contract collated by `scripts/bench-read-bulk.ps1`. 
Failures additionally +/// land in `failure_latencies_ms` and the first failure's redacted error +/// string is stashed in `first_failure`, both surfaced through the JSON as +/// Rust-only enrichment so a partial-failure run is still visible at the +/// report layer without breaking the side-by-side comparison. #[derive(Default)] struct BenchReadBulkStats { - success_latencies_ms: Vec, + latencies_ms: Vec, failure_latencies_ms: Vec, total_read_results: u64, cached_read_results: u64, @@ -1112,7 +1111,7 @@ impl BenchReadBulkStats { elapsed_ms: f64, results: &[mxgateway_client::generated::mxaccess_gateway::v1::BulkReadResult], ) { - self.success_latencies_ms.push(elapsed_ms); + self.latencies_ms.push(elapsed_ms); self.successful_calls += 1; for result in results { self.total_read_results += 1; @@ -1123,6 +1122,7 @@ impl BenchReadBulkStats { } fn record_failure(&mut self, elapsed_ms: f64, error: &Error) { + self.latencies_ms.push(elapsed_ms); self.failure_latencies_ms.push(elapsed_ms); self.failed_calls += 1; if self.first_failure.is_none() { @@ -1145,7 +1145,7 @@ impl BenchReadBulkStats { fn to_json(&self, context: &BenchReadBulkContext<'_>) -> serde_json::Value { let calls_per_second = self.calls_per_second(context.steady_elapsed); - let success_summary = percentile_summary(&self.success_latencies_ms); + let latency_summary = percentile_summary(&self.latencies_ms); let failure_summary = percentile_summary(&self.failure_latencies_ms); serde_json::json!({ "language": "rust", @@ -1163,7 +1163,7 @@ impl BenchReadBulkStats { "totalReadResults": self.total_read_results, "cachedReadResults": self.cached_read_results, "callsPerSecond": round_to(calls_per_second, 2), - "latencyMs": success_summary, + "latencyMs": latency_summary, "failureLatencyMs": failure_summary, "firstFailure": self.first_failure, }) @@ -1737,7 +1737,7 @@ mod tests { } #[test] - fn bench_read_bulk_stats_keeps_failures_out_of_success_latency_histogram() { + fn 
bench_read_bulk_stats_tracks_all_calls_in_latency_and_failures_separately() { use mxgateway_client::generated::mxaccess_gateway::v1::BulkReadResult; use mxgateway_client::Error; @@ -1753,8 +1753,10 @@ mod tests { ..BulkReadResult::default() }; - // Two fast successes and one slow failure: the slow failure must - // not pollute the success p99/max histogram. + // Two fast successes and one slow failure: every call lands in the + // all-calls histogram (the cross-language `latencyMs` contract) and + // the failure additionally surfaces through `failureLatencyMs` plus + // `firstFailure` as Rust-only enrichment. stats.record_success(1.5, std::slice::from_ref(&cached)); stats.record_success(2.0, std::slice::from_ref(&uncached)); let failure = Error::MalformedReply { @@ -1762,7 +1764,7 @@ mod tests { }; stats.record_failure(1_500.0, &failure); - assert_eq!(stats.success_latencies_ms, vec![1.5, 2.0]); + assert_eq!(stats.latencies_ms, vec![1.5, 2.0, 1_500.0]); assert_eq!(stats.failure_latencies_ms, vec![1_500.0]); assert_eq!(stats.successful_calls, 2); assert_eq!(stats.failed_calls, 1); @@ -1786,10 +1788,12 @@ mod tests { tags: &[], }; let payload = stats.to_json(&context); - // The success-latency histogram must never see the 1_500 ms failure. - assert_eq!(payload["latencyMs"]["max"].as_f64().unwrap(), 2.0); - assert!(payload["latencyMs"]["p99"].as_f64().unwrap() <= 2.0); - // The failure-latency histogram must own it instead. + // The all-calls histogram (cross-language `latencyMs` contract) + // includes the failure latency so the side-by-side comparison with + // .NET/Go/Python/Java stays apples-to-apples. + assert_eq!(payload["latencyMs"]["max"].as_f64().unwrap(), 1_500.0); + // The Rust-only `failureLatencyMs` enrichment surfaces failures + // separately for partial-failure diagnostics. 
assert_eq!( payload["failureLatencyMs"]["max"].as_f64().unwrap(), 1_500.0 diff --git a/clients/rust/src/lib.rs b/clients/rust/src/lib.rs index 9d3c9b7..b1c425b 100644 --- a/clients/rust/src/lib.rs +++ b/clients/rust/src/lib.rs @@ -32,7 +32,7 @@ pub use galaxy::{DeployEventStream, GalaxyClient}; #[doc(inline)] pub use options::ClientOptions; #[doc(inline)] -pub use session::Session; +pub use session::{next_correlation_id, Session}; #[doc(inline)] pub use value::{MxArrayProjection, MxArrayValue, MxStatus, MxValue, MxValueProjection}; #[doc(inline)] diff --git a/clients/rust/src/session.rs b/clients/rust/src/session.rs index d877e2c..2f9ea33 100644 --- a/clients/rust/src/session.rs +++ b/clients/rust/src/session.rs @@ -37,8 +37,20 @@ static CORRELATION_SEQUENCE: AtomicU64 = AtomicU64::new(0); /// Exposed so consumers that construct raw [`MxCommandRequest`] / /// [`CloseSessionRequest`] payloads outside the `Session` helpers — notably /// the `mxgw` test CLI — share the same correlation-id discipline as the -/// library. The returned id is `rust-client-{label}-{N}` where `N` comes -/// from a process-wide atomic sequence. +/// library. Also re-exported at the crate root as +/// [`mxgateway_client::next_correlation_id`](crate::next_correlation_id). +/// +/// The returned id has the following guaranteed properties: +/// +/// - it embeds the supplied `label` verbatim so log readers can pick out +/// which call site emitted it; +/// - it is unique within the lifetime of a single process (driven by an +/// internal monotonically-increasing atomic sequence); +/// - it carries no embedded secret or user-supplied payload beyond `label`. +/// +/// The exact textual format (currently `rust-client-{label}-{N}`) is *not* +/// part of the public contract and may change between releases — callers +/// must not parse it. Treat the returned `String` as an opaque token. 
#[must_use] pub fn next_correlation_id(label: &str) -> String { let sequence = CORRELATION_SEQUENCE.fetch_add(1, Ordering::Relaxed); diff --git a/code-reviews/Client.Dotnet/findings.md b/code-reviews/Client.Dotnet/findings.md index 6b3e577..e6f7610 100644 --- a/code-reviews/Client.Dotnet/findings.md +++ b/code-reviews/Client.Dotnet/findings.md @@ -5,7 +5,7 @@ | Module | `clients/dotnet` | | Reviewer | Claude Code | | Review date | 2026-05-20 | -| Commit reviewed | `1cd51bb` | +| Commit reviewed | `a020350` | | Status | Reviewed | | Open findings | 0 | @@ -13,16 +13,16 @@ | # | Category | Result | |---|---|---| -| 1 | Correctness & logic bugs | Issue found (this review): the Client.Dotnet-005 fix did not reach the CLI — `BenchReadBulkAsync`, `BenchStreamEventsAsync`, and `SmokeAsync` still fall through to `reply.ReturnValue.Int32Value` for `Register` / `AddItem` handles (Client.Dotnet-010). | -| 2 | mxaccessgw conventions | Good — consumes the shared contracts project, no forked proto, `authorization: Bearer` metadata correct, parity preserved via split `EnsureProtocolSuccess`/`EnsureMxAccessSuccess`. | -| 3 | Concurrency & thread safety | Issues found (this review): `GalaxyRepositoryClient._disposed` is still a plain unsynchronized `bool` (Client.Dotnet-009) — the symmetric fix from Client.Dotnet-003 was applied only to `MxGatewayClient`; the new `bench-stream-events` CLI command races `firstSteadyEventUtc`/`lastSteadyEventUtc` across parallel sessions (Client.Dotnet-011). | -| 4 | Error handling & resilience | No new issues found this review (Client.Dotnet-001 and Client.Dotnet-004 remain resolved). | -| 5 | Security | Good — API key never logged by the library, CLI redacts keys (incl. env-var-sourced), TLS custom-root validation correct, secured-write payloads never logged. | -| 6 | Performance & resource management | No issues found — channels and streaming calls disposed correctly. 
| +| 1 | Correctness & logic bugs | Issue found (this review): the global CLI `--timeout` defaults to 30 s and is used both as the gRPC `DefaultCallTimeout` and as the outer `CancelAfter` budget — but `bench-read-bulk` / `bench-stream-events` default to `--duration-seconds=30 --warmup-seconds=3 (+ stagger)`, so direct manual invocation cancels the bench mid-window before the steady-state ends (Client.Dotnet-015). The `scripts/bench-read-bulk.ps1` driver works around this by raising `--timeout`, but `bench-stream-events` has no driver script. | +| 2 | mxaccessgw conventions | Good — consumes the shared contracts project, no forked proto, `authorization: Bearer` metadata correct, parity preserved via split `EnsureProtocolSuccess`/`EnsureMxAccessSuccess`. The new `clients/dotnet/Directory.Build.props` mirrors `src/Directory.Build.props` exactly (same six properties, identical values) so the enforcement floor is back in scope. | +| 3 | Concurrency & thread safety | Issue found (this review): `BenchStreamEventsAsync`'s per-session `RunStreamAsync` hands the inner `Task.Run` stream loop a reference (`streamTask`) that becomes unobserved whenever the outer `cancellationToken` cancels during the bench's `await Task.Delay` — the `await streamTask` recovery path never runs, so any inner OCE / `RpcException` raised after cancellation surfaces as a `TaskScheduler.UnobservedTaskException` (Client.Dotnet-016). The Client.Dotnet-009 / 011 fixes from the previous pass are correctly applied. | +| 4 | Error handling & resilience | No new issues found this review (Client.Dotnet-001 and Client.Dotnet-004 remain resolved; `RpcExceptionMapper` is consistently called from both gateway and Galaxy transports incl. `AcknowledgeAlarmAsync` after Client.Dotnet-014). 
| +| 5 | Security | Good — API key never logged by the library, CLI redacts effective key (both `--api-key` and `--api-key-env` sourced) after Client.Dotnet-008, TLS custom-root validation correct, secured-write payloads never logged. | +| 6 | Performance & resource management | No issues found — channels and streaming calls disposed correctly, retry pipeline shares one timeout budget per safe-unary op. | | 7 | Design-document adherence | No issues found — matches `DotnetClientDesign.md` and `ClientLibrariesDesign.md`. | -| 8 | Code organization & conventions | Issues found (this review): the .NET client projects do not inherit `src/Directory.Build.props` so `TreatWarningsAsErrors` / `EnforceCodeStyleInBuild` / `AnalysisLevel=latest` are silently absent (Client.Dotnet-012); `DiscoverHierarchyOptions` and the `DiscoverHierarchyAsync(DiscoverHierarchyOptions, …)` overload have no XML docs (Client.Dotnet-013). | -| 9 | Testing coverage | Issue found (this review): the SDK-level alarm tests pin the fake-transport raw-`RpcException` shape but never exercise the production gRPC-to-native mapping (`GrpcMxGatewayClientTransport.AcknowledgeAlarmAsync`) — the same gap Client.Dotnet-002 closed for `Invoke`, still open for alarms (Client.Dotnet-014). | -| 10 | Documentation & comments | No new issues this review. | +| 8 | Code organization & conventions | No new issues — Client.Dotnet-012 (Directory.Build.props) and Client.Dotnet-013 (missing XML docs on `DiscoverHierarchyOptions`, the second `DiscoverHierarchyAsync` overload, and `IMxGatewayCliClient`) are both fully resolved; the new props file is a faithful mirror of the production one. | +| 9 | Testing coverage | No new issues — Client.Dotnet-014 closed the alarm-side `Translate` gap. The new bench paths (`bench-read-bulk`, `bench-stream-events`) have no unit-test coverage, but they are stress harnesses driven by `scripts/bench-read-bulk.ps1`, not SDK API surface, so this is not flagged. 
| +| 10 | Documentation & comments | No new issues this review (Client.Dotnet-007's alarm-ack `admin`-scope correction holds; `DefaultCallTimeout` doc accurately reflects the shared-budget semantics from Client.Dotnet-004). | ## Findings @@ -251,3 +251,49 @@ This is the same convention-violation shape Client.Dotnet-006 closed; CLAUDE.md **Recommendation:** Either route `FakeGatewayTransport.AcknowledgeAlarmAsync` through the same `Translate` helper the other RPCs use and add a regression test that enables `MapTransportExceptions = true` and asserts `MxGatewayAuthenticationException`; or rename the existing test to make the pass-through shape explicit (e.g. `…_SurfacesRpcExceptionFromFakeTransportVerbatim`) and add a second test exercising the production mapping. Either fix closes the alarm-side equivalent of the gap Client.Dotnet-002 closed for `Invoke`. **Resolution:** 2026-05-20 — Applied both halves of the recommendation. Routed `FakeGatewayTransport.AcknowledgeAlarmAsync` through the same `Translate` helper the other RPCs use, so when `MapTransportExceptions = true` thrown `RpcException`s now run through the production `RpcExceptionMapper.Map`. Renamed the existing pass-through test to `AcknowledgeAlarmAsync_SurfacesRpcExceptionFromFakeTransportVerbatim_WhenMappingDisabled` (with an updated comment pinning that this shape only applies when mapping is off), and added a new test `AcknowledgeAlarmAsync_MapsUnauthenticated_RpcException_ToTypedException` that enables mapping and asserts the production-parity `MxGatewayAuthenticationException` with `StatusCode.Unauthenticated`. Closes the alarm-side equivalent of the gap Client.Dotnet-002 closed for `Invoke`. 
+ +### Client.Dotnet-015 + +| Field | Value | +|---|---| +| Severity | Low | +| Category | Correctness & logic bugs | +| Location | `clients/dotnet/MxGateway.Client.Cli/MxGatewayClientCli.cs:221-236`, `clients/dotnet/MxGateway.Client.Cli/MxGatewayClientCli.cs:596-1065` | +| Status | Resolved | + +**Description:** `CreateCancellation(arguments, command)` calls `cancellation.CancelAfter(timeout)` for every command except the explicitly long-running `galaxy-watch`; the `timeout` it passes is `arguments.GetDuration("timeout", TimeSpan.FromSeconds(30))`. That same `--timeout` value is also fed into `CreateOptions` as `DefaultCallTimeout`, so the CLI uses one knob for two distinct things: per-call gRPC deadline and overall wall-clock cancellation budget. Both `bench-read-bulk` and `bench-stream-events` (introduced in `7db4bff` and `1cd51bb`) default to `--duration-seconds=30 --warmup-seconds=3`, which already exceeds the 30 s wall-clock budget; `bench-stream-events --session-count=N` adds another `750 ms × (N-1)` of `sessionStartStaggerMs` before the measurement window even opens. + +A manual invocation such as `dotnet run --project clients/dotnet/MxGateway.Client.Cli -- bench-stream-events --endpoint ... --api-key ...` therefore cancels mid-window every time: the outer `CancellationTokenSource` trips at 30 s and the bench's inner `await Task.Delay(steadyEnd - warmupStart, cancellationToken)` throws an `OperationCanceledException` before `firstSteadyEventUtc`/`lastSteadyEventUtc` are even populated, producing a zero `steadyElapsedSeconds` / `0 eventsPerSecond` JSON payload that looks like a backend failure but is a self-inflicted CLI cancellation. + +`scripts/bench-read-bulk.ps1` already works around this for `bench-read-bulk` by computing `$callTimeoutSeconds = [Math]::Max(60, $DurationSeconds + $WarmupSeconds + 30)` and passing `--timeout ${callTimeoutSeconds}s` (line 125), so the driver flow is correct.
But there is no PowerShell wrapper for `bench-stream-events`, and the bench is documented (in its own XML summary on line 792) as a single-client harness intended to be run directly. The trap is silent: no error is printed, just suspiciously-small numbers. + +**Recommendation:** Either (a) extend the `isLongRunning` set in `CreateCancellation` to include `bench-read-bulk` and `bench-stream-events`, so manual invocation defers to caller-supplied `--timeout` and otherwise runs until the bench finishes; (b) compute an automatic minimum-floor `--timeout` for the bench commands from `duration-seconds + warmup-seconds + headroom` the way the PS driver does; or (c) split the `--timeout` knob into a distinct per-call `--call-timeout` and outer `--wall-clock-timeout` and document the two roles. Option (a) is the smallest change and matches the existing `galaxy-watch` precedent. Add a CLI test that runs `bench-read-bulk` with `--duration-seconds=2 --warmup-seconds=0 --timeout=1s` and asserts the bench either errors loudly or completes (today it silently emits zeros). + +**Resolution:** 2026-05-20 — Applied option (a): extended the `isLongRunning` set in `CreateCancellation` from `command is "galaxy-watch"` to `command is "galaxy-watch" or "bench-read-bulk" or "bench-stream-events"`, so the two bench commands now run until they finish (or Ctrl+C) by default and only apply a wall-clock budget when the caller explicitly supplies `--timeout`. A caller-supplied `--timeout` still flows through to `DefaultCallTimeout` for per-attempt gRPC deadlines on the unary calls these benches make. Matches the existing `galaxy-watch` precedent and removes the silent zero-throughput failure mode without breaking the `scripts/bench-read-bulk.ps1` driver path (which explicitly raises `--timeout`). 
+ +### Client.Dotnet-016 + +| Field | Value | +|---|---| +| Severity | Low | +| Category | Concurrency & thread safety | +| Location | `clients/dotnet/MxGateway.Client.Cli/MxGatewayClientCli.cs:922-976` | +| Status | Resolved | + +**Description:** `BenchStreamEventsAsync.RunStreamAsync` launches the per-session stream reader inside a `Task.Run(async () => { ... }, streamCts.Token)` and stores the returned task in the local `streamTask`. The recovery block + +```csharp +await Task.Delay(steadyEnd - warmupStart, cancellationToken).ConfigureAwait(false); +streamCts.Cancel(); +try { await streamTask.ConfigureAwait(false); } +catch (OperationCanceledException) { } +catch (Grpc.Core.RpcException ex) when (ex.StatusCode is Grpc.Core.StatusCode.Cancelled) { } +``` + +only awaits `streamTask` (and therefore only observes its exception) when `Task.Delay` returns normally. When the outer `cancellationToken` cancels during the delay — exactly the case Client.Dotnet-015 makes likely — `Task.Delay` throws `OperationCanceledException` and skips both `streamCts.Cancel()` and the `await streamTask`. The inner stream task is still alive at that point. The `using CancellationTokenSource streamCts = ...` on line 924 disposes the linked CTS, which propagates cancellation to the inner stream (so it eventually exits), but the resulting `OperationCanceledException` / mapped `MxGatewayException` is never observed. The local `streamTask` reference is dropped as `RunStreamAsync` unwinds, leaving the task object eligible for garbage collection with an unobserved fault — a `TaskScheduler.UnobservedTaskException`. + +The secondary `Grpc.Core.RpcException` catch on line 975 is also dead in this code path: the production `GrpcMxGatewayClientTransport.StreamEventsAsync` always wraps `RpcException` via `RpcExceptionMapper.Map`, which returns `OperationCanceledException` for `StatusCode.Cancelled` (mapper line 31). 
So the inner task's cancellation exception is always `OperationCanceledException`, not `RpcException`. Harmless when the recovery block runs, but it underscores that the cancellation path was only tested for the happy case. + +**Recommendation:** Restructure `RunStreamAsync` so the inner `streamTask` is always observed. A `try { await Task.Delay(...) } finally { streamCts.Cancel(); try { await streamTask } catch (OperationCanceledException) {} catch (MxGatewayException) {} }` shape works (the `finally` runs even on outer cancellation). Alternatively, hoist `streamTask` into a local that the outer method's `try`/`finally` always awaits before exiting, so the per-session loop becomes `await Task.WhenAny(streamTask, Task.Delay(...))` then a guaranteed `await streamTask`. Drop the now-redundant `Grpc.Core.RpcException` catch or convert it to catch `MxGatewayException` for the wrapped shape (and document that it should never fire in production). + +**Resolution:** 2026-05-20 — Restructured `RunStreamAsync` to wrap the `Task.Delay` in `try { await Task.Delay(...) } finally { streamCts.Cancel(); try { await streamTask } catch (OperationCanceledException) {} catch (MxGatewayException) {} }`, so the inner stream task is observed on every path — including when the outer `cancellationToken` cancels during the delay. Dropped the dead `catch (Grpc.Core.RpcException ex) when (ex.StatusCode is Grpc.Core.StatusCode.Cancelled)` clause (the production `GrpcMxGatewayClientTransport.StreamEventsAsync` routes through `RpcExceptionMapper.Map`, which returns `OperationCanceledException` for `StatusCode.Cancelled`, so an `RpcException` never reaches here) and replaced it with `catch (MxGatewayException)` to absorb the wrapped shape for any non-cancellation mapper output. Added an inline comment naming the finding and documenting why the new catch shape is correct. Eliminates the latent `TaskScheduler.UnobservedTaskException` whenever the outer cancellation fires mid-measurement-window. 
diff --git a/code-reviews/Client.Go/findings.md b/code-reviews/Client.Go/findings.md index af91487..7de106c 100644 --- a/code-reviews/Client.Go/findings.md +++ b/code-reviews/Client.Go/findings.md @@ -5,24 +5,27 @@ | Module | `clients/go` | | Reviewer | Claude Code | | Review date | 2026-05-20 | -| Commit reviewed | `1cd51bb` | +| Commit reviewed | `a020350` | | Status | Reviewed | | Open findings | 0 | ## Checklist coverage +A re-review of commit `a020350` (which resolved Client.Go-011..016). `gofmt -l .`, +`go vet ./...`, `go build ./...`, and `go test ./... -count=1` are all clean. + | # | Category | Result | |---|---|---| -| 1 | Correctness & logic bugs | Re-review: previous Client.Go-001/003/007 remain resolved. New issue: a dead/no-op test condition in `alarms_test.go` (Client.Go-011). | -| 2 | mxaccessgw conventions | `gofmt -l ./...` and `go vet ./...` are clean. No new issues. | -| 3 | Concurrency & thread safety | New issue: `runGalaxyWatch` limit-reached path returns without waiting for the WatchDeployEvents goroutine to drain (Client.Go-013). | -| 4 | Error handling & resilience | New issue: direct `err == io.EOF` comparisons should use `errors.Is` for chain robustness (Client.Go-014). | -| 5 | Security | No issues found — TLS-by-default with TLS 1.2 floor, API key redaction in CLI JSON, no secret logging. | -| 6 | Performance & resource management | No issues found — `defer client.Close()` / `defer subscription.Close()` consistently applied across CLI and library; bench-read-bulk preallocates latency slice. | -| 7 | Design-document adherence | No new issues. The lazy `grpc.NewClient` + readiness probe migration (Client.Go-005) was applied uniformly to `Dial` and `DialGalaxy`. | -| 8 | Code organization & conventions | New issue: `runWriteBulkVariant`'s `secured` parameter is computed but unused (Client.Go-015). | -| 9 | Testing coverage | Coverage holes from prior review now filled (Client.Go-008). 
`fakeGalaxyServer.watchSendInterval` is declared but never set — minor test cruft (Client.Go-016). | -| 10 | Documentation & comments | New issue: the CLI `writeUsage` line is missing the six bulk and bench subcommands now wired into `run` (Client.Go-012). | +| 1 | Correctness & logic bugs | Prior Client.Go-001/003/007/011 remain resolved. No new correctness bugs found. | +| 2 | mxaccessgw conventions | `gofmt -l .` and `go vet ./...` clean; Client.Go-004 stays resolved. No new issues. | +| 3 | Concurrency & thread safety | Client.Go-013 resolved. New issue: `runBenchReadBulk`'s warm-up + steady-state wall-clock loops ignore `ctx` cancellation, so a Ctrl+C or parent-cancel keeps spinning ReadBulk calls until the wall-clock deadline (Client.Go-018). | +| 4 | Error handling & resilience | Client.Go-014 resolved. New issue: `parseValue` returns bare `strconv` errors with no `%w` wrap and no CLI-context, so a typo like `-type int32 -value foo` surfaces as `strconv.ParseInt: parsing "foo": invalid syntax` without naming the flag — out of line with the GoStyleGuide "wrap errors with useful context using `%w`" rule (Client.Go-017). | +| 5 | Security | No issues found — TLS-by-default with TLS 1.2 floor, API-key redaction in CLI JSON output, no secret logging. | +| 6 | Performance & resource management | No issues found — `defer client.Close()` / `defer subscription.Close()` applied consistently; bench-read-bulk preallocates the latency slice. | +| 7 | Design-document adherence | No new issues. Lazy `grpc.NewClient` + readiness probe (Client.Go-005) and the shared `dial` helper (Client.Go-009) are applied uniformly across `Dial` and `DialGalaxy`. | +| 8 | Code organization & conventions | Client.Go-015 resolved. New issue: `runStreamEvents` does not install a signal handler (Ctrl+C kills the process abruptly), while `runGalaxyWatch` does — the two long-running stream commands have divergent shutdown UX (Client.Go-020). 
| +| 9 | Testing coverage | Client.Go-008/016 resolved. New issue: the six new bulk and bench subcommands (`read-bulk`, `write-bulk`, `write2-bulk`, `write-secured-bulk`, `write-secured2-bulk`, `bench-read-bulk`) have no CLI-level unit tests — in particular the Client.Go-015 secured-flag-gating fix has no regression test (Client.Go-021). | +| 10 | Documentation & comments | Client.Go-010/012 resolved. New issue: `runGalaxyWatch` parses `-last-seen-deploy-time` with `time.RFC3339` (no fractional seconds), while `parseRfc3339Timestamp` for `-timestamp-value` accepts `time.RFC3339Nano` — the CLI advertises "RFC 3339" for both but quietly differs on sub-second support (Client.Go-019). | ## Findings @@ -278,3 +281,130 @@ gRPC's generated `Recv()` does return the `io.EOF` sentinel directly today, so t **Recommendation:** Either delete the unused `watchSendInterval` field and its branch in `WatchDeployEvents`, or add the test it was added for — e.g. one that pumps more than 16 events with a small interval and asserts the consumer keeps up without losing or reordering events. Linking the field to a `// for TestX` comment if it stays would also help. **Resolution:** 2026-05-20 — Removed the unused `watchSendInterval` field from `fakeGalaxyServer` and the corresponding `if s.watchSendInterval > 0 { ... }` branch in `WatchDeployEvents`; no test set the field, so the dead code path is gone and the fake is leaner. `gofmt -w` reflowed the struct to drop the no-longer-needed field-name padding. + +### Client.Go-017 + +| Field | Value | +|---|---| +| Severity | Low | +| Category | Error handling & resilience | +| Location | `clients/go/cmd/mxgw-go/main.go:954-991` | +| Status | Resolved | + +**Description:** `parseValue` returns the raw `strconv.ParseBool` / `strconv.ParseInt` / `strconv.ParseFloat` error verbatim — no wrap with `%w` and no indication of which CLI flag was the source. 
A user running `mxgw-go write -type int32 -value foo` sees + +``` +strconv.ParseInt: parsing "foo": invalid syntax +``` + +with no mention of `-value`, `-type`, or which subcommand failed. The same pattern hits every typed branch (bool, int32, int64, float, double). Compare with the sibling helpers in the same file: `parseInt32List` wraps with `"invalid item handle %q: %w"` (Client.Go-003 resolution) and `parseRfc3339Timestamp` wraps with `"invalid RFC 3339 timestamp %q: %w"`. `parseValue` was missed and is inconsistent with those two. The GoStyleGuide (`docs/style-guides/GoStyleGuide.md`, "Errors" section) requires "Wrap errors with useful context using `%w`." + +**Recommendation:** Wrap each `strconv` error with the offending input and type, e.g. `return nil, fmt.Errorf("invalid %s value %q: %w", valueType, valueText, err)`. The wrapper handles all five typed branches uniformly without a per-branch change. + +**Resolution:** 2026-05-20 — Each typed branch of `parseValue` now wraps the bare `strconv` error with `%w` and names the offending flag and value (`"invalid -value for -type %s: %q: %w"`), so `mxgw-go write -type int32 -value foo` surfaces the source flag, the requested type, and the bad token while still letting `errors.Is/As` reach the underlying `strconv` sentinel. The new `TestParseValueWrapsStrconvErrorWithFlagContext` table-test pins all five typed branches (bool, int32, int64, float, double) to the new wrapper shape. + +### Client.Go-018 + +| Field | Value | +|---|---| +| Severity | Low | +| Category | Concurrency & thread safety | +| Location | `clients/go/cmd/mxgw-go/main.go:593-623` | +| Status | Resolved | + +**Description:** `runBenchReadBulk`'s warm-up and steady-state loops are wall-clock-only: + +```go +for time.Now().Before(warmupDeadline) { + _, _ = session.ReadBulk(ctx, serverHandle, tags, timeout) +} +... 
+for time.Now().Before(steadyDeadline) { + callStart := time.Now() + results, err := session.ReadBulk(ctx, serverHandle, tags, timeout) + ... +} +``` + +Neither loop checks `ctx.Done()` / `ctx.Err()`. If the parent context is cancelled (e.g. the operator Ctrl+Cs the benchmark, or the cross-language bench driver `scripts/bench-read-bulk.ps1` times out and kills the child early), the loops keep iterating until their wall-clock deadlines elapse. Each `ReadBulk` call inside fails fast (the gRPC call inherits the cancelled context and returns `context.Canceled`), but the steady-state loop counts those as `failedCalls++` and keeps spinning — wasting CPU and inflating the `failedCalls` and `latencyMs.max` figures the PowerShell driver collates across all five clients. The .NET, Rust, Python, and Java bench drivers should be checked for the same shape, but the Go one is the only one being reviewed here. Note that `runBenchReadBulk` does NOT register its own signal handler (unlike `runGalaxyWatch`, which does via `signal.NotifyContext`), so it relies entirely on the caller's `ctx` for cancellation. + +**Recommendation:** Drop out of both loops as soon as `ctx.Err() != nil`. Concretely, change the loop conditions to `for time.Now().Before(warmupDeadline) && ctx.Err() == nil` (and the same on `steadyDeadline`), or use a `select { case <-ctx.Done(): break loop; default: }` guard at the top of each iteration. The cross-language bench shape (`durationMs`, `totalCalls`, `failedCalls`, `latencyMs`) stays the same — the bench just exits sooner and reports the truncated window faithfully. + +**Resolution:** 2026-05-20 — Both the warm-up and steady-state loops in `runBenchReadBulk` now carry an `&& ctx.Err() == nil` guard alongside the wall-clock check, so a cancelled parent context (Ctrl+C, or the cross-language bench driver killing the child early) breaks the loop instead of spinning failing `ReadBulk` calls until the deadline elapses.
The cross-language bench JSON shape is unchanged — the truncated window is just reported faithfully via `durationMs` / `totalCalls`. + +### Client.Go-019 + +| Field | Value | +|---|---| +| Severity | Low | +| Category | Documentation & comments | +| Location | `clients/go/cmd/mxgw-go/main.go:710-716`, `clients/go/cmd/mxgw-go/main.go:1204,1213` | +| Status | Resolved | + +**Description:** The CLI advertises two timestamp flags as "RFC3339" but parses them with different layouts: + +- `-timestamp-value` (write2/write-secured2 bulk): `parseRfc3339Timestamp` uses `time.RFC3339Nano`, which accepts both `2026-04-28T10:00:00Z` and `2026-04-28T10:00:00.123456789Z`. +- `-last-seen-deploy-time` (galaxy-watch): `time.Parse(time.RFC3339, ...)`, which rejects fractional seconds. + +A user copy-pasting an `ObservedAt` timestamp from `galaxy-watch -json` (which is emitted as `RFC3339Nano` by `formatDeployEvent`) directly into `-last-seen-deploy-time` will get a parse error if the source value carried a fractional component, even though both flag descriptions say "RFC3339". The flag help string at `main.go:1204` literally says "RFC3339 timestamp", and the README example uses `2026-04-28T10:00:00Z` (whole seconds only), so the issue is silent until a fractional timestamp comes from the gateway. + +**Recommendation:** Switch the `galaxy-watch` parse to `time.RFC3339Nano` to match `parseRfc3339Timestamp` (and the gateway's own emit format). One line change at `main.go:1213`. While there, update the flag help string and the README example to say "RFC 3339 (with optional fractional seconds)" so the two flags are documented uniformly. + +**Resolution:** 2026-05-20 — `runGalaxyWatch` now parses `-last-seen-deploy-time` with `time.RFC3339Nano`, matching `parseRfc3339Timestamp` and the gateway's own `formatDeployEvent` emit format; the layout is strictly broader than the previous `time.RFC3339` (whole-second values still parse). 
The flag help string changed to "RFC 3339 timestamp (with optional fractional seconds)" and the `clients/go/README.md` example was extended with an explicit fractional-seconds line so the two flags advertise the same surface. + +### Client.Go-020 + +| Field | Value | +|---|---| +| Severity | Low | +| Category | Code organization & conventions | +| Location | `clients/go/cmd/mxgw-go/main.go:753-802`, `clients/go/cmd/mxgw-go/main.go:1199-1275` | +| Status | Resolved | + +**Description:** The two long-running stream commands have divergent Ctrl+C UX: + +- `runGalaxyWatch` registers a signal handler: + + ```go + signalCtx, stopSignals := signal.NotifyContext(ctx, os.Interrupt, syscall.SIGTERM) + defer stopSignals() + streamCtx, cancelStream := context.WithCancel(signalCtx) + ``` + + so Ctrl+C drains buffered events and returns cleanly. + +- `runStreamEvents` does not register any signal handler — its parent context is `context.Background()` from `runWithIO`, so Ctrl+C abruptly kills the process. The deferred `subscription.Close()` and `client.Close()` never run, leaving the server-side stream to fault out on a torn TCP connection rather than a clean cancel. + +The two commands are otherwise structurally identical (subscribe + loop until limit or external stop) — the inconsistency is one half of a pair that was missed when `galaxy-watch` was added. Worth flagging because it directly affects what an integrator who Ctrl+Cs `stream-events` sees in the gateway's logs (a transport reset rather than a `codes.Canceled`). + +**Recommendation:** Mirror the `runGalaxyWatch` pattern in `runStreamEvents`: wrap `ctx` in `signal.NotifyContext(ctx, os.Interrupt, syscall.SIGTERM)`, derive `streamCtx` from it, and let `defer subscription.Close()` / `defer cancelStream()` tear the stream down on signal. The change is roughly six lines and brings the two stream commands into parity. 
Optionally factor a shared `withSignals(ctx) (context.Context, context.CancelFunc)` helper if a third stream command lands. + +**Resolution:** 2026-05-20 — `runStreamEvents` now installs `signal.NotifyContext(ctx, os.Interrupt, syscall.SIGTERM)` (with a deferred `stopSignals()`) and derives `streamCtx` from the resulting signal-aware context, mirroring `runGalaxyWatch`. Ctrl+C now cancels the gRPC stream cleanly — the gateway sees `codes.Canceled` instead of a torn TCP connection — and the deferred `subscription.Close()` / `client.Close()` actually run on signal. The two long-running stream commands now share the same shutdown UX. + +### Client.Go-021 + +| Field | Value | +|---|---| +| Severity | Low | +| Category | Testing coverage | +| Location | `clients/go/cmd/mxgw-go/main_test.go`, `clients/go/cmd/mxgw-go/main.go:363-520,522-655` | +| Status | Resolved | + +**Description:** The six bulk / bench subcommands wired into `run` (`read-bulk`, `write-bulk`, `write2-bulk`, `write-secured-bulk`, `write-secured2-bulk`, `bench-read-bulk`) have **no CLI-level unit tests** in `main_test.go`. In particular, the Client.Go-015 resolution claims: + +> `-current-user-id` / `-verifier-user-id` are only registered for the secured variants and `-user-id` only for Write/Write2, so a wrong-variant flag now fails with a clean `flag provided but not defined` usage error instead of silently no-op'ing. + +But there is no test asserting that, e.g., `mxgw-go write-bulk -current-user-id 1 ...` returns a "flag provided but not defined" error, or that `mxgw-go write-secured-bulk -user-id 1 ...` does the same. A future refactor of `runWriteBulkVariant` (notably one that re-introduced the `secured` parameter) could silently re-permit the wrong flags without breaking any test. 
The same gap applies to: parameter validation in `runReadBulk` (bulk size, empty session/items rejection), the value-count vs handle-count mismatch error in `runWriteBulkVariant:447`, and `runBenchReadBulk`'s `bulk-size`/`duration-seconds` positivity checks. + +`mxgateway/client_session_test.go` already covers the library-level happy paths (`TestWriteBulkBuildsOneBulkCommandAndReturnsPerEntryResults`, `TestReadBulkForwardsTimeoutAndUnpacksCachedFlag`, `TestSubscribeBulkBuildsOneBulkCommandAndReturnsResults`), so this finding is about CLI surface area only. + +**Recommendation:** Add table-driven tests in `cmd/mxgw-go/main_test.go` along the existing `TestParseInt32List*` and `TestParseValueBuildsTypedValue` style: + +- `TestRunWriteBulkVariantGatesSecuredFlags`: invoke `runWithIO` with `write-bulk -current-user-id 1 ...` and `write-secured-bulk -user-id 1 ...`, assert each returns an error matching `flag provided but not defined`. +- `TestRunReadBulkRejectsMissingArgs`: invoke `runWithIO` with `read-bulk` (no `-session-id`), assert the documented "session-id and items are required" error. +- `TestRunBenchReadBulkRejectsNonPositiveBulkSize` / `TestRunBenchReadBulkRejectsNonPositiveDuration`: pin the positivity checks at `main.go:544-549`. +- `TestRunWriteBulkVariantRejectsMismatchedHandlesAndValues`: pin the `len(handles) != len(valueTexts)` error at `main.go:447`. + +Each is a few lines and routes through the existing `runWithIO` entry point, so it does not need a bufconn fake. 
+ +**Resolution:** 2026-05-20 — Added CLI-level table-driven regression tests in `cmd/mxgw-go/main_test.go` routed through `runWithIO`, so they need no bufconn fake: `TestRunWriteBulkVariantGatesSecuredFlags` pins Client.Go-015 by asserting `write-bulk -current-user-id`, `write-bulk -verifier-user-id`, `write2-bulk -current-user-id`, `write-secured-bulk -user-id`, and `write-secured2-bulk -user-id` all surface `flag provided but not defined`; `TestRunReadBulkRejectsMissingArgs` pins the "session-id and items are required" check across no-flags / missing-items / missing-session-id; `TestRunBenchReadBulkRejectsNonPositiveBulkSize` and `TestRunBenchReadBulkRejectsNonPositiveDuration` pin the positivity checks; `TestRunWriteBulkVariantRejectsMismatchedHandlesAndValues` pins the explicit `item-handles count ... does not match values count ...` error. `go test ./...` passes. diff --git a/code-reviews/Client.Java/findings.md b/code-reviews/Client.Java/findings.md index 03a3ab9..2a5d932 100644 --- a/code-reviews/Client.Java/findings.md +++ b/code-reviews/Client.Java/findings.md @@ -5,28 +5,28 @@ | Module | `clients/java` | | Reviewer | Claude Code | | Review date | 2026-05-20 | -| Commit reviewed | `1cd51bb` | +| Commit reviewed | `a020350` | | Status | Reviewed | | Open findings | 0 | ## Checklist coverage -A second-pass review against commit `1cd51bb`. Client.Java-001 through -Client.Java-012 are unchanged from the prior pass; the table below records the -new findings raised in this pass against the same checklist categories. +A third-pass review against commit `a020350` (the sweep that resolved +Client.Java-013 through Client.Java-020). Prior findings are unchanged; new +findings raised in this pass are numbered Client.Java-021 onward. 
| # | Category | Result | |---|---|---| -| 1 | Correctness & logic bugs | Issues found: CLI `MxEventStream(1024)` capacity contradicts Javadoc/README "16-element buffer" claim (Client.Java-017); CLI `DeployEvent.sequence` printed with `%d` as signed `long` (Client.Java-020). | +| 1 | Correctness & logic bugs | Issues found: `stream-events` CLI text path still prints the proto `uint64 worker_sequence` with `%d` (Client.Java-023), the same bug Client.Java-020 fixed for `galaxy-watch`; `bench-read-bulk` includes failed-call durations in its success-latency histogram (Client.Java-024), mirroring the bug Client.Rust-015 fixed in Rust. | | 2 | mxaccessgw conventions | No new issues found in this pass. | -| 3 | Concurrency & thread safety | Issues found: `MxEventStream.beforeStart` does not honour pre-start `close()` and leaks the gRPC call (Client.Java-014); `MxGatewayChannels.toCompletable` cancellation propagation is broken once the future is wrapped in `thenApply` (Client.Java-015). | -| 4 | Error handling & resilience | Issue found: `MxGatewaySecrets.redactCredentials` only inspects whitespace-delimited tokens, so colon/comma/quote-embedded `mxgw_` credentials leak through (Client.Java-018). | -| 5 | Security | Issue found: same `redactCredentials` leak — see Client.Java-018. | -| 6 | Performance & resource management | Issue found: client `close()` uses the *connect* timeout as its shutdown deadline (Client.Java-019). | +| 3 | Concurrency & thread safety | Issue found: `DeployEventStream` did not receive the deterministic terminal-state serialisation that Client.Java-002 added to `MxEventStream`, so a concurrent queue-overflow + `close()` race can still erase the overflow signal (Client.Java-021). | +| 4 | Error handling & resilience | No new issues found in this pass. | +| 5 | Security | No new issues found in this pass. The Client.Java-018 regex correctly handles colon/comma/quote/paren/URL embeddings and is verified by the existing fixture tests. 
| +| 6 | Performance & resource management | No new issues found in this pass. `shutdownTimeout` is consistently honoured everywhere `ownedChannel.shutdown()` is called — both clients delegate to the shared `MxGatewayChannels.shutdown` / `shutdownAndAwaitTermination` helpers. | | 7 | Design-document adherence | No new issues found in this pass. | -| 8 | Code organization & conventions | Issue found: channel `close()` / `closeAndAwaitTermination()` are still duplicated verbatim across `MxGatewayClient` and `GalaxyRepositoryClient` despite Client.Java-009's stated resolution (Client.Java-016). | -| 9 | Testing coverage | Issue found: CLI `FakeSession` does not implement the five bulk methods added to `MxGatewayCliSession`, so the CLI test module fails to compile against the current source (Client.Java-013). | -| 10 | Documentation & comments | Issue found: docs claim a 16-element event-stream buffer that is actually 1024 in production (Client.Java-017). | +| 8 | Code organization & conventions | Issue found: the CLI `CommonOptions.toClientOptions()` does not propagate `shutdownTimeout` to the underlying `MxGatewayClientOptions`, so CLI users have no way to override the new option introduced by Client.Java-019 (Client.Java-025). | +| 9 | Testing coverage | Issue found: there is no CLI-level test coverage for the `read-bulk`, `write-bulk`, `write2-bulk`, `write-secured-bulk`, `write-secured2-bulk`, or `bench-read-bulk` subcommands — Client.Java-013 noted this as out-of-scope but never filed a follow-up (Client.Java-026). | +| 10 | Documentation & comments | Issue found: `MxGatewayChannels.toCompletable` Javadoc claims chained `thenApply` futures forward `cancel()` upstream to `CancellingCompletableFuture`, which is not true of `CompletableFuture.thenApply`; the implementation works only because all validator chains are inlined into the new `toCompletable(source, operation, validator)` overload (Client.Java-022). 
| ## Findings @@ -329,3 +329,93 @@ new findings raised in this pass against the same checklist categories. **Recommendation:** Print the sequence with `Long.toUnsignedString(event.getSequence())` (or switch the text format to `%s` and pass the unsigned-string conversion). The same rule should apply to any other `uint64` proto fields that surface in CLI text output. **Resolution:** 2026-05-20 — Updated the `galaxy-watch` text-mode `out.printf` in `MxGatewayCli.GalaxyWatchCommand.call()` to use `%s` for the sequence field and pass `Long.toUnsignedString(event.getSequence())`, so deploy sequences past `2^63` render as their correct unsigned decimal string instead of a negative signed long. The JSON path through `protoJson(event)` was already correct (proto `JsonFormat` emits unsigned longs as decimal strings) and was left unchanged. An inline comment near the printf documents the unsigned-uint64 contract so the next person editing the format string knows not to switch back to `%d`. Regression test: `MxGatewayCliTests.deployEventSequenceRendersAsUnsignedForHighUint64` exercises the format string with the max-uint64 bit pattern (`-1L`) and asserts the output contains `seq=18446744073709551615` and does not contain `seq=-1`. + +### Client.Java-021 + +| Field | Value | +|---|---| +| Severity | Medium | +| Category | Concurrency & thread safety | +| Location | `clients/java/mxgateway-client/src/main/java/com/dohertylan/mxgateway/client/DeployEventStream.java:96-135` | +| Status | Resolved | + +**Description:** Client.Java-002 fixed a deterministic terminal-state race in `MxEventStream` by introducing a `terminate(MxGatewayException)` method, a `terminalLock`, and a `terminated` flag so a `close()` arriving after a queue-overflow `offer()` cannot wipe the overflow exception. `DeployEventStream` — added later and structurally a copy of `MxEventStream` — never received the same fix. 
Its current `close()` does `closed.set(true); stream.cancel(...); offer(END);`, and its `offer()` overflow branch does `queue.clear(); queue.offer(new MxGatewayException("...queue overflowed")); queue.offer(END);` (lines 117-135). With these two paths running concurrently, the same sequence Client.Java-002 documented can repeat: the overflow branch enqueues `[overflowException, END]`, `close()` then calls `offer(END)` which sees the queue full and falls into the END branch (`queue.clear(); queue.offer(value);`), wiping the overflow exception and leaving a clean end-of-stream. The CLI `galaxy-watch` (and any `WatchDeployEvents` consumer) loses the overflow signal it was supposed to surface, defeating the fail-fast backpressure contract. The 16-element buffer on `DeployEventStream` makes overflow far less likely than on `MxEventStream` in practice, but the race is identical. + +**Recommendation:** Mirror the `MxEventStream` fix: add a `terminated` flag and `terminalLock`, route `close()`, `onCompleted`, and the overflow branch through a single `terminate(MxGatewayException)` method that wins on first arrival, and add the regression analogous to `MxGatewayMediumFindingsTests.eventStreamOverflowExceptionSurvivesASubsequentClose`. Given the two stream classes are now structural copies of each other, consider extracting the queue/terminate plumbing into a shared base or helper so the next fix lands once. + +**Resolution:** 2026-05-20 — Mirrored the `MxEventStream` terminal-state serialisation in `DeployEventStream`: replaced the `AtomicBoolean closed` field with a `volatile boolean closed`, added a `terminalLock`/`terminated` pair, and routed all terminal paths (`close()`, `onCompleted()`, the overflow branch in `offer()`) through a single private `terminate(MxGatewayException fault)` method guarded by `synchronized (terminalLock) { if (terminated) return; terminated = true; ... }`. 
The first terminal condition wins: an overflow that publishes `[exception, END]` is no longer wiped by a subsequent `close()`/`onCompleted()` that previously took the "queue full → clear + offer(END)" branch. The class-level Javadoc now documents the single-consumer-thread iterator contract and the deterministic terminal transition, matching `MxEventStream`. Behavior outside the terminal path is unchanged: `beforeStart` still resolves the close-before-beforeStart race (Client.Java-014's deploy-stream counterpart, already in place), `take()` still surfaces interrupts, and the request stream is still cancelled on overflow/close. Regression tests in `GalaxyRepositoryClientTests`: `deployEventStreamOverflowExceptionSurvivesASubsequentClose` (deterministic — capacity-2 stream, force overflow, then close, assert the overflow exception is surfaced) and `deployEventStreamConcurrentOverflowAndCloseAlwaysTerminate` (300-iteration concurrent race stress, mirrors `MxGatewayMediumFindingsTests.eventStreamConcurrentOverflowAndCloseAlwaysTerminate`). + +### Client.Java-022 + +| Field | Value | +|---|---| +| Severity | Low | +| Category | Documentation & comments | +| Location | `clients/java/mxgateway-client/src/main/java/com/dohertylan/mxgateway/client/MxGatewayChannels.java:161-172` | +| Status | Resolved | + +**Description:** The Javadoc on the no-validator `toCompletable(source, operation)` overload claims: "calling `cancel(true)` on either the direct return value or the user-facing chained future ultimately invokes `source.cancel(true)` (chained futures forward to the upstream stage they were derived from, which is this future)." This is not how `CompletableFuture.thenApply` (or `thenCompose`, `whenComplete`, etc.) actually behaves: a downstream stage's `cancel()` only marks that derived stage as cancelled, it does NOT propagate cancellation upstream to the originating `CancellingCompletableFuture`. 
The Client.Java-015 resolution actually fixes the bug by inlining the validator into the new `toCompletable(source, operation, validator)` overload (lines 224-252) so users never need a downstream stage, and by `GalaxyRepositoryClient.discoverHierarchyAsync` using an explicit `AtomicReference`-based override (which has a correct comment at lines 218-221 acknowledging exactly this `thenCompose` limitation). The contradiction between the two adjacent comments will mislead the next maintainer who decides to add a convenience `.thenApply` on top of a `*Async` return value — they will assume cancellation still flows through and re-introduce the Client.Java-015 leak. + +**Recommendation:** Rewrite the `toCompletable` Javadoc to state the actual contract: `cancel(...)` on the direct return value (the `CancellingCompletableFuture` instance) forwards to the source RPC, but `cancel(...)` on a `thenApply`/`thenCompose`/`thenAccept` *of* that future does not — the cancellation is captured at the derived stage and the upstream RPC continues until its deadline. Callers that need cancellation through a chained pipeline must follow the `discoverHierarchyAsync` pattern (custom `CompletableFuture` subclass tracking the current in-flight stage). The underlying `CancellingCompletableFuture` class doc (lines 254-258) is already correct; only the `toCompletable` paragraph is misleading. + +**Resolution:** 2026-05-20 — Rewrote the `toCompletable(source, operation)` Javadoc in `MxGatewayChannels` to reflect the actual `CompletableFuture` contract. The doc now states unambiguously: cancelling the direct return value (the `CancellingCompletableFuture`) forwards to the source `ListenableFuture` and aborts the underlying gRPC call (the Client.Java-015 fix), but cancelling a derived `thenApply`/`thenCompose`/`thenAccept`/`whenComplete` stage of that future does NOT propagate cancellation upstream — the derived stage is marked cancelled while the source RPC continues until its deadline.
The Javadoc explicitly directs callers that need cancellation through a chained pipeline to either the `toCompletable(source, operation, validator)` overload (which inlines the validator into the `FutureCallback.onSuccess` path so the user-visible future is the same future cancellation is bound to) or the `GalaxyRepositoryClient.discoverHierarchyAsync` `AtomicReference`-based pattern (for `thenCompose` across paged calls). The `CancellingCompletableFuture` class Javadoc was already correct and is unchanged. Doc-only change; no behavior change and no new test required. + +### Client.Java-023 + +| Field | Value | +|---|---| +| Severity | Low | +| Category | Correctness & logic bugs | +| Location | `clients/java/mxgateway-cli/src/main/java/com/dohertylan/mxgateway/cli/MxGatewayCli.java:1054`, `src/MxGateway.Contracts/Protos/mxaccess_gateway.proto:634` | +| Status | Resolved | + +**Description:** `MxEvent.worker_sequence` is a proto `uint64` (line 634 of `mxaccess_gateway.proto`). The `stream-events` CLI text path prints it with `%d` (`client.out().printf("%d %s%n", event.getWorkerSequence(), event.getFamily());`), which interprets the underlying signed `long` value — sequences past `2^63` would render as a negative number. This is the exact same `uint64`-with-`%d` bug that Client.Java-020 fixed for the `galaxy-watch` `DeployEvent.sequence` field; the resolution's stated rule ("The same rule should apply to any other `uint64` proto fields that surface in CLI text output") was never extended to this site. In practice worker sequences will not reach `2^63` so this is latent rather than active, but the same fix and the same regression-test pattern apply. 
+ +**Recommendation:** Replace the `%d` with `%s` plus `Long.toUnsignedString(event.getWorkerSequence())` (matching the Client.Java-020 fix in `GalaxyWatchCommand`), and add a regression test analogous to `MxGatewayCliTests.deployEventSequenceRendersAsUnsignedForHighUint64` covering the `stream-events` text-mode format string with `-1L`. The `--after-worker-sequence` CLI option (line 1035) is also typed as a `long`, which means the user cannot pass an unsigned value above `2^63 - 1` from the command line; that is a related but separate ergonomic gap worth noting in the same change. + +**Resolution:** 2026-05-20 — Updated the `stream-events` text-mode `client.out().printf` in `MxGatewayCli.StreamEventsCommand.call()` to use `%s` for the sequence and pass `Long.toUnsignedString(event.getWorkerSequence())`, mirroring the Client.Java-020 fix in `GalaxyWatchCommand`. Worker sequences past `2^63` now render as their correct unsigned decimal string instead of a negative signed long. An inline comment near the `printf` documents the unsigned-uint64 contract so the next person editing the format string knows not to switch back to `%d`. The JSON path through `protoJson(event)` was already correct (proto `JsonFormat` emits unsigned longs as decimal strings) and is unchanged. The `--after-worker-sequence` `long` ergonomic gap is a separate v2 concern and intentionally out of scope. Regression test: `MxGatewayCliTests.streamEventsWorkerSequenceRendersAsUnsignedForHighUint64` exercises the format string with the max-uint64 bit pattern (`-1L`) and asserts the output starts with `18446744073709551615 ` and does not start with `-1 `, mirroring `deployEventSequenceRendersAsUnsignedForHighUint64`. 
+ +### Client.Java-024 + +| Field | Value | +|---|---| +| Severity | Low | +| Category | Correctness & logic bugs | +| Location | `clients/java/mxgateway-cli/src/main/java/com/dohertylan/mxgateway/cli/MxGatewayCli.java:855-883` | +| Status | Resolved | + +**Description:** `BenchReadBulkCommand` records per-call latency in `latenciesNanos[latencyCount++] = elapsed;` inside *both* the success branch (line 865) and the `catch (Exception ex)` failure branch (line 880). The failed-call durations are then fed into the `percentileSummaryMs` p50/p95/p99 calculation alongside successful calls, producing misleading latency stats when even a few transport errors occur during the bench window. Client.Rust-015 fixed exactly this pattern in `clients/rust/src/bin/bench-read-bulk.rs` ("stop bench-read-bulk from polluting success-latency histograms with failed-call durations"); the equivalent fix was not applied to the Java implementation. The cross-language matrix runner (`scripts/run-client-e2e-tests.ps1`) compares numbers across all five clients, so the Java numbers will be silently inconsistent with the Rust numbers on the same fault profile. + +**Recommendation:** Drop the failure-branch latency record (only count `failed++`), or alternately maintain a separate `failedLatenciesNanos` array and report it as a distinct stat in the JSON output — but the success histogram must not include failed-call latencies. Cross-check the .NET, Go, and Python `bench-read-bulk` drivers in the same change to make sure all five clients use the same success-latency definition; the cross-language matrix is only useful if the metric is uniform. + +**Resolution:** 2026-05-20 — Dropped the failure-branch latency record in `BenchReadBulkCommand.call()`: the `catch (Exception ex)` block no longer appends `elapsed` to `latenciesNanos` and no longer grows the array — it only increments `failed++`. 
The success-latency histogram fed into `percentileSummaryMs` (p50/p95/p99/max/mean) is now success-call-only, matching the Client.Rust-015 fix. The JSON output still surfaces `failedCalls` as a distinct top-level count so observers see fault rates separately from latency. An inline comment on the catch block documents the contract so the next maintainer doesn't reinstate the record. New CLI test `MxGatewayCliTests.benchReadBulkCommandEmitsJsonSchemaKeys` (added under Client.Java-026 below) covers the JSON schema produced by the corrected path. The .NET / Go / Python bench drivers were intentionally left out of scope for this Java-focused finding — that cross-client audit is its own follow-up and tracked separately. + +### Client.Java-025 + +| Field | Value | +|---|---| +| Severity | Low | +| Category | Code organization & conventions | +| Location | `clients/java/mxgateway-cli/src/main/java/com/dohertylan/mxgateway/cli/MxGatewayCli.java:1176-1185` | +| Status | Resolved | + +**Description:** `CommonOptions.toClientOptions()` populates the `MxGatewayClientOptions` builder with `endpoint`, `apiKey`, `plaintext`, `caCertificatePath`, `serverNameOverride`, and `callTimeout`, but never sets `shutdownTimeout` even though Client.Java-019 introduced it as a first-class option. CLI users therefore always inherit the 10-second default and have no way to override it from the command line, which makes the new option effectively client-library-only. CLI users running long-lived operations (a big `discover-hierarchy` page-chain, a streaming `galaxy-watch` session that needs to drain on Ctrl+C) cannot tune the shutdown deadline up; users running short health probes who want a small `connectTimeout` *and* a small `shutdownTimeout` to keep the CLI snappy on failure also cannot. 
+ +**Recommendation:** Add a `--shutdown-timeout` option to `CommonOptions` (parsed via the existing `parseDuration` helper, default unset → use the 10-second library default) and propagate it into `toClientOptions()` so the CLI surface tracks the library surface. Include the resolved value in `redactedJsonMap()` so `--json` output shows the effective shutdown deadline. + +**Resolution:** 2026-05-20 — Added a `--shutdown-timeout` option to `CommonOptions` in `MxGatewayCli.java`, parsed via the existing `parseDuration` helper (so it accepts `10s`, `500ms`, ISO-8601 `PT10S`, etc.). A new lazy accessor `resolvedShutdownTimeout()` returns the parsed `Duration` when the user passed `--shutdown-timeout`, or `null` when unset so the `MxGatewayClientOptions` builder default (10s, established by Client.Java-019) applies. `toClientOptions()` now conditionally calls `builder.shutdownTimeout(resolvedShutdownTimeout)` only when the user opted in, preserving the library default for the common case. `redactedJsonMap()` includes the resolved value under key `"shutdownTimeout"` (empty string when unset) so `--json` output shows the effective shutdown deadline. The CLI surface now tracks the library surface so a user running a long page-chain can pass `--shutdown-timeout 60s`, and a user running a short health probe can pair `--timeout 500ms` with `--shutdown-timeout 500ms` to keep the CLI snappy on failure. Behavior for callers who do not pass the new flag is unchanged. 
+ +### Client.Java-026 + +| Field | Value | +|---|---| +| Severity | Low | +| Category | Testing coverage | +| Location | `clients/java/mxgateway-cli/src/test/java/com/dohertylan/mxgateway/cli/MxGatewayCliTests.java` | +| Status | Resolved | + +**Description:** Client.Java-013 explicitly deferred adding CLI-level test coverage for the `read-bulk`, `write-bulk`, and `bench-read-bulk` subcommands ("Optionally also add at least one CLI-level test for `read-bulk`, `write-bulk`, and the `bench-read-bulk` subcommands to keep parity with the .NET / Go / Rust CLI smoke matrix"), and the resolution explicitly stated that "follow-up is tracked separately and out of scope for this unblock-compilation fix." That follow-up was never filed. The current `MxGatewayCliTests` only covers `version`, `open-session` (JSON redaction), `write`, `smoke`, `subscribe-bulk`, `unsubscribe-bulk`, and the Client.Java-020 unsigned-uint64 format string — six of the thirteen non-trivial subcommands the CLI ships are completely untested at the CLI layer (`read-bulk`, `write-bulk`, `write2-bulk`, `write-secured-bulk`, `write-secured2-bulk`, `bench-read-bulk`), as are `stream-events`, the four `galaxy-*` commands, and `close-session`. The `FakeSession` stubs all return empty lists, so an end-to-end CLI test would catch JSON-shape regressions, argument-parsing bugs, and option contract breaks that the bulk Session unit tests on the library side do not exercise. This same coverage gap is what made Client.Java-013 itself only surface on a clean Gradle build. + +**Recommendation:** Add at least one round-trip CLI test per bulk subcommand (`read-bulk`, `write-bulk`, `write2-bulk`, `write-secured-bulk`, `write-secured2-bulk`) that exercises the JSON output shape and the value parser (`parseValue(type, text)` is shared across all five and the only `write*-bulk` path that catches typos in the type switch). Extending the `FakeSession` stubs to return at least one result row makes the assertions meaningful. 
The `bench-read-bulk` test can run with a 1-second `--duration-seconds` and a 0-second `--warmup-seconds` and assert the JSON schema keys (`totalCalls`, `latencyMs.p50`, `callsPerSecond`) rather than the numeric values. + +**Resolution:** 2026-05-20 — Added round-trip CLI tests for all six bulk-family subcommands plus the new Client.Java-023 unsigned-uint64 regression to `MxGatewayCliTests`. The `FakeSession` stubs were upgraded from empty-list returns to per-call recorders that publish the parsed entries (e.g. `lastWriteBulkEntries`, `lastReadBulkTimeoutMs`) and synthesise one `BulkReadResult`/`BulkWriteResult` per requested handle so the JSON output assertions exercise the `bulkReadResultMap` and `bulkWriteResultMap` serialisers. New tests: (a) `readBulkCommandForwardsTimeoutAndPrintsResults` — asserts `--timeout-ms 750` reaches the session and the JSON output carries the per-tag `tagAddress`, `itemHandle`, `wasCached`, and `quality` fields; (b) `writeBulkCommandParsesTypedValuesAndPrintsResults` — asserts `--type int32 --values 111,222 --user-id 5` parses through the shared `parseValue` switch and the entries are constructed with the expected typed `MxValue` and `userId`; (c) `write2BulkCommandForwardsTimestampAndPrintsResults` — asserts the `--timestamp 2026-05-20T00:00:00Z` reaches the entry as a `timestampValue` (`hasTimestampValue()` is true); (d) `writeSecuredBulkCommandForwardsUserIdsAndPrintsResults` — asserts `--current-user-id 7 --verifier-user-id 8` are both propagated; (e) `writeSecured2BulkCommandForwardsTimestampAndUserIdsAndPrintsResults` — combination of (c) and (d); (f) `benchReadBulkCommandEmitsJsonSchemaKeys` — runs the bench in a 1s steady / 0s warmup window and asserts the JSON output contains the cross-language schema keys (`language=java`, `command=bench-read-bulk`, `bulkSize=2`, `totalCalls`, `successfulCalls`, `failedCalls`, `callsPerSecond`, `latencyMs.p50/p95/p99`, `tags` including the synthesised 
`TestMachine_001.TestChangingInt`/`TestMachine_002.TestChangingInt` pair); (g) `streamEventsWorkerSequenceRendersAsUnsignedForHighUint64` — Client.Java-023 regression. The recommendation's stream-events and galaxy-* CLI tests were intentionally not added in this round — they require either an in-process gateway/galaxy server or package-private `MxEventStream`/`DeployEventStream` constructor access from the CLI test module, which is its own infrastructure work; the library-side tests in `GalaxyRepositoryClientTests` already cover the streaming wire behaviour. diff --git a/code-reviews/Client.Python/findings.md b/code-reviews/Client.Python/findings.md index 7220266..cdfe667 100644 --- a/code-reviews/Client.Python/findings.md +++ b/code-reviews/Client.Python/findings.md @@ -5,28 +5,28 @@ | Module | `clients/python` | | Reviewer | Claude Code | | Review date | 2026-05-20 | -| Commit reviewed | `1cd51bb` | +| Commit reviewed | `a020350` | | Status | Reviewed | | Open findings | 0 | ## Checklist coverage -A re-review at commit `1cd51bb` over the same module. Prior findings -(Client.Python-001 — Client.Python-012) remain closed and are kept as +A re-review at commit `a020350` over the same module. Prior findings +(Client.Python-001 — Client.Python-017) remain closed and are kept as history. This section reflects categories evaluated in this pass. | # | Category | Result | |---|---|---| -| 1 | Correctness & logic bugs | Issue found: `_use_plaintext` silently downgrades any `localhost:` / `127.0.0.1:` endpoint to plaintext (Client.Python-013). | -| 2 | mxaccessgw conventions | No new issues found — secrets redacted, MXAccess parity preserved, generated code untouched, no Blazor/COM violations apply (Python client). | -| 3 | Concurrency & thread safety | No new issues found — close-idempotency hazard fixed in Client.Python-006, shared `_canceling_iterator` cancels on `CancelledError`. 
| -| 4 | Error handling & resilience | No new issues found at this commit (prior 003, 007, 011 remain closed). | -| 5 | Security | Issue found: implicit plaintext-on-localhost (Client.Python-013) means a user explicitly listing a TLS-fronted loopback endpoint with `--api-key` but without `--tls`/`--plaintext` silently transmits the bearer token in cleartext. | -| 6 | Performance & resource management | No new issues found — `iter_hierarchy` streams pages lazily (Client.Python-005 resolution). | -| 7 | Design-document adherence | No new issues found — `PythonClientDesign.md` matches the implemented surface. | -| 8 | Code organization & conventions | Issue found: duplicate `from mxgateway.values import` lines in `commands.py:22-23` (Client.Python-014). | -| 9 | Testing coverage | Issues found: `bench_read_bulk` CLI body, `MAX_AGGREGATE_EVENTS` event-cap, and `_use_plaintext` localhost-auto-plaintext path are untested (Client.Python-015, Client.Python-016). | -| 10 | Documentation & comments | Issues found: `pyproject.toml` lacks PyPI metadata (`authors`, `license`, `classifiers`, `urls`) and no PEP 561 `py.typed` marker (Client.Python-017); auto-plaintext behaviour is undocumented (Client.Python-013). | +| 1 | Correctness & logic bugs | No new issues found — TLS-by-default fix in Client.Python-013 verified; no test fixture accidentally relies on plaintext defaults. | +| 2 | mxaccessgw conventions | No new issues found — secrets redacted, MXAccess parity preserved, generated code untouched. | +| 3 | Concurrency & thread safety | No new issues found — close-idempotency and shared cancel-on-cancel iterator still in place. | +| 4 | Error handling & resilience | No new issues found. | +| 5 | Security | No new issues found — `_use_plaintext` now requires explicit `--plaintext` opt-in (Client.Python-013 resolution verified). The `--api-key` flag is also still redacted from the option repr and CLI errors. | +| 6 | Performance & resource management | No new issues found. 
| +| 7 | Design-document adherence | No new issues found — `PythonClientDesign.md` is consistent with the implemented surface. | +| 8 | Code organization & conventions | Issue found: `mxgateway_cli` is shipped in the wheel but has no PEP 561 `py.typed` marker (Client.Python-019), so the CLI module's inline type hints are invisible to downstream `mypy` runs. | +| 9 | Testing coverage | Issue found: no test exercises the wheel-build / editable-install flow; the broken `pyproject.toml` (Client.Python-018) was not caught at commit time because the test suite runs from `src/` via `pytest pythonpath` (Client.Python-020). | +| 10 | Documentation & comments | Issue found: cross-client CLI parity gap — the Python CLI ships none of the Galaxy subcommands (`galaxy-test-connection`, `galaxy-last-deploy`, `galaxy-discover`, `galaxy-watch`) the .NET / Go / Rust / Java CLIs all expose, and lacks the new `.NET`-only `bench-stream-events`. README does not flag the gap (Client.Python-021). | ## Findings @@ -464,3 +464,306 @@ declared it in `[tool.setuptools.package-data] mxgateway = ["py.typed"]` so the wheel ships the marker and downstream `mypy` users see the inline type hints. Pure metadata / packaging change — `python -m pytest -q` still passes (91 tests). + +### Client.Python-018 + +| Field | Value | +|---|---| +| Severity | High | +| Category | Code organization & conventions | +| Location | `clients/python/pyproject.toml:11` | +| Status | Resolved | + +**Description:** The Client.Python-017 resolution set +`license = "Proprietary"` as a top-level string. Under PEP 639 (enforced +by `setuptools >= 77`, and active in the installed `setuptools 82.0.1`), +the `project.license` string form must be a valid SPDX expression. +`"Proprietary"` is not a registered SPDX identifier, so the configured +build backend (`setuptools.build_meta`) refuses the file outright. Both +`python -m pip wheel . 
--no-deps --wheel-dir …` and +`python -m pip install -e .` — the exact commands documented in +`clients/python/README.md` ("Build And Test", "Packaging") and the +"build wheel" instruction in `docs/ClientPackaging.md` — now fail before +any source is compiled with: + +``` +ValueError: invalid pyproject.toml config: `project.license`. +configuration error: `project.license` must be valid exactly by one definition (0 matches found): + - {type: string, format: 'SPDX'} + - type: table keys: 'file': … required: ['file'] + - type: table keys: 'text': … required: ['text'] +``` + +`python -m pytest` still runs because `[tool.pytest.ini_options] +pythonpath = ["src"]` lets pytest import the package without an install +— which masked the regression at commit time and explains how the +Client.Python-017 resolution comment was able to assert "`python -m +pytest -q` still passes (91 tests)" while shipping a wheel build that +cannot start. The Client.Python-017 resolution comment that "the SPDX +`Proprietary` expression matches the de-facto status" is incorrect: +`Proprietary` is *not* a registered SPDX identifier; only entries on the +SPDX licence list (e.g. `MIT`, `Apache-2.0`, `BSD-3-Clause`) or +`LicenseRef-*` custom identifiers satisfy the +`{ type: string, format: 'SPDX' }` rule. PEP 639 added the +`LicenseRef-…` escape hatch precisely for proprietary / unlisted +licences. + +This is a regression of the developer-onboarding workflow introduced by +the very commit being reviewed. A fresh checkout cannot run +`python -m pip install -e ".[dev]"` (the command in `CLAUDE.md`'s +"Clients" section) without first patching `pyproject.toml`. + +**Recommendation:** Fix the `license` value so the build backend +accepts it. Three concrete options, in order of preference: + +1. Use a `LicenseRef-*` SPDX-compatible custom identifier: + `license = "LicenseRef-Proprietary"`. Requires no additional + `LICENSE` file and is honoured by setuptools / pip / PyPI as a + proprietary marker. +2. 
Add a top-level `LICENSE` file (or `clients/python/LICENSE`) and + point at it via the table form: + `license = { file = "LICENSE" }`. This also documents the proprietary + terms. +3. Drop the `license` key entirely and convey the same intent via the + classifier `"License :: Other/Proprietary License"` (already part of + the classifier set), reverting the PEP-639 string field that the + build backend now insists must be SPDX. + +Add a CI / pre-commit check that runs `python -m pip wheel . --no-deps` +(or `python -m build`) on `clients/python` so a future +`pyproject.toml` regression is caught at commit time rather than at +first install on a clean machine. See also Client.Python-020. + +**Resolution:** 2026-05-20 — Dropped the invalid top-level +`license = "Proprietary"` string from `clients/python/pyproject.toml` +and added the existing `License :: Other/Proprietary License` trove +classifier to convey the same intent without violating PEP 639's SPDX +rule. No `LICENSE` file exists at the repo root or under +`clients/python/`, so the `license = { file = "LICENSE" }` table form +was not used; relying on the classifier is the option (3) variant +called out in the recommendation. Verified by running +`python -m pip wheel . --no-deps -w ./.test-wheel-output` from +`clients/python`: the build now succeeds and emits +`mxaccess_gateway_client-0.1.0-py3-none-any.whl` (47 KB) where +previously it failed with the `project.license must be valid exactly +by one definition` `ValueError`. The CI / pre-commit recommendation is +addressed by Client.Python-020. + +### Client.Python-019 + +| Field | Value | +|---|---| +| Severity | Low | +| Category | Code organization & conventions | +| Location | `clients/python/pyproject.toml:60-61`, `clients/python/src/mxgateway_cli/` | +| Status | Resolved | + +**Description:** Client.Python-017 added the PEP 561 marker file +`clients/python/src/mxgateway/py.typed` and declared it in +`[tool.setuptools.package-data] mxgateway = ["py.typed"]`. 
The wheel +therefore advertises `mxgateway` as typed. However the same wheel +also ships the **`mxgateway_cli`** package (`setuptools.packages.find` +with `where = ["src"]` discovers both `mxgateway` and `mxgateway_cli`, +confirmed via `find_packages` in this review), and `mxgateway_cli`: + +* is shipped in the wheel and is the package the `mxgw-py` console + script entry point resolves into (`[project.scripts] mxgw-py = + "mxgateway_cli.commands:main"`), +* is fully type-annotated (every function in `commands.py` has full + parameter and return annotations; `from __future__ import annotations` + is in effect), +* but has no `py.typed` file and is not listed in + `[tool.setuptools.package-data]`. + +PEP 561 requires the marker file inside **each** importable package the +distribution wants to expose to type checkers — the `mxgateway` marker +does not transfer to `mxgateway_cli`. A downstream consumer that imports +or composes against `mxgateway_cli` (e.g. wrapping it as a programmatic +CLI library) will see all symbols as `Untyped` under `mypy` despite the +hints being present in source. + +This is a follow-up to Client.Python-017 — the fix is small and pure +packaging. + +**Recommendation:** Create +`clients/python/src/mxgateway_cli/py.typed` (empty file, as PEP 561 +requires) and extend the existing package-data declaration so the +wheel ships it: + +```toml +[tool.setuptools.package-data] +mxgateway = ["py.typed"] +mxgateway_cli = ["py.typed"] +``` + +No source change in either package; verify by building a wheel +(once Client.Python-018 is fixed) and inspecting that both +`mxgateway/py.typed` and `mxgateway_cli/py.typed` appear in the wheel +contents. + +**Resolution:** 2026-05-20 — Created the empty PEP 561 marker file +`clients/python/src/mxgateway_cli/py.typed` and added +`mxgateway_cli = ["py.typed"]` under +`[tool.setuptools.package-data]` in `clients/python/pyproject.toml` +alongside the existing `mxgateway = ["py.typed"]` line. 
Verified by +inspecting the built wheel +(`mxaccess_gateway_client-0.1.0-py3-none-any.whl`): the archive now +contains both `mxgateway/py.typed` and `mxgateway_cli/py.typed`, so +downstream `mypy` consumers see the inline type hints in both +packages. Pure packaging change — no source modifications. + +### Client.Python-020 + +| Field | Value | +|---|---| +| Severity | Low | +| Category | Testing coverage | +| Location | `clients/python/tests/`, `scripts/` | +| Status | Resolved | + +**Description:** Client.Python-018 is invisible to the existing test +suite: `python -m pytest` passes because `[tool.pytest.ini_options] +pythonpath = ["src"]` lets pytest import the package without going +through `setuptools.build_meta`. None of the 91 tests build the wheel, +do an editable install, or otherwise exercise the +`setuptools.build_meta` configuration validator. As a result, a +`pyproject.toml` regression that breaks `pip install -e .` / +`pip wheel .` — the exact commands documented in the Python client +README and `CLAUDE.md` — passes the test suite green. The other +language clients have parallel coverage gaps (no CI-level "the package +installs" smoke test for Python in +`scripts/run-client-e2e-tests.ps1`, which only runs the live e2e +matrix and assumes the editable install already worked), but Python +is the only one whose published install command is currently broken. + +**Recommendation:** Add a thin pytest module (e.g. +`tests/test_packaging.py`) that runs + +```python +import subprocess, sys, pathlib +def test_pyproject_validates_against_setuptools_build_meta(tmp_path): + here = pathlib.Path(__file__).resolve().parent.parent + result = subprocess.run( + [sys.executable, "-m", "pip", "wheel", ".", + "--no-deps", "--no-build-isolation", + "--wheel-dir", str(tmp_path)], + cwd=here, capture_output=True, text=True, + ) + assert result.returncode == 0, result.stderr +``` + +(or any equivalent that invokes +`setuptools.config.pyprojecttoml.read_configuration`).
Mark the test +with `@pytest.mark.slow` if the wheel build is too heavy for the +default suite, and document the test in the README. Alternatively +add a CI step to `scripts/run-client-e2e-tests.ps1` (or a new +`scripts/check-python-package.ps1`) that fails the build when the +wheel build fails. Either approach would have surfaced +Client.Python-018 at commit time. + +**Resolution:** 2026-05-20 — Added +`clients/python/tests/test_packaging.py::test_pip_wheel_build_succeeds`. +The test invokes `python -m pip wheel . --no-deps --wheel-dir <temp-dir>` +against the package root via `subprocess` and asserts (a) exit code +zero and (b) an `mxaccess_gateway_client-*.whl` file is produced in +the temp directory, capturing stdout/stderr in the assertion message +on failure so any future PEP 639 / SPDX violation or other +`setuptools.build_meta` configuration error is reported with the +build backend's own error text. Verified the test would have caught +Client.Python-018: with the old `license = "Proprietary"` string in +place the test fails with the `project.license must be valid exactly +by one definition` `ValueError`. The pytest module is the simpler +half of the recommendation; no PowerShell wrapper script was added +since pytest already runs in the same `python -m pytest` invocation +the README documents. Test suite is now 92 tests (was 91), all +passing. + +### Client.Python-021 + +| Field | Value | +|---|---| +| Severity | Low | +| Category | Documentation & comments | +| Location | `clients/python/src/mxgateway_cli/commands.py`, `clients/python/README.md:235-258` | +| Status | Resolved | + +**Description:** Cross-client CLI parity check (one of the things the +review prompt asks for): the `mxgw-py` CLI subcommand set has drifted +from every other client CLI in the matrix.
+ +Subcommand inventory at this commit: + +| Subcommand | .NET (`mxgw`) | Go (`mxgw-go`) | Rust (`mxgw`) | Java (`mxgw-java`) | Python (`mxgw-py`) | +|---|---|---|---|---|---| +| `version` | yes | yes | yes | yes | yes | +| `ping` | yes | (no) | yes | (no) | yes | +| `open-session` / `close-session` | yes | yes | yes | yes | yes | +| `register` / `add-item` / `advise` | yes | yes | yes | yes | yes | +| `subscribe-bulk` / `unsubscribe-bulk` / `read-bulk` | yes | yes | yes | yes | yes | +| `write-bulk` / `write2-bulk` / `write-secured-bulk` / `write-secured2-bulk` | yes | yes | yes | yes | yes | +| `write` / `write2` | yes / (varies) | yes / (no) | yes / yes | yes / (no) | yes / yes | +| `stream-events` | yes | yes | yes | yes | yes | +| `smoke` | yes | yes | yes | yes | yes | +| `bench-read-bulk` | yes | yes | yes | yes | yes | +| `bench-stream-events` | **yes** | (no) | (no) | (no) | (no) | +| `galaxy-test-connection` (or alias) | **yes** | **yes** | **yes** | **yes** | **(no)** | +| `galaxy-last-deploy` / `galaxy-deploy-time` | **yes** | **yes** | **yes** | **yes** | **(no)** | +| `galaxy-discover` | **yes** | **yes** | **yes** | **yes** | **(no)** | +| `galaxy-watch` | **yes** | **yes** | **yes** | **yes** | **(no)** | + +Two parity gaps remain after Client.Python-013/017: + +1. The Python CLI ships **no Galaxy subcommands at all** even though + the `GalaxyRepositoryClient` library wrapper is fully implemented + and exercised by `tests/test_galaxy.py` / + `tests/test_galaxy_iter_hierarchy.py`. The README acknowledges the + `watch-deploy-events` gap inline ("The CLI does not currently + expose a streaming `watch-deploy-events` subcommand — use the + library API directly when subscribing to deploy events from + Python.") but does not call out that **the other three Galaxy + subcommands are also missing** — and the .NET / Go / Rust / Java + CLIs all expose them. 
A user running the cross-language smoke + matrix who expects Python to behave like the other clients sees a + silent "command not found" on `mxgw-py galaxy-test-connection`. +2. The new `bench-stream-events` subcommand (added to the .NET CLI in + the previous commit `1cd51bb`) is .NET-only today; the Python CLI + is consistent with Go / Rust / Java on this point. Worth flagging + as a forward-looking parity gap that will need filling if the + cross-language benchmark matrix grows a stream-events driver in + `scripts/`. + +Severity is Low because the existing `scripts/bench-read-bulk.ps1` +matrix only invokes `bench-read-bulk` and does not break, and the +Python `GalaxyRepositoryClient` library is fully functional — the gap +is purely in the test CLI surface. But cross-client parity is an +explicit review check and the gap is not documented. + +**Recommendation:** Either (a) add `galaxy-test-connection`, +`galaxy-last-deploy`, `galaxy-discover`, and `galaxy-watch` +subcommands to `mxgateway_cli/commands.py` (each is a thin wrapper +over `GalaxyRepositoryClient`, mirroring the existing four-language +implementation), or (b) update `clients/python/README.md`'s "CLI" +section with an explicit "CLI parity gaps" subsection that lists the +missing subcommands and recommends the library API. Option (a) is +preferable for cross-language matrix testing. Also document the +`bench-stream-events` gap symmetrically once a cross-language stream +benchmark driver is added under `scripts/`. + +**Resolution:** 2026-05-20 — Scoped this finding to a +documentation-only fix; the full Galaxy CLI parity implementation +(four new subcommands wired to `GalaxyRepositoryClient`) is a larger +piece of work and will be tracked as a separate follow-up finding. 
+Added a new "CLI Parity Gaps" subsection to +`clients/python/README.md` immediately under the existing CLI +section that explicitly enumerates the four missing +`mxgw-py` Galaxy subcommands (`galaxy-test-connection`, +`galaxy-last-deploy`, `galaxy-discover`, `galaxy-watch`), names the +sibling CLIs that already expose them (.NET `mxgw`, Go `mxgw-go`, +Rust `mxgw`, Java `mxgw-java`), points readers at the library API +(`GalaxyRepositoryClient`, already documented under "Galaxy +Repository Browse") as the supported Python entry point in the +interim, and also flags the .NET-only `bench-stream-events` gap so +the cross-language benchmark matrix has a record of the asymmetry. +No CLI source change; the implementation of the four Galaxy +subcommands is deferred. Resolved as a doc note rather than a full +parity fix. diff --git a/code-reviews/Client.Rust/findings.md b/code-reviews/Client.Rust/findings.md index ccb90cc..24a0670 100644 --- a/code-reviews/Client.Rust/findings.md +++ b/code-reviews/Client.Rust/findings.md @@ -5,26 +5,26 @@ | Module | `clients/rust` | | Reviewer | Claude Code | | Review date | 2026-05-20 | -| Commit reviewed | `1cd51bb` | +| Commit reviewed | `a020350` | | Status | Reviewed | | Open findings | 0 | ## Checklist coverage -This re-review (`1cd51bb`) covers the changes added since `3cc53a8`: the new bulk-write/read methods on `Session`, the `read_bulk` borrowed-slice signature, `MalformedReply` / `Unavailable` error variants, the projection-on-demand `MxValue`/`MxArrayValue`, the `next_correlation_id` rework, the new ReadBulk and bulk-write CLI subcommands, and the cross-language `bench-read-bulk` driver. +This re-review (`a020350`) covers the resolution work for Client.Rust-013 through 017 (scoped `doc_lazy_continuation` allow on generated submodules, `pub` `next_correlation_id` shared with the CLI, success/failure split in `bench-read-bulk`, eight new tests, design-doc resync). 
The pass spot-checked the items called out in the request: stability of the newly-`pub` `next_correlation_id`, the `bench-read-bulk` JSON shape vs the PowerShell driver, presence of `unsafe`, and the scope of `#![allow(clippy::doc_lazy_continuation)]`. `cargo clippy --workspace --all-targets -- -D warnings` and `cargo test --workspace` both pass on this commit. | # | Category | Result | |---|---|---| -| 1 | Correctness & logic bugs | Issue found: `read_bulk` is missing the OK-but-shapeless `MalformedReply` symmetry of the other bulk helpers, but the bigger issue is no test exercises any of the new `MalformedReply` paths (Client.Rust-016). | -| 2 | mxaccessgw conventions | Issue found: `cargo clippy --workspace --all-targets -- -D warnings` still fails — a fresh `clippy::doc_lazy_continuation` violation in `ReadBulkCommand`'s generated doc comment trips the lint that the prior fixes did not anticipate (Client.Rust-013). CLI subcommands still emit hard-coded `client_correlation_id` strings on the `raw` paths (Client.Rust-014). | -| 3 | Concurrency & thread safety | No issues found — `CORRELATION_SEQUENCE` is `AtomicU64` with `Relaxed`, which is correct for monotonic id generation; clients remain cheaply cloneable; streams are `Send`. | -| 4 | Error handling & resilience | Issue found: `bench-read-bulk` records every `read_bulk` failure into the latency histogram as if it succeeded, skewing p99/max upward (Client.Rust-015). The new `Error::Unavailable` mapping looks correct. | -| 5 | Security | No issues found — API keys still redacted in `Debug`/`Display`, status messages scrubbed, secret arguments unchanged. | -| 6 | Performance & resource management | No issues found in the changed code — `read_bulk` is honest about the unavoidable owned-Vec materialisation; projection-on-demand is now lazy. 
| -| 7 | Design-document adherence | Issue found: `RustClientDesign.md` was refreshed but never grew the new bulk-write/read methods, the `Unavailable`/`MalformedReply` error variants, or the `bench-read-bulk` CLI command on its current surface (Client.Rust-017). | -| 8 | Code organization & conventions | No new issues — `BulkWriteReplyKind` follows the renamed `BulkReplyKind` shape. | -| 9 | Testing coverage | Issue found: none of the new code paths (bulk-write helpers, `read_bulk`, `MalformedReply`, `Error::Unavailable`, the `bench-read-bulk` flow) are covered by client-side tests (Client.Rust-016). | -| 10 | Documentation & comments | No new issues beyond Client.Rust-017. | +| 1 | Correctness & logic bugs | No issues found — the five new `MalformedReply` paths and the `read_bulk` mismatched-payload branch each have a dedicated test; `BenchReadBulkStats` correctly partitions success vs failure latency. | +| 2 | mxaccessgw conventions | No issues found — `cargo clippy --workspace --all-targets -- -D warnings` and `cargo test --workspace` both pass on this commit; the `#![allow(clippy::doc_lazy_continuation)]` allow is scoped narrowly to each generated v1 inner module so hand-written code is unaffected; CLI `Ping`/`CloseSession` now call `session::next_correlation_id`. | +| 3 | Concurrency & thread safety | No issues found — `CORRELATION_SEQUENCE` is `AtomicU64` with `Relaxed`, correct for monotonic id generation; no `unsafe` anywhere in `src/` or `crates/`. | +| 4 | Error handling & resilience | Issue found: the `bench-read-bulk` fix for Client.Rust-015 has fixed Rust's own histogram honestly but the change makes Rust's `latencyMs` semantically incompatible with the four other clients' `latencyMs` field that the cross-language PowerShell driver collates side-by-side (Client.Rust-018). 
| +| 5 | Security | No issues found — API keys still redacted in `Debug`/`Display`, status messages scrubbed, `first_failure` records `Error::Display` (which already redacts `mxgw_*` tokens) so secure-write values cannot leak into the bench JSON. | +| 6 | Performance & resource management | No issues found in the reviewed delta. | +| 7 | Design-document adherence | Issue found: `RustClientDesign.md` Session signatures for the four bulk-write helpers and `read_bulk` do not match the actual implementation — the design lists trailing `user_id` / `timestamp` / `current_user_id` / `verifier_user_id` parameters and a `Vec` return that the impl does not have (all of those move per-entry into `WriteBulkEntry` etc.) (Client.Rust-019). | +| 8 | Code organization & conventions | No new issues — `BenchReadBulkStats` is cleanly factored out and tested. | +| 9 | Testing coverage | No new issues — the malformed-reply paths and unary `Error::Unavailable` are now covered, and the four bulk-write families each have round-trip smoke. | +| 10 | Documentation & comments | Issue found: `next_correlation_id` is now `pub` and its doc comment commits the SDK to the literal `"rust-client-{label}-{N}"` string format, but neither the doc nor `lib.rs` re-exports it or declares any stability stance, leaving the public surface ambiguous (Client.Rust-020). | ## Findings @@ -310,3 +310,81 @@ Optionally add Write2Bulk / WriteSecuredBulk / WriteSecured2Bulk smoke assertion **Recommendation:** Bring the design doc back in sync: extend the `Session` API code block to enumerate the bulk-write/read methods, expand the `Error` enum to match `clients/rust/src/error.rs`, and add the missing CLI subcommands. The README is already up to date, so this is design-doc-only churn. **Resolution:** 2026-05-20 — Brought `clients/rust/RustClientDesign.md` back in sync with the implementation. 
The `Session` block now lists the five new bulk helpers (`write_bulk`, `write2_bulk`, `write_secured_bulk`, `write_secured2_bulk`, `read_bulk`) alongside the original six and notes that `session::next_correlation_id` is `pub` for raw-RPC consumers (the CLI). The `Error` enum block now matches `clients/rust/src/error.rs` — `InvalidEndpoint`, `InvalidArgument`, `Transport`, `Authentication`, `Authorization`, `Timeout`, `Cancelled`, `Unavailable`, `Status`, `Command`, `ProtocolStatus`, `MalformedReply` — with a short paragraph explaining what `Unavailable`, `MalformedReply`, and `InvalidEndpoint` classify. The `Test CLI` block enumerates every subcommand the binary exposes today: `version`, `ping`, `open-session`, `close-session`, `register`, `add-item`, `advise`, `subscribe-bulk`, `unsubscribe-bulk`, `read-bulk`, `write`, `write2`, `write-bulk`, `write2-bulk`, `write-secured-bulk`, `write-secured2-bulk`, `stream-events`, `bench-read-bulk`, `smoke`, and the `galaxy {test-connection,last-deploy-time,discover-hierarchy,watch}` subtree. + +### Client.Rust-018 + +| Field | Value | +|---|---| +| Severity | Medium | +| Category | Error handling & resilience | +| Location | `clients/rust/crates/mxgw-cli/src/main.rs:1098-1170`; `scripts/bench-read-bulk.ps1:347-365`; siblings: `clients/go/cmd/mxgw-go/main.go:600-648`, `clients/python/src/mxgateway_cli/commands.py:614-662`, `clients/dotnet/MxGateway.Client.Cli/MxGatewayClientCli.cs:685-770`, `clients/java/mxgateway-cli/src/main/java/com/dohertylan/mxgateway/cli/MxGatewayCli.java:855-940` | +| Status | Resolved | + +**Description:** Client.Rust-015's resolution split Rust's bench histogram so `latencyMs` records only successful `read_bulk` calls and a new `failureLatencyMs` field holds failed-call durations. 
The local logic is right, the unit test (`bench_read_bulk_stats_keeps_failures_out_of_success_latency_histogram`) is right, and the JSON shape stays additively compatible with `scripts/bench-read-bulk.ps1` (the collator reads `$s.latencyMs.p50`/`p95`/`p99`/`max`/`mean` and these keys still exist on the Rust output). The problem is cross-language: the .NET, Go, Python, and Java bench implementations still push every call's elapsed time into a single `latenciesMs` / `latencies_ms` / `latencyMillis` array regardless of success or failure (e.g. `clients/go/cmd/mxgw-go/main.go:611` appends before the success/failure branch; `clients/python/src/mxgateway_cli/commands.py:624,626` appends in both `except` and the happy path; `clients/dotnet/MxGateway.Client.Cli/MxGatewayClientCli.cs:701,705` adds in both `catch` and the OK path; `clients/java/mxgateway-cli/src/main/java/com/dohertylan/mxgateway/cli/MxGatewayCli.java:865,880` records in both branches). The PS driver's side-by-side comparison table (lines 348-360) pulls `latencyMs.p50/p95/p99/max/mean` from every client and prints them in one row, so a partial-failure run now shows Rust's p99 measured over successes only and the other four clients' p99 measured over (success + per-call timeout) — the numbers are silently no longer comparable. This re-introduces the original Client.Rust-015 problem at the cross-language layer that the fix was meant to remove. + +**Recommendation:** Make the contract uniform. Either (a) revert Rust's `latencyMs` to the all-calls histogram for backwards/cross-language compatibility and keep `failureLatencyMs` as an additive Rust-only enrichment, or (b) push the same success-only / failure-separated split into the .NET, Go, Python, and Java bench commands so every language emits the honest pair (`latencyMs` = success, `failureLatencyMs` = failure, plus `firstFailure`) and update the PS driver's table column to make the success-only semantics explicit (`p99 ok ms`). 
Option (b) is the better long-term posture but it is a cross-client change; option (a) restores comparability immediately. + +**Resolution:** 2026-05-20 — Took option (a) to restore cross-language comparability immediately. Reverted Rust's `latencyMs` to the all-calls histogram so it matches the .NET/Go/Python/Java bench shape that `scripts/bench-read-bulk.ps1` collates side-by-side: `BenchReadBulkStats::record_success` and `record_failure` now both push elapsed time into a single `latencies_ms` vector, and `record_failure` additionally pushes into `failure_latencies_ms` and stashes the first failure's redacted error string in `first_failure`. The JSON output keeps `failureLatencyMs` and `firstFailure` as Rust-only additive enrichment so a partial-failure run is still visible at the report layer without breaking the side-by-side table. Renamed the unit test to `bench_read_bulk_stats_tracks_all_calls_in_latency_and_failures_separately`; it now asserts `latencyMs.max == 1500.0` (the slow failure is included in the cross-language `latencyMs` contract) while `failureLatencyMs.max == 1500.0` and `firstFailure` still surface the failure separately for diagnostics. Pushing the success-only / failure-separated split into the other four clients (option (b)) is the better long-term posture but is deliberately out of scope here. + +### Client.Rust-019 + +| Field | Value | +|---|---| +| Severity | Low | +| Category | Design-document adherence | +| Location | `clients/rust/RustClientDesign.md:96-100` | +| Status | Resolved | + +**Description:** Client.Rust-017 was closed by adding the new bulk-write/read entries to the design doc, but the signatures shown in the code block do not match the implementation. 
The doc declares: + +```rust +pub async fn write_bulk(&self, server_handle: i32, entries: Vec, user_id: i32) -> Result, Error>; +pub async fn write2_bulk(&self, server_handle: i32, entries: Vec, timestamp: prost_types::Timestamp, user_id: i32) -> Result, Error>; +pub async fn write_secured_bulk(&self, server_handle: i32, entries: Vec, current_user_id: i32, verifier_user_id: i32) -> Result, Error>; +pub async fn write_secured2_bulk(&self, server_handle: i32, entries: Vec, timestamp: prost_types::Timestamp, current_user_id: i32, verifier_user_id: i32) -> Result, Error>; +pub async fn read_bulk(&self, server_handle: i32, tags: &[String], timeout_ms: u32) -> Result, Error>; +``` + +The actual implementations in `clients/rust/src/session.rs:385-526` take only `(server_handle, entries)` — `user_id` is per-entry on `WriteBulkEntry`/`Write2BulkEntry`, `timestamp_value` is per-entry on `Write2BulkEntry`/`WriteSecured2BulkEntry`, and `current_user_id`/`verifier_user_id` are per-entry on `WriteSecured{,2}BulkEntry`. The protobuf in `src/MxGateway.Contracts/Protos/mxaccess_gateway.proto:364-416` confirms this — there is no top-level `user_id` on these commands. The doc also returns `Vec` but the generated type is `BulkReadResult` (the gateway's `BulkReadReply` carries `repeated BulkReadResult`), and the actual signature is `read_bulk>(..., tag_addresses: &[S], ...) -> Vec` — generic over `AsRef` so callers can pass either `Vec` or `[&str]`. + +The drift is small but the design doc was the explicit subject of Client.Rust-017's resolution, so it warrants a follow-up. CLAUDE.md requires docs to change with the source. 
+ +**Recommendation:** Replace the five signatures in `RustClientDesign.md:96-100` with the ones actually in `session.rs`: + +```rust +pub async fn write_bulk(&self, server_handle: i32, entries: Vec<WriteBulkEntry>) -> Result<Vec<BulkWriteResult>, Error>; +pub async fn write2_bulk(&self, server_handle: i32, entries: Vec<Write2BulkEntry>) -> Result<Vec<BulkWriteResult>, Error>; +pub async fn write_secured_bulk(&self, server_handle: i32, entries: Vec<WriteSecuredBulkEntry>) -> Result<Vec<BulkWriteResult>, Error>; +pub async fn write_secured2_bulk(&self, server_handle: i32, entries: Vec<WriteSecured2BulkEntry>) -> Result<Vec<BulkWriteResult>, Error>; +pub async fn read_bulk<S: AsRef<str>>(&self, server_handle: i32, tag_addresses: &[S], timeout_ms: u32) -> Result<Vec<BulkReadResult>, Error>; +``` + +and add a one-line note that the per-entry fields (`user_id`, `timestamp_value`, `current_user_id`, `verifier_user_id`) live on the entry structs themselves. + +**Resolution:** 2026-05-20 — Replaced the five drifted signatures in `RustClientDesign.md` with the ones that actually live in `clients/rust/src/session.rs`: `write_bulk` / `write2_bulk` / `write_secured_bulk` / `write_secured2_bulk` take only `(server_handle, entries)`, and `read_bulk<S: AsRef<str>>` takes a borrowed `&[S]` and returns `Vec<BulkReadResult>` (not the return type the design doc previously listed). Added a follow-up paragraph noting that the per-entry fields `user_id` / `timestamp_value` / `current_user_id` / `verifier_user_id` live on `WriteBulkEntry` / `Write2BulkEntry` / `WriteSecuredBulkEntry` / `WriteSecured2BulkEntry` themselves rather than as trailing positional arguments, matching the protobuf shapes in `mxaccess_gateway.proto`, and that `read_bulk` is generic over `AsRef<str>` so callers can pass `&[String]` or `&[&str]` without cloning at the call site.
+ +### Client.Rust-020 + +| Field | Value | +|---|---| +| Severity | Low | +| Category | Documentation & comments | +| Location | `clients/rust/src/session.rs:31-46`; `clients/rust/src/lib.rs:14-39` | +| Status | Resolved | + +**Description:** Client.Rust-014's resolution promoted `next_correlation_id` from a module-private helper to a `pub` function so the `mxgw` CLI's raw-RPC paths can share the library's correlation-id discipline. The doc comment commits the library to a literal string format — `"rust-client-{label}-{N}"` — that external code can now depend on. Two concerns: + +1. The function is not re-exported at the crate root in `lib.rs` (it only ships through the `pub mod session` namespace), so the in-tree caller writes the long `mxgateway_client::session::next_correlation_id("cli-ping")` path. Either re-export it via `#[doc(inline)] pub use session::next_correlation_id;` or leave it where it is and add a short note in the doc — but the current state straddles "public API" and "lib-internal helper" without saying which. + +2. The doc comment does not declare a stability stance (no `#[doc(hidden)]`, no "experimental" note, no `__priv` naming). As written it promises the literal format `"rust-client-{label}-{N}"` to any out-of-tree consumer; a future change that renames the prefix (for example to drop the `rust-` after a multi-client reformat) would be a behavioural break. The `RustClientDesign.md` resolution of Client.Rust-017 ("`session::next_correlation_id` is `pub`") reads similarly — it does not say whether the format is stable. + +The combination — `pub`, format-committing doc, no stability note, no crate-root re-export — leaves the public surface ambiguous. The same review category (Documentation & comments) is where Client.Rust-014's CLI-side fix is now visible, so this is the natural place to clean it up. + +**Recommendation:** Pick one of: + +- Treat `next_correlation_id` as part of the SDK's public API. 
Re-export it from `lib.rs` (`#[doc(inline)] pub use session::next_correlation_id;`), rewrite the doc comment to *not* promise the literal `"rust-client-{label}-{N}"` format (just the property "monotonic, unique within a process, includes the supplied label"), and call that out in `RustClientDesign.md`. +- Treat it as internal-only. Mark it `#[doc(hidden)] pub` and add a ``// Internal helper exposed for the in-tree `mxgw` CLI; not part of the public SDK contract.`` comment so out-of-tree consumers do not build against a format that the SDK is free to change. + +The CLI integration in Client.Rust-014 works either way; this is solely about declaring intent so the SDK's public surface is unambiguous. + +**Resolution:** 2026-05-20 — Took the "treat as public SDK API" branch. Re-exported `next_correlation_id` at the crate root in `clients/rust/src/lib.rs` (`#[doc(inline)] pub use session::{next_correlation_id, Session};`) so in-tree and external callers can write the short `mxgateway_client::next_correlation_id(...)` path. Updated the in-tree `mxgw` CLI (`Ping` and `CloseSession` subcommands) to call through the crate-root re-export instead of `mxgateway_client::session::next_correlation_id`. Rewrote the doc comment to drop the format promise: the returned id is now documented as an opaque token with three guaranteed properties (embeds the supplied `label`, unique within a process via an internal monotonic atomic sequence, carries no embedded secret beyond `label`), and the doc explicitly states that the textual format `rust-client-{label}-{N}` is *not* part of the public contract and that callers must not parse it. Cross-referenced the crate-root re-export from the function-level doc. Updated `RustClientDesign.md` to call out that `next_correlation_id` is part of the public SDK surface, re-exported at the crate root, and that its textual format is intentionally not part of the contract.
diff --git a/code-reviews/Contracts/findings.md b/code-reviews/Contracts/findings.md index 52fa8f2..dff4c02 100644 --- a/code-reviews/Contracts/findings.md +++ b/code-reviews/Contracts/findings.md @@ -5,26 +5,26 @@ | Module | `src/MxGateway.Contracts` | | Reviewer | Claude Code | | Review date | 2026-05-20 | -| Commit reviewed | `1cd51bb` | +| Commit reviewed | `a020350` | | Status | Reviewed | | Open findings | 0 | ## Checklist coverage -This re-review focuses on the contract delta introduced since the prior review at `6c64030` — primarily the new bulk write/read command family added in `5e375f6` (`WriteBulk`, `Write2Bulk`, `WriteSecuredBulk`, `WriteSecured2Bulk`, `ReadBulk`) plus the resolution changes for Contracts-001/002/004/005/006/007/008. +This re-review covers the Contracts module at `a020350`, after Contracts-009 through Contracts-013 (plus Client.Rust-013's proto comment reformat on `ReadBulkCommand`) were resolved against `1cd51bb`. The Contracts source under review is unchanged from `1cd51bb` apart from the documentation-only updates introduced by `a020350`; the pass re-checks every category on the bulk write/read family, the alarm reply surface, and the GalaxyRepository contract that were the target of those resolutions. | # | Category | Result | |---|---|---| -| 1 | Correctness & logic bugs | New bulk command kinds, `BulkWriteResult`, and `BulkReadResult` align with the worker executor, validator (`MxAccessGrpcRequestValidator.ExpectedPayload`), and `MxAccessSession.ReadBulk`. Field numbering is contiguous and additive (10-43 on `MxCommand.payload`, 20-40 on `MxCommandReply.payload`); no collisions. No new functional bugs. | -| 2 | mxaccessgw conventions | Additive-only evolution preserved across all three protos; new wire-compatibility policy comment block (added under Contracts-005) is honored by the bulk additions; generated code untouched; naming and oneof usage are consistent with the style guide. No new violations. 
| -| 3 | Concurrency & thread safety | N/A — pure contract definitions plus a static const class with no shared mutable state. | -| 4 | Error handling & resilience | `BulkWriteResult` carries the full `was_successful` + `hresult` + `statuses` + `error_message` carriers per entry; `BulkReadResult` carries `was_successful` + `was_cached` + per-entry value and statuses. The asymmetry (no `hresult` on `BulkReadResult`) is intentional given ReadBulk's lifecycle. No issues. | -| 5 | Security | The new `WriteSecuredBulkCommand` / `WriteSecured2BulkCommand` carry the redaction note on the outer command only, not on the inner entry's `value` field (Contracts-011); otherwise no secrets forced into loggable shapes. | -| 6 | Performance & resource management | `ReadBulk` is the only command without a 1:1 MXAccess analogue; the per-entry timeout shape (`uint32 timeout_ms`) and `was_cached` semantics avoid disturbing existing subscriptions. No bloat issues. | -| 7 | Design-document adherence | `gateway.md` documents the bulk write/read families, but `docs/Contracts.md` was not updated for them (Contracts-009). This violates the CLAUDE.md "update docs in the same commit as the source" rule for the bulk-read/write addition. | -| 8 | Code organization & conventions | Package / namespace / file layout correct; additive-only contract evolution observed; field numbers continuous and isolated by 100+ from diagnostic/control commands. No new issues. | -| 9 | Testing coverage | The bulk write/read families have no `ProtobufContractRoundTripTests` coverage (Contracts-010); Galaxy Repository protos and `MxArray` raw paths are now covered (per Contracts-007 resolution). | -| 10 | Documentation & comments | `GalaxyAttribute.mx_data_type` lacks an in-proto comment explaining it is a raw Galaxy integer (Contracts-012); the `GatewayContractInfoTests` summary is now stale (Contracts-013); credential-sensitive bulk entry `value` fields lack per-field redaction comments (Contracts-011). 
| +| 1 | Correctness & logic bugs | Bulk command kinds, `BulkWriteResult`, and `BulkReadResult` align with the worker executor (`MxAccessSession.ReadBulk` / `ExecuteBulkWriteEntry`), the gateway server-side filter (`MxAccessGatewayService.ReplaceWriteBulkEntries`), the validator (`MxAccessGrpcRequestValidator.ExpectedPayload`, covering every new kind), and the round-trip tests added under Contracts-010. Field numbering across all three protos remains additive and contiguous — `MxCommand.payload` 10-43 + 100-104, `MxCommandReply.payload` 20-40 + 100-102, `MxCommandKind` 0-34 + 100-104, `WorkerEnvelope.body` 10-20 — with no number reused or repurposed. No new functional bugs. | +| 2 | mxaccessgw conventions | Wire-compatibility policy comment blocks (Contracts-005 resolution) are present at the top of all three `.proto` files and the bulk additions honour them — every change since the prior review is additive. Generated code under `Generated/` is untouched. Naming, `snake_case` field names, `PascalCase` messages, enum-prefix discipline, oneof usage for command/reply/value/event/envelope, and the credential-sensitivity comments per the ProtobufStyleGuide are all consistent. No new violations. | +| 3 | Concurrency & thread safety | N/A — pure contract definitions plus a static constants class (`GatewayContractInfo`) with no shared mutable state. | +| 4 | Error handling & resilience | `BulkWriteResult` carries `was_successful` + `optional int32 hresult` + `repeated MxStatusProxy statuses` + `error_message` per entry; `BulkReadResult` carries `was_successful` + `was_cached` + per-entry `value`/`quality`/`source_timestamp`/statuses/`error_message`. The deliberate absence of `hresult` on `BulkReadResult` is pinned by `ProtobufContractRoundTripTests.BulkReadReply_RoundTripsCachedAndSnapshotResults` (descriptor assertion) and matches the documented "ReadBulk outcomes are timeout / cache / lifecycle states, not MXAccess COM return codes" rationale. 
The `AcknowledgeAlarmReply.status` reservation comment (Contracts-008) and the by-name ack reuse comment (Contracts-002) keep ack outcome handling unambiguous. No new issues. | +| 5 | Security | The single-item and bulk `WriteSecured` / `WriteSecured2` paths now carry the credential-sensitivity comment on both the outer command (`WriteSecuredBulkCommand` / `WriteSecured2BulkCommand`) and each entry's `value` field (Contracts-011 resolution). `AuthenticateUserCommand.verify_user_password` carries the same redaction note. No new secret-leak surfaces. | +| 6 | Performance & resource management | `ReadBulk` is still the only command without a 1:1 MXAccess analogue; the per-tag `timeout_ms` cap and `was_cached` short-circuit prevent disturbing existing subscriptions. `BulkWriteReply` / `BulkReadReply` are flat repeated lists with no nested pagination machinery, matching the "one round-trip per batch" Bulk Command Family decision. No bloat issues. | +| 7 | Design-document adherence | `gateway.md`, `docs/Contracts.md` (Contracts-009 resolution), `docs/DesignDecisions.md` (Bulk Command Family), and `docs/AlarmClientDiscovery.md` (Contracts-002 / Contracts-008 resolutions) describe the contracts now in force. The `MX_COMMAND_KIND_WRITE2_BULK` / `MX_COMMAND_KIND_WRITE_SECURED2_BULK` enum-value names use the `2_BULK` suffix order while the public reply oneof case names use `Write2Bulk` / `WriteSecured2Bulk` (the `2` precedes `Bulk` in PascalCase); both match the corresponding command-message names — no design-doc divergence. The proto comment on `BulkWriteResult` describes a "gateway's tag-allowlist filter" that does not exist by that name in source or docs — see new finding Contracts-014. | +| 8 | Code organization & conventions | Package / namespace / file layout correct; `csharp_namespace` options remain consistent; the worker proto continues to import `mxaccess_gateway.proto` rather than duplicate the command/reply/event/value/status surface. 
Additive-only contract evolution observed; field numbers continuous and isolated by 100+ from diagnostic/control commands. No new issues. | +| 9 | Testing coverage | `ProtobufContractRoundTripTests` now exercises all five new bulk write/read commands, both new reply types (with `HasHresult == true` / `HasHresult == false` arms for the proto3 optional, and a descriptor-level assertion that `BulkReadResult` has no `hresult` field), every new `MxCommandReply.payload` oneof case (parameterised `[Theory]`), and the existing alarm / Galaxy / worker-envelope cases. `GatewayContractInfoTests` pins the `GatewayProtocolVersion = 3` constant for both the alarm and bulk write/read additions. No new gaps observed at the contracts surface. | +| 10 | Documentation & comments | The bulk additions all carry per-message documentation comments (`WriteBulkCommand`, `Write2BulkCommand`, `WriteSecuredBulkCommand`, `WriteSecured2BulkCommand`, `ReadBulkCommand`) and per-field credential-sensitivity comments on `WriteSecured*BulkEntry.value`. `GalaxyAttribute.mx_data_type` / `data_type_name` / `mx_attribute_category` / `security_classification` carry the parity-detail comments added under Contracts-012. Two residual gaps remain — the misleading "tag-allowlist filter" wording on `BulkWriteResult` (new finding Contracts-014), and the absence of a comment on `BulkReadResult.value` / `quality` / `source_timestamp` / `statuses` describing what they carry when `was_successful = false` (new finding Contracts-015). | ## Findings @@ -222,3 +222,33 @@ This re-review focuses on the contract delta introduced since the prior review a **Recommendation:** Reword the summary to describe what the test pins (the current `GatewayProtocolVersion` constant equals 3) rather than narrating a specific historical bump, OR explicitly enumerate the alarm- and bulk-write/read additions covered under version 3 so readers know both extensions were additive and intentionally did not require a bump. 
**Resolution:** _(2026-05-20)_ Reworded the XML summary on `GatewayContractInfoTests.GatewayProtocolVersion_IsVersionThree` to describe what the test actually pins: the current `GatewayProtocolVersion` constant equals 3, with both the alarm proto extension (`AcknowledgeAlarm` / `QueryActiveAlarms` RPCs, `OnAlarmTransitionEvent`, the alarm command/reply payload cases) AND the bulk write/read command family extension (`WriteBulk` / `Write2Bulk` / `WriteSecuredBulk` / `WriteSecured2Bulk` / `ReadBulk` with their `BulkWriteReply` / `BulkReadReply` payloads) shipping under version 3 as strictly additive changes that did not require a further bump. The new summary also instructs that a future breaking contract change should bump the constant and update the test in lock-step. Test logic is unchanged; the test still passes. + +### Contracts-014 + +| Field | Value | +|---|---| +| Severity | Low | +| Category | Documentation & comments | +| Location | `src/MxGateway.Contracts/Protos/mxaccess_gateway.proto:549-553` | +| Status | Resolved | + +**Description:** The `BulkWriteResult` header comment says `item_handle` mirrors the request entry "so callers can correlate inputs to outputs even when the gateway's tag-allowlist filter dropped some entries before reaching the worker." No "tag-allowlist filter" exists by that name anywhere in `src/`, `gateway.md`, `docs/`, or `docs/style-guides/` — a full-tree search returns matches only inside this proto comment and the prior-pass code-reviews. The real gateway-side bulk-write filter is `MxAccessGatewayService` calling `IConstraintEnforcer.CheckWriteHandleAsync` per entry (see `src/MxGateway.Server/Grpc/MxAccessGatewayService.cs:565-585` and `src/MxGateway.Server/Security/Authorization/IConstraintEnforcer.cs`); failures populate a synthetic `BulkWriteResult` with `was_successful = false` and the constraint's `ErrorMessage` is recorded via `constraintEnforcer.RecordDenialAsync`. 
The mechanism is a per-API-key constraint enforcer that can reject by handle (not a "tag" list), and the failure path covers any `ConstraintFailure` reason (write-handle scope, audit policy, etc.) — not a single inclusive tag allowlist. A future reader of the proto will search for "tag-allowlist" and find nothing, or worse, build a non-existent feature against the misleading name. The contract concept the comment is trying to communicate (item-level correlation matters because the gateway can drop entries before the worker sees them) is correct and worth keeping. + +**Recommendation:** Reword the `BulkWriteResult` header comment to identify the actual mechanism — for example: "...so callers can correlate inputs to outputs even when the gateway's per-entry `IConstraintEnforcer.CheckWriteHandleAsync` filter (see `docs/Authorization.md`) dropped some entries before reaching the worker." Comment-only change with no wire-format impact. + +**Resolution:** _(2026-05-20)_ Reworded the `BulkWriteResult` header comment in `mxaccess_gateway.proto` to identify the real gateway-side per-entry filter — `IConstraintEnforcer.CheckWriteHandleAsync` invoked by `MxAccessGatewayService.ReplaceWriteBulkEntries` — and cross-referenced `docs/Authorization.md` for the rationale. The contract concept (item-level correlation matters because the gateway can drop entries before the worker sees them) is preserved; the misleading "tag-allowlist filter" name is removed so future readers will not search for or build against a non-existent feature. The "Per-item failures populate `error_message` + `hresult` and never raise" sentence is retained verbatim. Comment-only change; `dotnet build src/MxGateway.Contracts/MxGateway.Contracts.csproj` succeeded with 0 warnings / 0 errors on both `net48` and `net10.0`. 
+ +### Contracts-015 + +| Field | Value | +|---|---| +| Severity | Low | +| Category | Documentation & comments | +| Location | `src/MxGateway.Contracts/Protos/mxaccess_gateway.proto:571-582` | +| Status | Resolved | + +**Description:** `BulkReadResult` carries seven payload-bearing fields beyond the carrier flags — `value`, `quality`, `source_timestamp`, `statuses`, `error_message`, plus `item_handle` and `tag_address` — and the header comment only documents the `was_cached` arm. There is no in-proto statement of which fields carry data on `was_successful = true` versus `was_successful = false`. Cross-checked against the worker: `MxAccessSession.FailedRead` (lines 940-956) populates only `ServerHandle`, `TagAddress`, `ItemHandle`, `WasSuccessful = false`, `WasCached`, and `ErrorMessage` — `value`, `quality`, `source_timestamp`, and `statuses` are all left at their proto3 defaults (null / 0 / null / empty). `SucceededRead` populates the value/quality/source_timestamp/statuses from the cached or snapshotted `OnDataChange`. A client reading `BulkReadResult` from the proto alone has no way to know that `value == null` and `quality == 0` on failure are deliberate "absent" markers rather than "value is null with quality bad" data — both interpretations are wire-equivalent. `BulkWriteResult` has the same shape gap for `statuses` / `hresult` on failed entries, but its header comment at least says "Per-item failures populate `error_message` + `hresult` and never raise"; `BulkReadResult` has no equivalent statement. + +**Recommendation:** Extend the `BulkReadResult` header comment (or add per-field comments on `value` / `quality` / `source_timestamp` / `statuses` / `error_message`) to state explicitly which fields are populated on success and which are left at their proto3 defaults on failure — e.g.
"On `was_successful = false`, only `server_handle`, `tag_address`, `item_handle` (when allocated), `was_cached`, and `error_message` are populated; `value`, `quality`, `source_timestamp`, and `statuses` are left at their proto3 defaults and must not be read as data." Comment-only change with no wire-format impact. + +**Resolution:** _(2026-05-20)_ Extended the `BulkReadResult` header comment in `mxaccess_gateway.proto` with explicit per-arm documentation, mirroring the level of detail the existing `BulkWriteResult` header carries. On `was_successful = true` the comment now states `value` / `quality` / `source_timestamp` / `statuses` carry the read data (from the cached subscription or the snapshot lifecycle, depending on `was_cached`) and `error_message` is empty. On `was_successful = false` the comment lists exactly which fields are populated (`server_handle`, `tag_address`, `item_handle` when allocated, `was_cached`, `error_message`) and warns that `value` / `quality` / `source_timestamp` / `statuses` are left at their proto3 defaults and must not be read as data — explicitly noting they are wire-indistinguishable from "value is null with quality bad" data so a future reader cannot make that mistake. The comment also pins the deliberate absence of an `hresult` field on `BulkReadResult` (cross-referencing `docs/DesignDecisions.md` "Bulk Command Family" for the rationale) and the "Per-tag failures populate `error_message` and never raise" semantic that parallels `BulkWriteResult`. Comment-only change; `dotnet build src/MxGateway.Contracts/MxGateway.Contracts.csproj` succeeded with 0 warnings / 0 errors on both `net48` and `net10.0`. 
diff --git a/code-reviews/IntegrationTests/findings.md b/code-reviews/IntegrationTests/findings.md index 25d7029..c0daba3 100644 --- a/code-reviews/IntegrationTests/findings.md +++ b/code-reviews/IntegrationTests/findings.md @@ -5,7 +5,7 @@ | Module | `src/MxGateway.IntegrationTests` | | Reviewer | Claude Code | | Review date | 2026-05-20 | -| Commit reviewed | `1cd51bb` | +| Commit reviewed | `a020350` | | Status | Reviewed | | Open findings | 0 | @@ -14,6 +14,21 @@ A comprehensive review completes every category, recording "No issues found" where a category produced nothing rather than leaving it blank. +### 2026-05-20 re-review (commit `a020350`) + +| # | Category | Result | +|---|---|---| +| 1 | Correctness & logic bugs | Issues found: IntegrationTests-017 (teardown-parity test's "no further OnDataChange after UnAdvise" assertion races against in-flight events the provider already published); IntegrationTests-020 (abnormal-exit test's fault-classification keyword list accepts the substring `"worker"`, which matches almost any plausible fault message and dilutes the check to "the description is non-empty"). | +| 2 | mxaccessgw conventions | No issues found. The five new tests honor live opt-in gating, `[Collection]` serialization, "no synthesized events", and the credential-redaction contract for the assertions they make. | +| 3 | Concurrency & thread safety | No issues found. `GatewaySession.State`/`FinalFault` access in the abnormal-exit poll loop goes through `_syncRoot`; `RecordingServerStreamWriter.Messages` returns a locked snapshot copy. | +| 4 | Error handling & resilience | No issues found. `ShutDownAsync`'s opt-in `propagateStreamFaults` correctly threads silent stream-task faults into the Write parity test without re-masking the IntegrationTests-004 path. 
| +| 5 | Security | Issue found: IntegrationTests-019 (WriteSecured live test asserts the password is absent from `DiagnosticMessage` only; it does not assert the credential is absent from the accumulated test output, where the worker `stderr`/`stdout` and the gateway log are echoed). | +| 6 | Performance & resource management | No issues found. All six `RecordingServerStreamWriter` instantiations use `using` declarations; `using CancellationTokenSource` is the consistent pattern. | +| 7 | Design-document adherence | No issues found. `docs/GatewayTesting.md` documents all five new parity surfaces and the two new env-var defaults (`MXGATEWAY_LIVE_MXACCESS_WRITE_SECURED_USER`/`_PASSWORD`). | +| 8 | Code organization & conventions | Issue found: IntegrationTests-018 (`GatewayServiceFixture.TryGetSession` declares `out GatewaySession session` non-nullable while the caller binds it as `out GatewaySession? session`; the null-forgiving operator inside `SessionRegistry.TryGet` propagates a misleading non-null annotation). | +| 9 | Testing coverage | Issue found: IntegrationTests-021 (abnormal-exit test does not assert the active `StreamEvents` task observed the worker fault; relies entirely on the session-state poll and would silently pass if `MarkFaulted` were ever moved off the stream-consumption path). | +| 10 | Documentation & comments | No issues found. Test XML comments now match what each assertion verifies (the IntegrationTests-011 fix is intact across both the Write and invalid-handle cases). | + ### 2026-05-20 review (commit `1cd51bb`) | # | Category | Result | @@ -287,3 +302,94 @@ a category produced nothing rather than leaving it blank. **Recommendation:** Expose the production default through a `public const string` on `GalaxyRepositoryOptions` (e.g. `DefaultConnectionString`) and have `LiveGalaxyRepositoryFactAttribute.ConnectionString` read `Environment.GetEnvironmentVariable(ConnectionStringVariableName) ?? GalaxyRepositoryOptions.DefaultConnectionString`. 
Single source of truth, build-time guarantee they cannot drift. **Resolution:** 2026-05-20 — Added `public const string GalaxyRepositoryOptions.DefaultConnectionString` carrying the production default, set the `ConnectionString` initializer to reference it, and changed `LiveGalaxyRepositoryFactAttribute.ConnectionString` to fall back to `GalaxyRepositoryOptions.DefaultConnectionString`. The literal now lives in exactly one place and any future change to the production default propagates to the live-test fallback at compile time. + +### IntegrationTests-017 + +| Field | Value | +|---|---| +| Severity | Medium | +| Category | Correctness & logic bugs | +| Location | `src/MxGateway.IntegrationTests/WorkerLiveMxAccessSmokeTests.cs:350-407` | +| Status | Resolved | + +**Description:** `GatewaySession_WithLiveWorker_UnadviseRemoveItemUnregister_TeardownOrderingParity` proves the subscription is live by waiting for one matching `OnDataChange`, snapshots `dataChangeCountBeforeUnadvise`, then sends `UnAdvise`, waits 500 ms, snapshots `dataChangeCountAfterTeardown`, and asserts strict equality. The assertion races against the natural cadence of the live MXAccess provider: + +1. After `WaitForMessageAsync` returns the first match, any additional `OnDataChange` for the same `(serverHandle, itemHandle)` published by the provider before the worker processes `UnAdvise` is delivered into the recording writer. +2. The snapshot at line 362 is taken *immediately before* the `UnAdvise` command is sent (line 370). Events that arrive in the window between that snapshot and the worker processing `UnAdvise` (network round-trip + STA dispatch + worker pipe send + gateway channel write) are racing in — they are not "after UnAdvise" but they will be in the post-teardown snapshot. +3. `MXAccess` providers can publish `OnDataChange` at sub-second cadence; the strict-equality assertion has no slack for in-flight events. 
+ +The test passes today only because the chosen test item (`TestChildObject.TestInt`) likely changes value rarely. Against a more active item — or on a slower machine where the round-trip widens — the assertion would flap. The intent ("no further events *after the worker stops the subscription*") would be better expressed by capturing the snapshot after `UnAdvise` returns `Ok` rather than before it is issued. + +**Recommendation:** Move the "before" snapshot to immediately *after* `UnAdvise` returns `Ok` (the point past which the parity rule applies), or weaken the assertion to "no events with a `WorkerSequence` strictly greater than the last sequence observed within `dataChangeCountBeforeUnadvise + N` events arrived in the post-teardown drain" where `N` accounts for the documented in-flight window. Either change moves the test from racing on provider cadence to verifying the actual parity rule. + +**Resolution:** 2026-05-20 — Removed the `dataChangeCountBeforeUnadvise` snapshot taken just before the `UnAdvise` command (the source of the race) and replaced the strict-equality assertion against a pre-teardown count with a two-window stability check taken *after* the teardown chain completes. The test now waits one 500 ms settle window for in-flight `OnDataChange` events (which the provider already published before the worker acknowledged `UnAdvise`) to drain, captures `dataChangeCountAfterFirstSettle`, waits another 500 ms, and asserts the count is unchanged in `dataChangeCountAfterSecondSettle`. The parity rule under test ("no further `OnDataChange` after the worker stops the subscription") is now expressed as steadiness across the post-teardown window rather than equality with a count snapshotted during the round-trip race, so a slower machine or a more active test item no longer flaps the assertion while a genuine regression (a stale subscription continuing to fire) still surfaces as a count drift between the two settles. 
+ +### IntegrationTests-018 + +| Field | Value | +|---|---| +| Severity | Low | +| Category | Code organization & conventions | +| Location | `src/MxGateway.IntegrationTests/WorkerLiveMxAccessSmokeTests.cs:1037`, `src/MxGateway.IntegrationTests/WorkerLiveMxAccessSmokeTests.cs:595` | +| Status | Resolved | + +**Description:** `GatewayServiceFixture.TryGetSession(string sessionId, out GatewaySession session)` declares `session` as a non-nullable `GatewaySession`, but its implementation is `_registry.TryGet(sessionId, out session)`, which (in `SessionRegistry.cs:43`) uses `session!` to silence the nullability warning on `Dictionary.TryGetValue`. On a `false` return the `out` parameter is null, contradicting the non-nullable annotation. The caller at line 595 binds it as `out GatewaySession? session`, which compiles only because non-nullable-to-nullable variance is permitted — but no callsite tooling will warn that a `false` return yields a null value through what the fixture's contract describes as non-nullable. The repo enforces `Nullable=enable;TreatWarningsAsErrors=true` (`src/Directory.Build.props`), so the convention is for `TryX` patterns to either annotate the out as `T?` or to mirror BCL `Dictionary.TryGetValue` (which uses `[MaybeNullWhen(false)] out TValue`). + +**Recommendation:** Change the signature to `public bool TryGetSession(string sessionId, [MaybeNullWhen(false)] out GatewaySession session)` (mirroring BCL `TryGetValue`) and propagate the same annotation down through `ISessionRegistry.TryGet` / `SessionRegistry.TryGet` so the `session!` fudge can be removed. The call sites already treat the parameter as nullable; aligning the declaration removes the silent contract gap. + +**Resolution:** 2026-05-20 — Propagated `[MaybeNullWhen(false)]` through the entire `TryGet*` chain. 
`GatewayServiceFixture.TryGetSession`, `ISessionManager.TryGetSession` / `SessionManager.TryGetSession`, and `ISessionRegistry.TryGet`/`TryRemove` plus their `SessionRegistry` implementations now carry the BCL `Dictionary.TryGetValue`-style annotation, and the `session!` null-forgiving operator inside `SessionRegistry.TryGet` / `TryRemove` was removed because the annotation makes it redundant. Existing in-tree callers (`SessionManagerTests.cs` line 28) were updated to `out GatewaySession?` to match. The compiler now warns at callsites that read `session` without checking the boolean return, closing the silent contract gap. Verified by `dotnet build src/MxGateway.IntegrationTests/...` (0 warnings), `dotnet build src/MxGateway.Tests/...` (0 warnings), and `dotnet test src/MxGateway.Tests/...` (479 passed). + +### IntegrationTests-019 + +| Field | Value | +|---|---| +| Severity | Medium | +| Category | Security | +| Location | `src/MxGateway.IntegrationTests/WorkerLiveMxAccessSmokeTests.cs:497-534` | +| Status | Resolved | + +**Description:** `GatewaySession_WithLiveWorker_WriteSecured_AuthenticatedRoundTripParity` resolves a credential pair via `ResolveLiveMxAccessSecuredCredentials`, passes the password into `AuthenticateUser` and `WriteSecured`, and asserts the password is absent from `writeSecuredReply.DiagnosticMessage`. CLAUDE.md's secret-handling rule is broader: "API keys, passwords, `WriteSecured` payloads, and `AuthenticateUser` credentials must never reach logs." The test's assertion covers only one of the surfaces the rule protects: + +- The `TestOutputLoggerProvider` writes every gateway-side `ILogger` entry to `ITestOutputHelper`. A regression that logged the request body (or the `WorkerCommandRequest` envelope) would put the password into test output without failing this test. +- `WriteWorkerOutput` echoes worker `stdout`/`stderr` lines to `ITestOutputHelper`. A worker-side regression that printed the credential (e.g. 
a debug log added to `MxAccessCommandExecutor`) would land in test output without failing this test. +- `output.WriteLine(...)` calls in the test body (`AuthenticateUser status=... user_id=...` and `LogReply("WriteSecured", ...)`) currently don't include the request body, but a future maintenance change that printed `command.WriteSecured.Value` or a similar struct dump would silently leak the credential past the existing assertion. + +Because `ITestOutputHelper` doesn't expose its accumulated text to the test, the assertion can only be made by buffering output through a recording sink the test owns. + +**Recommendation:** Replace the bare `ITestOutputHelper` injection (just for the WriteSecured test, or for all live MXAccess tests) with a recording wrapper that mirrors writes both to the xUnit output and to a `StringBuilder`. At the end of the test, assert the buffer does not contain `verifyPassword`. This makes the credential-redaction contract a property of the entire test run, not just the one explicit field. Alternative: route the test through `GatewayLogRedactor` (`src/MxGateway.Server/Diagnostics/GatewayLogRedactor.cs`) so the credential-bearing commands are redacted at the logger sink the test sees. + +**Resolution:** 2026-05-20 — Added a `RecordingTestOutputHelper` private class that implements `ITestOutputHelper`, mirrors every line to the wrapped xUnit sink, and accumulates the same text into a `StringBuilder` exposed via a `Captured` property. The WriteSecured parity test now constructs this wrapper, passes it to both `TestWorkerProcessFactory` (so worker `stdout`/`stderr` lines flow through it) and `GatewayServiceFixture` (so the `TestOutputLoggerProvider`'s gateway-`ILogger` entries flow through it), and uses it for every direct `WriteLine`. A new `LogReplyTo(ITestOutputHelper sink, …)` static helper underpins the existing `LogReply` instance method so the test body can route reply logging through the recording wrapper. 
After the cleanup `finally` block completes, the test asserts `recordedOutput.Captured` does not contain the verify password. The credential-redaction contract is now enforced across the gateway-logger sink, worker stdout/stderr echo, and every test-body `WriteLine` — a future regression that dumped the request body, the `WorkerCommandRequest` envelope, or the `WriteSecured` payload would land in the buffer and fail the assertion. + +### IntegrationTests-020 + +| Field | Value | +|---|---| +| Severity | Low | +| Category | Correctness & logic bugs | +| Location | `src/MxGateway.IntegrationTests/WorkerLiveMxAccessSmokeTests.cs:616-622` | +| Status | Resolved | + +**Description:** `GatewaySession_WithLiveWorker_AbnormalWorkerExit_MarksSessionFaulted` asserts the `FinalFault` description contains at least one of these substrings (case-insensitive): `disconnect`, `pipe`, `heartbeat`, `worker`, `end of stream`. The intent (per the IntegrationTests-014 resolution prose) is to verify the gateway surfaces "a known worker-client classification". The `"worker"` substring defeats that intent — the gateway routes through `SetFaulted` with messages like *"Worker pipe disconnected."*, *"Worker shutdown timed out."*, *"Worker was killed by the gateway: …"*, *"Worker heartbeat expired. …"*, *"Worker event channel rejected an event."*, *"Worker pipe write failed."*, *"Worker read loop failed."*, *"Worker sent unexpected envelope body …"* — every classification message begins with the word "Worker". A regression that introduced an entirely new fault path with a generic message containing the word *Worker* would still pass this test. + +CLAUDE.md singles out "abnormal exit" as one of the parity surfaces (`SessionState.Faulted` with an actionable cause), so the test's documented value is verifying *which* of the WorkerClient error codes drove the transition. Today the assertion is effectively `Assert.NotEmpty(observedFault)`. 
+ +**Recommendation:** Tighten the keyword set to the specific classifications the abnormal-exit (kill-the-process) path actually drives — `PipeDisconnected` ("pipe", "disconnect") and `EndOfStream` ("end of stream"). Drop the broad `"worker"` term, and drop `"heartbeat"` unless the test deliberately covers the heartbeat path too (it does not — `HeartbeatGraceSeconds = 15` and the poll deadline is `StreamShutdownTimeout = 10` seconds, so a heartbeat-expired transition is impossible inside the wait window). If a more exhaustive matrix is wanted, assert `FinalFault.StartsWith("Worker pipe disconnected")` against the message constant in `WorkerClient.cs:380` so a rename surfaces as a compile-time / test-time failure. + +**Resolution:** 2026-05-20 — Tightened the keyword set to the specific classifications the kill-the-process path actually drives. The assertion now requires the `FinalFault` description to contain `"pipe disconnected"` (matching the `WorkerClient.cs:378-381` `WorkerFrameProtocolErrorCode.EndOfStream` → `WorkerClientErrorCode.PipeDisconnected` → `"Worker pipe disconnected."` message) or `"end of stream"`, dropping the broad `"worker"` term that previously matched every `WorkerClient` fault message (all of which begin with "Worker"), and dropping `"heartbeat"` because the test's `StreamShutdownTimeout` (10 s) is below `HeartbeatGraceSeconds` (15 s) so a heartbeat-expired transition cannot occur inside the poll window. A regression that routed an unrelated fault classification through the abnormal-exit path would now fail loudly instead of silently passing. + +### IntegrationTests-021 + +| Field | Value | +|---|---| +| Severity | Low | +| Category | Testing coverage | +| Location | `src/MxGateway.IntegrationTests/WorkerLiveMxAccessSmokeTests.cs:579-622` | +| Status | Resolved | + +**Description:** The abnormal-exit test only polls `session.State` and `session.FinalFault`. It does not assert anything about `streamTask` after the kill. 
The chain that puts the session into `Faulted` is: the read loop hits EOS → `SetFaulted(PipeDisconnected, …)` → `_events.Writer.TryComplete(fault)` → `ReadEventsAsync` propagates the `WorkerClientException` → `EventStreamService.ProduceEventsAsync`'s `catch (Exception exception) when exception is WorkerClientException` calls `session.MarkFaulted(exception.Message)`. The test verifies the *end state* of that chain but not that the `StreamEvents` call is what produced the transition. If a future change moved the `MarkFaulted` call somewhere else (a session-manager background watcher, for example), the test would still pass — but the stream task could now silently swallow the fault. A direct assertion that `streamTask.IsFaulted` is true (or that awaiting it throws a `WorkerClientException`) would protect that contract. + +The Write parity test (IntegrationTests-012's resolution) added the complementary healthy-path assertion (`Assert.False(streamTask.IsFaulted, …)`). The abnormal-exit test should add the inverse: the stream task *must* be faulted (or at least completed with a `WorkerClientException`) after the kill. + +**Recommendation:** After the session-state poll succeeds, assert `streamTask.IsCompleted` (the channel has terminated) and inspect `streamTask.Exception?.InnerException` for a `WorkerClientException` (or assert `streamTask.IsFaulted` and await with `ShouldThrowAsync`). This couples the test to the actual fault-propagation path and prevents a future refactor that bypasses the stream from quietly weakening the coverage. Compare to the existing `Assert.False(streamTask.IsFaulted, …)` on line 217 — the abnormal-exit case wants the opposite assertion. + +**Resolution:** 2026-05-20 — After the session-state poll loop confirms `SessionState.Faulted`, the test now awaits `streamTask.WaitAsync(StreamShutdownTimeout)` (with a try/catch that logs the surfaced exception type/message), then asserts `streamTask.IsCompleted` and `streamTask.IsFaulted`.
This couples the test to the actual fault-propagation chain — read loop hits EndOfStream → `WorkerClient.SetFaulted(PipeDisconnected, …)` → `ReadEventsAsync` propagates the fault → `EventStreamService` calls `session.MarkFaulted` → `MxAccessGatewayService.StreamEvents` re-throws the mapped `RpcException`. A future refactor that moved `MarkFaulted` off the stream-consumption path would leave `streamTask` completing cleanly, which the new `IsFaulted` assertion would now catch (inverse of the existing `Assert.False(streamTask.IsFaulted, …)` in the Write parity test on line 217). The inner-exception type assertion was deliberately omitted because the gateway maps `WorkerClientException` to `RpcException` at the public boundary (`MxAccessGatewayService.MapWorkerClientException`); asserting on the surface type alone would be brittle, while the `IsFaulted` check directly tests the contract the recommendation is protecting. diff --git a/code-reviews/README.md b/code-reviews/README.md index 2c70efa..4b89cb8 100644 --- a/code-reviews/README.md +++ b/code-reviews/README.md @@ -10,17 +10,17 @@ Each module's `findings.md` is the source of truth; this file is generated from | Module | Reviewer | Date | Commit | Status | Open | Total | |---|---|---|---|---|---|---| -| [Client.Dotnet](Client.Dotnet/findings.md) | Claude Code | 2026-05-20 | `1cd51bb` | Reviewed | 0 | 14 | -| [Client.Go](Client.Go/findings.md) | Claude Code | 2026-05-20 | `1cd51bb` | Reviewed | 0 | 16 | -| [Client.Java](Client.Java/findings.md) | Claude Code | 2026-05-20 | `1cd51bb` | Reviewed | 0 | 20 | -| [Client.Python](Client.Python/findings.md) | Claude Code | 2026-05-20 | `1cd51bb` | Reviewed | 0 | 17 | -| [Client.Rust](Client.Rust/findings.md) | Claude Code | 2026-05-20 | `1cd51bb` | Reviewed | 0 | 17 | -| [Contracts](Contracts/findings.md) | Claude Code | 2026-05-20 | `1cd51bb` | Reviewed | 0 | 13 | -| [IntegrationTests](IntegrationTests/findings.md) | Claude Code | 2026-05-20 | `1cd51bb` | Reviewed | 0 | 16 
| -| [Server](Server/findings.md) | Claude Code | 2026-05-20 | `1cd51bb` | Reviewed | 0 | 22 | -| [Tests](Tests/findings.md) | Claude Code | 2026-05-20 | `1cd51bb` | Reviewed | 0 | 19 | -| [Worker](Worker/findings.md) | Claude Code | 2026-05-20 | `1cd51bb` | Reviewed | 0 | 22 | -| [Worker.Tests](Worker.Tests/findings.md) | Claude Code | 2026-05-20 | `1cd51bb` | Reviewed | 0 | 24 | +| [Client.Dotnet](Client.Dotnet/findings.md) | Claude Code | 2026-05-20 | `a020350` | Reviewed | 0 | 16 | +| [Client.Go](Client.Go/findings.md) | Claude Code | 2026-05-20 | `a020350` | Reviewed | 0 | 21 | +| [Client.Java](Client.Java/findings.md) | Claude Code | 2026-05-20 | `a020350` | Reviewed | 0 | 26 | +| [Client.Python](Client.Python/findings.md) | Claude Code | 2026-05-20 | `a020350` | Reviewed | 0 | 21 | +| [Client.Rust](Client.Rust/findings.md) | Claude Code | 2026-05-20 | `a020350` | Reviewed | 0 | 20 | +| [Contracts](Contracts/findings.md) | Claude Code | 2026-05-20 | `a020350` | Reviewed | 0 | 15 | +| [IntegrationTests](IntegrationTests/findings.md) | Claude Code | 2026-05-20 | `a020350` | Reviewed | 0 | 21 | +| [Server](Server/findings.md) | Claude Code | 2026-05-20 | `a020350` | Reviewed | 0 | 29 | +| [Tests](Tests/findings.md) | Claude Code | 2026-05-20 | `a020350` | Reviewed | 0 | 24 | +| [Worker](Worker/findings.md) | Claude Code | 2026-05-20 | `a020350` | Reviewed | 0 | 25 | +| [Worker.Tests](Worker.Tests/findings.md) | Claude Code | 2026-05-20 | `a020350` | Reviewed | 0 | 30 | ## Pending findings @@ -37,6 +37,7 @@ Findings with status `Resolved`, `Won't Fix`, or `Deferred`. 
| Server-001 | Critical | Resolved | Security | `src/MxGateway.Server/GatewayApplication.cs:147-149`, `src/MxGateway.Server/Dashboard/DashboardEndpointRouteBuilderExtensions.cs:55-58`, `src/MxGateway.Server/Dashboard/Components/Routes.razor:1-15` | | Client.Go-001 | High | Resolved | Correctness & logic bugs | `clients/go/mxgateway/errors.go:88-93`, `clients/go/mxgateway/errors.go:117-128` | | Client.Java-013 | High | Resolved | Testing coverage | `clients/java/mxgateway-cli/src/test/java/com/dohertylan/mxgateway/cli/MxGatewayCliTests.java:212-304`, `clients/java/mxgateway-cli/src/main/java/com/dohertylan/mxgateway/cli/MxGatewayCli.java:1214-1244` | +| Client.Python-018 | High | Resolved | Code organization & conventions | `clients/python/pyproject.toml:11` | | Client.Rust-001 | High | Resolved | mxaccessgw conventions | `clients/rust/src/options.rs:98,143` | | Client.Rust-002 | High | Resolved | mxaccessgw conventions | `clients/rust/src/session.rs:522` | | Client.Rust-003 | High | Resolved | Correctness & logic bugs | `clients/rust/crates/mxgw-cli/src/main.rs:1051` | @@ -65,6 +66,7 @@ Findings with status `Resolved`, `Won't Fix`, or `Deferred`. 
| Client.Java-005 | Medium | Resolved | Error handling & resilience | `clients/java/mxgateway-client/src/main/java/com/dohertylan/mxgateway/client/MxGatewaySession.java:92-105` | | Client.Java-014 | Medium | Resolved | Concurrency & thread safety | `clients/java/mxgateway-client/src/main/java/com/dohertylan/mxgateway/client/MxEventStream.java:59-65,117-124` | | Client.Java-015 | Medium | Resolved | Concurrency & thread safety | `clients/java/mxgateway-client/src/main/java/com/dohertylan/mxgateway/client/MxGatewayChannels.java:112-138`, `MxGatewayClient.java:183-191,224-232,322-329`, `GalaxyRepositoryClient.java:164-170,212-214` | +| Client.Java-021 | Medium | Resolved | Concurrency & thread safety | `clients/java/mxgateway-client/src/main/java/com/dohertylan/mxgateway/client/DeployEventStream.java:96-135` | | Client.Python-003 | Medium | Resolved | Error handling & resilience | `clients/python/src/mxgateway/client.py:125-137,155-173` | | Client.Python-005 | Medium | Resolved | Performance & resource management | `clients/python/src/mxgateway/galaxy.py:117-140` | | Client.Python-009 | Medium | Resolved | Testing coverage | `clients/python/tests/` | @@ -73,6 +75,7 @@ Findings with status `Resolved`, `Won't Fix`, or `Deferred`. 
| Client.Rust-006 | Medium | Resolved | Error handling & resilience | `clients/rust/src/session.rs:531-555` | | Client.Rust-015 | Medium | Resolved | Error handling & resilience | `clients/rust/crates/mxgw-cli/src/main.rs:1053-1070` | | Client.Rust-016 | Medium | Resolved | Testing coverage | `clients/rust/tests/client_behavior.rs`, `clients/rust/src/session.rs:489-519,654-768` | +| Client.Rust-018 | Medium | Resolved | Error handling & resilience | `clients/rust/crates/mxgw-cli/src/main.rs:1098-1170`; `scripts/bench-read-bulk.ps1:347-365`; siblings: `clients/go/cmd/mxgw-go/main.go:600-648`, `clients/python/src/mxgateway_cli/commands.py:614-662`, `clients/dotnet/MxGateway.Client.Cli/MxGatewayClientCli.cs:685-770`, `clients/java/mxgateway-cli/src/main/java/com/dohertylan/mxgateway/cli/MxGatewayCli.java:855-940` | | Contracts-002 | Medium | Resolved | Error handling & resilience | `src/MxGateway.Contracts/Protos/mxaccess_gateway.proto:384-385`, `:95` | | Contracts-009 | Medium | Resolved | Design-document adherence | `docs/Contracts.md:13-24` | | IntegrationTests-003 | Medium | Resolved | Correctness & logic bugs | `src/MxGateway.IntegrationTests/WorkerLiveMxAccessSmokeTests.cs:89-97` | @@ -81,6 +84,8 @@ Findings with status `Resolved`, `Won't Fix`, or `Deferred`. 
| IntegrationTests-006 | Medium | Resolved | Testing coverage | `src/MxGateway.IntegrationTests/DashboardLdapLiveTests.cs` | | IntegrationTests-012 | Medium | Resolved | Correctness & logic bugs | `src/MxGateway.IntegrationTests/WorkerLiveMxAccessSmokeTests.cs:147-151` | | IntegrationTests-014 | Medium | Resolved | Testing coverage | `src/MxGateway.IntegrationTests/WorkerLiveMxAccessSmokeTests.cs` | +| IntegrationTests-017 | Medium | Resolved | Correctness & logic bugs | `src/MxGateway.IntegrationTests/WorkerLiveMxAccessSmokeTests.cs:350-407` | +| IntegrationTests-019 | Medium | Resolved | Security | `src/MxGateway.IntegrationTests/WorkerLiveMxAccessSmokeTests.cs:497-534` | | Server-002 | Medium | Resolved | Design-document adherence | `src/MxGateway.Server/Program.cs:24`, `src/MxGateway.Server/GatewayApplication.cs` | | Server-004 | Medium | Resolved | Code organization & conventions | `src/MxGateway.Server/Security/Authentication/ApiKeyAdminCommandLineParser.cs:227-233`, `src/MxGateway.Server/Security/Authentication/ApiKeyAdminCliRunner.cs:53-77`, `src/MxGateway.Server/Dashboard/DashboardApiKeyManagementService.cs:21-67` | | Server-005 | Medium | Resolved | Error handling & resilience | `src/MxGateway.Server/Galaxy/GalaxyHierarchyRefreshService.cs:22-28`, `src/MxGateway.Server/Galaxy/GalaxyHierarchyCache.cs:184` | @@ -94,6 +99,7 @@ Findings with status `Resolved`, `Won't Fix`, or `Deferred`. 
| Tests-006 | Medium | Resolved | Concurrency & thread safety | `src/MxGateway.Tests/Gateway/Workers/WorkerClientTests.cs:76`, `src/MxGateway.Tests/Gateway/Workers/FakeWorkerHarnessTests.cs:122` | | Tests-013 | Medium | Resolved | Testing coverage | `src/MxGateway.Server/Sessions/GatewaySession.cs:449-679`, `src/MxGateway.Tests/Gateway/Sessions/SessionManagerTests.cs` | | Tests-016 | Medium | Resolved | Testing coverage | `src/MxGateway.Tests/Galaxy/GalaxyHierarchyCacheTests.cs:29-41,115-124` | +| Tests-020 | Medium | Resolved | Testing coverage | `src/MxGateway.Tests/Gateway/Grpc/MxAccessGatewayServiceConstraintTests.cs:275-347`, `src/MxGateway.Server/Grpc/MxAccessGatewayService.cs:803-829` | | Worker-004 | Medium | Resolved | Correctness & logic bugs | `src/MxGateway.Worker/Ipc/WorkerPipeSession.cs:565-588` | | Worker-005 | Medium | Resolved | Error handling & resilience | `src/MxGateway.Worker/MxAccess/MxAccessStaSession.cs:205-258` (production alarm poll loop) | | Worker-006 | Medium | Resolved | Correctness & logic bugs | `src/MxGateway.Worker/Ipc/WorkerPipeSession.cs:117-124`, `src/MxGateway.Worker/MxAccess/MxAccessStaSession.cs:386-491` | @@ -101,6 +107,7 @@ Findings with status `Resolved`, `Won't Fix`, or `Deferred`. 
| Worker-008 | Medium | Resolved | Concurrency & thread safety | `src/MxGateway.Worker/MxAccess/MxAccessStaSession.cs:205-249`, `:429-447` | | Worker-016 | Medium | Resolved | Concurrency & thread safety | `src/MxGateway.Worker/MxAccess/MxAccessStaSession.cs:261-265` | | Worker-017 | Medium | Resolved | Error handling & resilience | `src/MxGateway.Worker/Sta/StaRuntime.cs:280-288`, `src/MxGateway.Worker/Ipc/WorkerPipeSession.cs:602-631` | +| Worker-023 | Medium | Resolved | Error handling & resilience | `src/MxGateway.Worker/Ipc/WorkerPipeSession.cs:610-668`, `src/MxGateway.Worker/MxAccess/MxAccessCommandExecutor.cs:124-153` | | Worker.Tests-003 | Medium | Resolved | Concurrency & thread safety | `src/MxGateway.Worker.Tests/Sta/StaRuntimeTests.cs:46-48` | | Worker.Tests-004 | Medium | Resolved | Concurrency & thread safety | `src/MxGateway.Worker.Tests/MxAccess/MxAccessStaSessionTests.cs:281-329` | | Worker.Tests-005 | Medium | Resolved | Performance & resource management | `src/MxGateway.Worker.Tests/Ipc/WorkerFrameProtocolTests.cs:20-31,103-105`, `src/MxGateway.Worker.Tests/Ipc/WorkerPipeSessionTests.cs:28-31` | @@ -120,6 +127,8 @@ Findings with status `Resolved`, `Won't Fix`, or `Deferred`. 
| Client.Dotnet-012 | Low | Resolved | Code organization & conventions | `clients/dotnet/MxGateway.Client/MxGateway.Client.csproj`, `clients/dotnet/MxGateway.Client.Cli/MxGateway.Client.Cli.csproj`, `clients/dotnet/MxGateway.Client.Tests/MxGateway.Client.Tests.csproj` | | Client.Dotnet-013 | Low | Resolved | Code organization & conventions | `clients/dotnet/MxGateway.Client/DiscoverHierarchyOptions.cs:3-24`, `clients/dotnet/MxGateway.Client/GalaxyRepositoryClient.cs:185-187`, `clients/dotnet/MxGateway.Client.Cli/IMxGatewayCliClient.cs:6` | | Client.Dotnet-014 | Low | Resolved | Testing coverage | `clients/dotnet/MxGateway.Client.Tests/MxGatewayClientAlarmsTests.cs:76-98`, `clients/dotnet/MxGateway.Client.Tests/FakeGatewayTransport.cs:212-231` | +| Client.Dotnet-015 | Low | Resolved | Correctness & logic bugs | `clients/dotnet/MxGateway.Client.Cli/MxGatewayClientCli.cs:221-236`, `clients/dotnet/MxGateway.Client.Cli/MxGatewayClientCli.cs:596-1065` | +| Client.Dotnet-016 | Low | Resolved | Concurrency & thread safety | `clients/dotnet/MxGateway.Client.Cli/MxGatewayClientCli.cs:922-976` | | Client.Go-004 | Low | Resolved | mxaccessgw conventions | `clients/go/mxgateway/alarms_test.go:153-154`, `clients/go/mxgateway/galaxy_test.go:58-59` | | Client.Go-005 | Low | Resolved | Design-document adherence | `clients/go/mxgateway/client.go:64,68`, `clients/go/mxgateway/galaxy.go:83,87` | | Client.Go-006 | Low | Resolved | Error handling & resilience | `clients/go/mxgateway/errors.go:9-130` | @@ -133,6 +142,11 @@ Findings with status `Resolved`, `Won't Fix`, or `Deferred`. 
| Client.Go-014 | Low | Resolved | Error handling & resilience | `clients/go/mxgateway/session.go:602`, `clients/go/mxgateway/galaxy.go:189` | | Client.Go-015 | Low | Resolved | Code organization & conventions | `clients/go/cmd/mxgw-go/main.go:410-512` | | Client.Go-016 | Low | Resolved | Testing coverage | `clients/go/mxgateway/galaxy_test.go:382-429` | +| Client.Go-017 | Low | Resolved | Error handling & resilience | `clients/go/cmd/mxgw-go/main.go:954-991` | +| Client.Go-018 | Low | Resolved | Concurrency & thread safety | `clients/go/cmd/mxgw-go/main.go:593-623` | +| Client.Go-019 | Low | Resolved | Documentation & comments | `clients/go/cmd/mxgw-go/main.go:710-716`, `clients/go/cmd/mxgw-go/main.go:1204,1213` | +| Client.Go-020 | Low | Resolved | Code organization & conventions | `clients/go/cmd/mxgw-go/main.go:753-802`, `clients/go/cmd/mxgw-go/main.go:1199-1275` | +| Client.Go-021 | Low | Resolved | Testing coverage | `clients/go/cmd/mxgw-go/main_test.go`, `clients/go/cmd/mxgw-go/main.go:363-520,522-655` | | Client.Java-006 | Low | Resolved | Performance & resource management | `clients/java/mxgateway-client/src/main/java/com/dohertylan/mxgateway/client/MxGatewayClient.java:323-328`, `clients/java/mxgateway-client/src/main/java/com/dohertylan/mxgateway/client/GalaxyRepositoryClient.java:279-284` | | Client.Java-007 | Low | Resolved | Testing coverage | `clients/java/mxgateway-client/src/test/java/com/dohertylan/mxgateway/client/` | | Client.Java-008 | Low | Resolved | Error handling & resilience | `clients/java/mxgateway-client/src/main/java/com/dohertylan/mxgateway/client/MxGatewayClient.java:298-304` | @@ -145,6 +159,11 @@ Findings with status `Resolved`, `Won't Fix`, or `Deferred`. 
| Client.Java-018 | Low | Resolved | Security | `clients/java/mxgateway-client/src/main/java/com/dohertylan/mxgateway/client/MxGatewaySecrets.java:54-66` | | Client.Java-019 | Low | Resolved | Performance & resource management | `clients/java/mxgateway-client/src/main/java/com/dohertylan/mxgateway/client/MxGatewayClient.java:362-391`, `GalaxyRepositoryClient.java:286-315` | | Client.Java-020 | Low | Resolved | Correctness & logic bugs | `clients/java/mxgateway-cli/src/main/java/com/dohertylan/mxgateway/cli/MxGatewayCli.java:244-254`, `galaxy_repository.proto:94` | +| Client.Java-022 | Low | Resolved | Documentation & comments | `clients/java/mxgateway-client/src/main/java/com/dohertylan/mxgateway/client/MxGatewayChannels.java:161-172` | +| Client.Java-023 | Low | Resolved | Correctness & logic bugs | `clients/java/mxgateway-cli/src/main/java/com/dohertylan/mxgateway/cli/MxGatewayCli.java:1054`, `src/MxGateway.Contracts/Protos/mxaccess_gateway.proto:634` | +| Client.Java-024 | Low | Resolved | Correctness & logic bugs | `clients/java/mxgateway-cli/src/main/java/com/dohertylan/mxgateway/cli/MxGatewayCli.java:855-883` | +| Client.Java-025 | Low | Resolved | Code organization & conventions | `clients/java/mxgateway-cli/src/main/java/com/dohertylan/mxgateway/cli/MxGatewayCli.java:1176-1185` | +| Client.Java-026 | Low | Resolved | Testing coverage | `clients/java/mxgateway-cli/src/test/java/com/dohertylan/mxgateway/cli/MxGatewayCliTests.java` | | Client.Python-001 | Low | Resolved | Documentation & comments | `clients/python/pyproject.toml:8,25`, `clients/python/src/mxgateway_cli/commands.py:25` | | Client.Python-002 | Low | Resolved | Code organization & conventions | `clients/python/src/mxgateway/__init__.py:27` | | Client.Python-004 | Low | Resolved | Correctness & logic bugs | `clients/python/src/mxgateway_cli/commands.py:386,402-404` | @@ -158,6 +177,9 @@ Findings with status `Resolved`, `Won't Fix`, or `Deferred`. 
| Client.Python-015 | Low | Resolved | Testing coverage | `clients/python/src/mxgateway_cli/commands.py:273-294,564-647`, `clients/python/tests/` | | Client.Python-016 | Low | Resolved | Testing coverage | `clients/python/src/mxgateway_cli/commands.py:25,757-775,805-830` | | Client.Python-017 | Low | Resolved | Documentation & comments | `clients/python/pyproject.toml:5-25`, `clients/python/src/mxgateway/` | +| Client.Python-019 | Low | Resolved | Code organization & conventions | `clients/python/pyproject.toml:60-61`, `clients/python/src/mxgateway_cli/` | +| Client.Python-020 | Low | Resolved | Testing coverage | `clients/python/tests/`, `scripts/` | +| Client.Python-021 | Low | Resolved | Documentation & comments | `clients/python/src/mxgateway_cli/commands.py`, `clients/python/README.md:235-258` | | Client.Rust-004 | Low | Resolved | Documentation & comments | `clients/rust/src/version.rs:7` | | Client.Rust-007 | Low | Resolved | Design-document adherence | `clients/rust/RustClientDesign.md:14-55` | | Client.Rust-008 | Low | Resolved | Performance & resource management | `clients/rust/src/value.rs:161-261` | @@ -166,6 +188,8 @@ Findings with status `Resolved`, `Won't Fix`, or `Deferred`. 
| Client.Rust-011 | Low | Resolved | mxaccessgw conventions | `clients/rust/src/session.rs:469` | | Client.Rust-014 | Low | Resolved | mxaccessgw conventions | `clients/rust/crates/mxgw-cli/src/main.rs:450,497` | | Client.Rust-017 | Low | Resolved | Design-document adherence | `clients/rust/RustClientDesign.md:79-99,156-163` | +| Client.Rust-019 | Low | Resolved | Design-document adherence | `clients/rust/RustClientDesign.md:96-100` | +| Client.Rust-020 | Low | Resolved | Documentation & comments | `clients/rust/src/session.rs:31-46`; `clients/rust/src/lib.rs:14-39` | | Contracts-001 | Low | Resolved | Design-document adherence | `docs/Grpc.md:13` (and `:3`, `:32`, `:39`) | | Contracts-003 | Low | Won't Fix | Code organization & conventions | `src/MxGateway.Contracts/MxGateway.Contracts.csproj:10` | | Contracts-004 | Low | Resolved | Documentation & comments | `src/MxGateway.Contracts/GatewayContractInfo.cs:3-6` | @@ -177,6 +201,8 @@ Findings with status `Resolved`, `Won't Fix`, or `Deferred`. 
| Contracts-011 | Low | Resolved | Security | `src/MxGateway.Contracts/Protos/mxaccess_gateway.proto:392-397`, `:406-412` | | Contracts-012 | Low | Resolved | Documentation & comments | `src/MxGateway.Contracts/Protos/galaxy_repository.proto:120` | | Contracts-013 | Low | Resolved | Documentation & comments | `src/MxGateway.Tests/Contracts/GatewayContractInfoTests.cs:14` | +| Contracts-014 | Low | Resolved | Documentation & comments | `src/MxGateway.Contracts/Protos/mxaccess_gateway.proto:549-553` | +| Contracts-015 | Low | Resolved | Documentation & comments | `src/MxGateway.Contracts/Protos/mxaccess_gateway.proto:571-582` | | IntegrationTests-007 | Low | Resolved | Concurrency & thread safety | `src/MxGateway.IntegrationTests/WorkerLiveMxAccessSmokeTests.cs:20`, `src/MxGateway.IntegrationTests/Galaxy/GalaxyRepositoryLiveTests.cs:5`, `src/MxGateway.IntegrationTests/DashboardLdapLiveTests.cs:9` | | IntegrationTests-008 | Low | Resolved | Code organization & conventions | `src/MxGateway.IntegrationTests/LiveLdapFactAttribute.cs`, `src/MxGateway.IntegrationTests/Galaxy/LiveGalaxyRepositoryFactAttribute.cs`, `src/MxGateway.IntegrationTests/LiveMxAccessFactAttribute.cs` | | IntegrationTests-009 | Low | Resolved | Documentation & comments | `src/MxGateway.IntegrationTests/WorkerLiveMxAccessSmokeTests.cs:372-375` | @@ -185,6 +211,9 @@ Findings with status `Resolved`, `Won't Fix`, or `Deferred`. 
| IntegrationTests-013 | Low | Resolved | Performance & resource management | `src/MxGateway.IntegrationTests/WorkerLiveMxAccessSmokeTests.cs:519-609` | | IntegrationTests-015 | Low | Resolved | Code organization & conventions | `src/MxGateway.IntegrationTests/WorkerLiveMxAccessSmokeTests.cs:30,119,201`, `src/MxGateway.IntegrationTests/DashboardLdapLiveTests.cs:13,32,48,67,84`, `src/MxGateway.IntegrationTests/Galaxy/GalaxyRepositoryLiveTests.cs:10,22,34,52` | | IntegrationTests-016 | Low | Resolved | Code organization & conventions | `src/MxGateway.IntegrationTests/Galaxy/LiveGalaxyRepositoryFactAttribute.cs:26`, `src/MxGateway.Server/Galaxy/GalaxyRepositoryOptions.cs:13` | +| IntegrationTests-018 | Low | Resolved | Code organization & conventions | `src/MxGateway.IntegrationTests/WorkerLiveMxAccessSmokeTests.cs:1037`, `src/MxGateway.IntegrationTests/WorkerLiveMxAccessSmokeTests.cs:595` | +| IntegrationTests-020 | Low | Resolved | Correctness & logic bugs | `src/MxGateway.IntegrationTests/WorkerLiveMxAccessSmokeTests.cs:616-622` | +| IntegrationTests-021 | Low | Resolved | Testing coverage | `src/MxGateway.IntegrationTests/WorkerLiveMxAccessSmokeTests.cs:579-622` | | Server-007 | Low | Resolved | Performance & resource management | `src/MxGateway.Server/Galaxy/GalaxyHierarchyProjector.cs:55-70` | | Server-008 | Low | Resolved | Performance & resource management | `src/MxGateway.Server/Grpc/GalaxyRepositoryGrpcService.cs:111-134,160-189` | | Server-009 | Low | Resolved | Error handling & resilience | `src/MxGateway.Server/Security/Authentication/AuthSqliteConnectionFactory.cs:15-32` | @@ -197,6 +226,13 @@ Findings with status `Resolved`, `Won't Fix`, or `Deferred`. 
| Server-019 | Low | Resolved | Correctness & logic bugs | `src/MxGateway.Server/Sessions/WorkerAlarmRpcDispatcher.cs:183-221` | | Server-020 | Low | Resolved | Code organization & conventions | `src/MxGateway.Server/Dashboard/Components/Pages/DashboardHome.razor:1-2`, `…/GalaxyPage.razor:1-2`, `…/ApiKeysPage.razor:1-2`, `…/EventsPage.razor:1-2`, `…/SessionsPage.razor:1-2`, `…/WorkersPage.razor:1-2`, `…/SettingsPage.razor:1-2`, `…/SessionDetailsPage.razor:1-2` | | Server-022 | Low | Resolved | Documentation & comments | `src/MxGateway.Server/Sessions/IAlarmRpcDispatcher.cs:8-29` | +| Server-023 | Low | Resolved | Documentation & comments | `src/MxGateway.Server/Sessions/NotWiredAlarmRpcDispatcher.cs:10-26` | +| Server-024 | Low | Resolved | Correctness & logic bugs | `src/MxGateway.Server/Galaxy/GalaxyGlobMatcher.cs:56-77` | +| Server-025 | Low | Resolved | Code organization & conventions | `src/MxGateway.Server/Grpc/GalaxyRepositoryGrpcService.cs:19-25`, `src/MxGateway.Server/Galaxy/IGalaxyRepository.cs` | +| Server-026 | Low | Resolved | Error handling & resilience | `src/MxGateway.Server/Configuration/GatewayOptionsValidator.cs:17-32`, `src/MxGateway.Server/Configuration/AlarmsOptions.cs` | +| Server-027 | Low | Resolved | Design-document adherence | `docs/Authorization.md:120-141,176-181` | +| Server-028 | Low | Resolved | Testing coverage | `src/MxGateway.Tests/Security/Authorization/GatewayGrpcScopeResolverTests.cs:13-20`, `src/MxGateway.Tests/Gateway/Sessions/GatewaySessionTests.cs` | +| Server-029 | Low | Resolved | Documentation & comments | `src/MxGateway.Server/Grpc/MxAccessGatewayService.cs:52-58` | | Tests-007 | Low | Resolved | Code organization & conventions | `src/MxGateway.Tests/Gateway/Grpc/MxAccessGatewayServiceTests.cs:682`, `src/MxGateway.Tests/Gateway/Grpc/GalaxyRepositoryGrpcServiceTests.cs:324`, `src/MxGateway.Tests/Gateway/GatewayEndToEndFakeWorkerSmokeTests.cs:460`, 
`src/MxGateway.Tests/Security/Authorization/GatewayGrpcAuthorizationInterceptorTests.cs:233` | | Tests-008 | Low | Resolved | mxaccessgw conventions | `src/MxGateway.Tests/Gateway/Sessions/WorkerAlarmRpcDispatcherTests.cs:1-9`, `src/MxGateway.Tests/Gateway/Sessions/NotWiredAlarmRpcDispatcherTests.cs:1-3`, `src/MxGateway.Tests/Gateway/Sessions/SessionManagerAlarmAutoSubscribeTests.cs:1` | | Tests-009 | Low | Resolved | Documentation & comments | `src/MxGateway.Tests/Gateway/Sessions/SessionManagerTests.cs:36-37,99,365` | @@ -208,6 +244,10 @@ Findings with status `Resolved`, `Won't Fix`, or `Deferred`. | Tests-017 | Low | Resolved | Concurrency & thread safety | `src/MxGateway.Tests/Gateway/Workers/WorkerClientTests.cs:346-364` | | Tests-018 | Low | Resolved | Code organization & conventions | `src/MxGateway.Tests/Galaxy/GalaxyHierarchyCacheTests.cs:32`, `src/MxGateway.Tests/Gateway/Dashboard/DashboardSnapshotServiceTests.cs:45,51,57,105,134,163,167,202-209,284,317,523`, `src/MxGateway.Tests/Gateway/Sessions/SessionManagerTests.cs:40` | | Tests-019 | Low | Resolved | Documentation & comments | `docs/GatewayTesting.md`, `code-reviews/Tests/findings.md` (Tests-002 re-triage) | +| Tests-021 | Low | Resolved | Code organization & conventions | `src/MxGateway.Tests/Galaxy/GalaxyHierarchyCacheTests.cs:159-171`, `src/MxGateway.Tests/Gateway/Workers/FakeWorkerHarnessTests.cs:226-236`, `src/MxGateway.Tests/Gateway/Workers/WorkerClientTests.cs:620-630`, `src/MxGateway.Tests/Gateway/Sessions/SessionManagerTests.cs:766-…` | +| Tests-022 | Low | Resolved | Testing coverage | `src/MxGateway.Tests/Gateway/Sessions/SessionManagerBulkTests.cs:52-61,90-99,126-135,163-172,202-211,238-247,282-294,339-360,413-434,484-506,553-567,663-688` | +| Tests-023 | Low | Resolved | Correctness & logic bugs | `src/MxGateway.Tests/Gateway/Sessions/SessionWorkerClientFactoryFakeWorkerTests.cs:334-374` | +| Tests-024 | Low | Resolved | Testing coverage | 
`src/MxGateway.Server/Grpc/MxAccessGatewayService.cs:713-730,784-801,859-876`, `src/MxGateway.Tests/Gateway/Grpc/MxAccessGatewayServiceConstraintTests.cs` | | Worker-009 | Low | Resolved | Performance & resource management | `src/MxGateway.Worker/Ipc/WorkerFrameReader.cs:31,49`, `src/MxGateway.Worker/Ipc/WorkerFrameWriter.cs:57-58` | | Worker-010 | Low | Resolved | Correctness & logic bugs | `src/MxGateway.Worker/Conversion/VariantConverter.cs:204-226` | | Worker-011 | Low | Resolved | Correctness & logic bugs | `src/MxGateway.Worker/Ipc/WorkerPipeClient.cs:169-171` | @@ -220,6 +260,8 @@ Findings with status `Resolved`, `Won't Fix`, or `Deferred`. | Worker-020 | Low | Resolved | Correctness & logic bugs | `src/MxGateway.Worker/Ipc/WorkerPipeSession.cs:405`, `:423` | | Worker-021 | Low | Resolved | Correctness & logic bugs | `src/MxGateway.Worker/Ipc/WorkerPipeSession.cs:111-118`, `:790-805`, `:136-139` | | Worker-022 | Low | Resolved | Code organization & conventions | `src/MxGateway.Worker/MxAccess/MxAlarmSnapshot.cs:12`, `:26`, `:49` | +| Worker-024 | Low | Resolved | Concurrency & thread safety | `src/MxGateway.Worker/MxAccess/AlarmCommandHandler.cs:63-187`, `src/MxGateway.Worker/MxAccess/MxAccessStaSession.cs:191-323` | +| Worker-025 | Low | Resolved | Correctness & logic bugs | `src/MxGateway.Worker/Ipc/WorkerPipeSession.cs:111-117` | | Worker.Tests-008 | Low | Resolved | Documentation & comments | `src/MxGateway.Worker.Tests/Conversion/VariantConverterTests.cs:175-182` | | Worker.Tests-009 | Low | Resolved | Code organization & conventions | `src/MxGateway.Worker.Tests/MxAccess/AlarmCommandHandlerTests.cs`, `AlarmDispatcherTests.cs`, `AlarmCommandExecutorTests.cs`, `AlarmRecordTransitionMapperTests.cs`, `WnWrapAlarmConsumerXmlTests.cs` | | Worker.Tests-010 | Low | Resolved | Correctness & logic bugs | `src/MxGateway.Worker.Tests/MxAccess/MxAccessStaSessionTests.cs:230-258` | @@ -234,3 +276,9 @@ Findings with status `Resolved`, `Won't Fix`, or `Deferred`. 
| Worker.Tests-022 | Low | Resolved | Testing coverage | `src/MxGateway.Worker.Tests/MxAccess/WnWrapAlarmConsumerXmlTests.cs` | | Worker.Tests-023 | Low | Resolved | Documentation & comments | `src/MxGateway.Worker.Tests/AlarmClientWmProbeTests.cs` (779 lines), `src/MxGateway.Worker.Tests/WnWrapConsumerProbeTests.cs` (287 lines), `src/MxGateway.Worker.Tests/AlarmsLiveSmokeTests.cs` (270 lines) | | Worker.Tests-024 | Low | Resolved | Correctness & logic bugs | `src/MxGateway.Worker.Tests/MxAccess/AlarmCommandHandlerTests.cs:42-54` | +| Worker.Tests-025 | Low | Resolved | mxaccessgw conventions | `src/MxGateway.Worker.Tests/TestSupport/LiveMxAccessFactAttribute.cs:23`, `src/MxGateway.IntegrationTests/IntegrationTestEnvironment.cs:5`, `src/MxGateway.IntegrationTests/LiveMxAccessFactAttribute.cs:9-12` | +| Worker.Tests-026 | Low | Resolved | Code organization & conventions | `src/MxGateway.Worker/MxAccess/MxAccessSession.cs:74-88` | +| Worker.Tests-027 | Low | Resolved | Concurrency & thread safety | `src/MxGateway.Worker.Tests/TestSupport/FakeRuntimeSession.cs:174, 179-187` | +| Worker.Tests-028 | Low | Resolved | Design-document adherence | `docs/GatewayTesting.md`, `src/MxGateway.Worker.Tests/Probes/` | +| Worker.Tests-029 | Low | Resolved | Code organization & conventions | `src/MxGateway.Worker.Tests/Probes/AlarmsLiveSmokeTests.cs:9`, `src/MxGateway.Worker.Tests/Probes/AlarmClientWmProbeTests.cs:14`, `src/MxGateway.Worker.Tests/Probes/WnWrapConsumerProbeTests.cs:10` | +| Worker.Tests-030 | Low | Resolved | Documentation & comments | `src/MxGateway.Worker.Tests/Ipc/WorkerPipeSessionTests.cs:862-890` | diff --git a/code-reviews/Server/findings.md b/code-reviews/Server/findings.md index f663e7c..e70497e 100644 --- a/code-reviews/Server/findings.md +++ b/code-reviews/Server/findings.md @@ -5,12 +5,14 @@ | Module | `src/MxGateway.Server` | | Reviewer | Claude Code | | Review date | 2026-05-20 | -| Commit reviewed | `1cd51bb` | +| Commit reviewed | `a020350` | | Status 
| Reviewed | | Open findings | 0 | ## Checklist coverage +### 2026-05-20 review (commit 1cd51bb) + This row summarizes the 2026-05-20 review pass at commit `1cd51bb`. Findings from prior passes (Server-001 through Server-014) are all closed and remain below as audit history. @@ -28,6 +30,23 @@ audit history. | 9 | Testing coverage | Issues found: Server-021 (`MxAccessGatewayService.ApplyConstraintsAsync` and the new `BulkConstraintPlan` / `ReadBulkConstraintPlan` / `WriteBulkConstraintPlan` / `SubscribeBulkConstraintPlan` merge logic is entirely untested). | | 10 | Documentation & comments | Issues found: Server-022 (`IAlarmRpcDispatcher` XML doc still describes the dispatcher as "ships a not-yet-wired default"; stale after Server-014). | +### 2026-05-20 review (commit a020350) + +Re-review pass at `a020350` — the cross-module sweep that resolved Server-015 through Server-022. Verified each fix is sound (lock discipline now uniform on `_syncRoot`; `DisposeAsync` gates on `_closeLock`; alarm RPCs map to `InvokeWrite`/`EventsRead`; glob cache is bounded; alarm dispatcher SessionNotFound flows through `MxAccessGatewayService.MapException` → gRPC `NotFound`; dashboard pages emit a single `@page`; 11 new `MxAccessGatewayServiceConstraintTests` cover the bulk-constraint plans). New findings filed against this pass. + +| # | Category | Result | +|---|---|---| +| 1 | Correctness & logic bugs | Issues found: Server-024 (`GalaxyGlobMatcher.GetOrCreateRegex` indexer access after `TryAdd` fails can throw `KeyNotFoundException` under contention near the cap). | +| 2 | mxaccessgw conventions | No issues found. | +| 3 | Concurrency & thread safety | No new issues found — Server-015/016 fixes verified sound. | +| 4 | Error handling & resilience | Issues found: Server-026 (`AlarmsOptions` is bound but not validated by `GatewayOptionsValidator`). 
| +| 5 | Security | No issues found — Server-017 mapping (`InvokeWrite` / `EventsRead`) is defensible and exercised by both resolver and interceptor tests. | +| 6 | Performance & resource management | No issues found — Server-018 cap is in place and tested. | +| 7 | Design-document adherence | Issues found: Server-027 (`docs/Authorization.md` `ResolveCommandScope` code snippet and Constraint Enforcement section omit the bulk read/write command families). | +| 8 | Code organization & conventions | Issues found: Server-025 (`GalaxyRepositoryGrpcService` still consumes the concrete `GalaxyRepository` after `IGalaxyRepository` was introduced for testability — inconsistent with `GalaxyHierarchyCache`). | +| 9 | Testing coverage | Issues found: Server-028 (`GatewayGrpcScopeResolverTests` does not exercise `WatchDeployEventsRequest` or `MxCommandKind.ReadBulk`; no `GatewaySessionTests` case asserts a `MarkFaulted` during in-flight Close). | +| 10 | Documentation & comments | Issues found: Server-023 (`NotWiredAlarmRpcDispatcher` class XML doc still says "PR A.6/A.7 — default … shipped while the worker-side AlarmClient event subscription is gated on dev-rig validation"; contradicts the cleanup that Server-014/Server-022 applied to the interface, gateway service, and `WorkerAlarmRpcDispatcher`). Issues found: Server-029 (`OpenSession` capability list advertises `bulk-subscribe-commands` but not the now-shipping bulk-read or bulk-write families — clients that gate on capability strings have no signal that those families exist). | + ## Findings ### Server-001 @@ -359,3 +378,114 @@ audit history. **Recommendation:** Rewrite the `IAlarmRpcDispatcher` `<remarks>` block to match the language now used on `WorkerAlarmRpcDispatcher` and on the gRPC service: DI binds `WorkerAlarmRpcDispatcher` by default; `NotWiredAlarmRpcDispatcher` is only the null fallback for tests/DI omission. Drop the "PR A.6 / A.7" prefix from the `<summary>` — the interface is now the public alarm-RPC seam. 
**Resolution:** 2026-05-20 — Rewrote `IAlarmRpcDispatcher`'s `<summary>` and `<remarks>` (`src/MxGateway.Server/Sessions/IAlarmRpcDispatcher.cs`) to match the language now used on `WorkerAlarmRpcDispatcher` and on `MxAccessGatewayService.AcknowledgeAlarm` / `QueryActiveAlarms`: dropped the stale "PR A.6 / A.7" prefix from the summary, and replaced the "this PR ships a not-yet-wired default that returns a clear worker-pending diagnostic" clause with the correct statement that DI binds the production `WorkerAlarmRpcDispatcher` by default and `NotWiredAlarmRpcDispatcher` is only the null fallback for DI omission / standalone tests. Pure documentation change; no test. + +### Server-023 + +| Field | Value | +|---|---| +| Severity | Low | +| Category | Documentation & comments | +| Location | `src/MxGateway.Server/Sessions/NotWiredAlarmRpcDispatcher.cs:10-26` | +| Status | Resolved | + +**Description:** Server-014 and Server-022 swept the stale "PR A.6 / A.7" / "not-yet-wired" / "worker-pending" language off `MxAccessGatewayService.AcknowledgeAlarm` / `QueryActiveAlarms`, `WorkerAlarmRpcDispatcher`, and `IAlarmRpcDispatcher`. The concrete `NotWiredAlarmRpcDispatcher` class XML doc was not updated as part of either fix and still reads: *"PR A.6 / A.7 — default `IAlarmRpcDispatcher` shipped while the worker-side AlarmClient event subscription is gated on dev-rig validation"* and *"When the worker dispatcher (PR A.6/A.7 dev-rig follow-up) lands, `WorkerAlarmRpcDispatcher` replaces this implementation in the DI container"*. That is the exact prose the other sweeps removed, and it directly contradicts the now-current narrative everywhere else: `SessionServiceCollectionExtensions.AddGatewaySessions` registers `WorkerAlarmRpcDispatcher` as the default `IAlarmRpcDispatcher`; `NotWiredAlarmRpcDispatcher` is only the null fallback used when no dispatcher is registered (DI omission / standalone tests). 
The diagnostic string returned by `AcknowledgeAsync` (line 39) — `"the worker-side AlarmClient consumer (PR A.5) is in place but the dispatcher hookup is gated on validating the AVEVA alarm-provider event subscription on the dev rig"` — is also stale; the dispatcher hookup landed and any client that actually sees that diagnostic today is hitting the null-fallback path, not the dev-rig gate it describes. + +**Recommendation:** Replace the `<summary>` and `<remarks>` on `NotWiredAlarmRpcDispatcher` with text that matches the language now used on the interface and `WorkerAlarmRpcDispatcher` — "null fallback `IAlarmRpcDispatcher` used when no dispatcher is registered (DI omission / standalone tests); production wires `WorkerAlarmRpcDispatcher`." Either drop the `AcknowledgeAsync` diagnostic string's dev-rig framing entirely or shorten it to "alarm dispatcher is not registered." `#pragma warning disable CS1998` on `QueryActiveAlarmsAsync` is correct here (empty stream is intentional for the null fallback) and should stay. + +**Resolution:** 2026-05-20 — Rewrote `NotWiredAlarmRpcDispatcher` summary/remarks as the null-fallback dispatcher and shortened the `AcknowledgeAsync` diagnostic to "Alarm dispatcher is not registered."; updated the two tests that asserted the old "worker"-prefixed diagnostic. + +### Server-024 + +| Field | Value | +|---|---| +| Severity | Low | +| Category | Correctness & logic bugs | +| Location | `src/MxGateway.Server/Galaxy/GalaxyGlobMatcher.cs:56-77` | +| Status | Resolved | + +**Description:** `GetOrCreateRegex`'s race-loser branch reads `RegexCache[glob]` with an indexer (line 76) after `TryAdd` returned `false`. The indexer throws `KeyNotFoundException` if the key is missing. 
Under the new bounded cache (Server-018), there is a real — if narrow — race where the key vanishes between the failing `TryAdd` and the indexer read: thread A and thread B both compile a `Regex` for `glob`; A's `TryAdd` succeeds, A enqueues + enters `EvictIfOverCapacity`, the eviction loop dequeues `glob` (because some other thread had already enqueued + evicted enough that `glob` is now the oldest entry) and removes it; thread B's `TryAdd` then returns false, B reads `RegexCache[glob]`, and the indexer throws. The window is tiny but nonzero — eviction is approximate FIFO, and a hot pattern that is repeatedly re-added near the cap is the natural trigger. The same pre-Server-018 code used `GetOrAdd`, which had no such race because the dictionary handled the rebuild atomically. + +**Recommendation:** Replace the `TryAdd` + indexer pair with `RegexCache.GetOrAdd(glob, _ => compiled)` so the dictionary atomically returns whichever instance won. Track the new insertion only when `GetOrAdd` returns the locally-compiled instance (`ReferenceEquals(result, compiled)`), then enqueue + evict. Alternatively, swap the trailing indexer read for `TryGetValue` + recursive recompile on miss. Add a stress test that mixes repeated reads of a single hot pattern with a flood of unique patterns near the cap and asserts no exception escapes `IsMatch`. + +**Resolution:** 2026-05-20 — Replaced the `TryAdd` + indexer pair with `RegexCache.GetOrAdd(glob, compiled)`; FIFO enqueue + eviction now run only when `ReferenceEquals(result, compiled)` (i.e. our caller was the inserter), eliminating the post-eviction `KeyNotFoundException` window. 
+ +### Server-025 + +| Field | Value | +|---|---| +| Severity | Low | +| Category | Code organization & conventions | +| Location | `src/MxGateway.Server/Grpc/GalaxyRepositoryGrpcService.cs:19-25`, `src/MxGateway.Server/Galaxy/IGalaxyRepository.cs` | +| Status | Resolved | + +**Description:** The Tests-016 fix introduced `IGalaxyRepository` so `GalaxyHierarchyCache` could be unit-tested against an in-memory fake, and `GalaxyHierarchyCache` was updated to depend on the interface. `GalaxyRepositoryGrpcService` was not updated and still receives the concrete `GalaxyDb.GalaxyRepository` via its primary constructor. Functionally this is fine — DI registers the concrete singleton and a thin `sp.GetRequiredService<GalaxyRepository>()` forwarder for the interface — but the seam is now half-applied: a future caller that wants to test or stub the gRPC service's `TestConnection` path has to construct a real `GalaxyRepository` against a SQL connection string, defeating the abstraction `IGalaxyRepository` was introduced for. The pattern also creates an inconsistency for new readers — two consumers in the same namespace, one on the interface and one on the concrete. + +**Recommendation:** Change `GalaxyRepositoryGrpcService`'s `repository` parameter to `IGalaxyRepository`. No DI change is needed (both forwarders already resolve to the same singleton). Optionally drop the concrete singleton registration and register the interface directly. + +**Resolution:** 2026-05-20 — Changed `GalaxyRepositoryGrpcService`'s `repository` primary-constructor parameter from the concrete `GalaxyRepository` to `IGalaxyRepository`; existing DI registration in `GalaxyRepositoryServiceCollectionExtensions` already resolves both the concrete and interface to the same singleton. 
+ +### Server-026 + +| Field | Value | +|---|---| +| Severity | Low | +| Category | Error handling & resilience | +| Location | `src/MxGateway.Server/Configuration/GatewayOptionsValidator.cs:17-32`, `src/MxGateway.Server/Configuration/AlarmsOptions.cs` | +| Status | Resolved | + +**Description:** `GatewayOptions.Alarms` is bound from `MxGateway:Alarms` and consumed by `SessionManager.TryAutoSubscribeAlarmsAsync` (per-session SubscribeAlarms on Ready). `GatewayOptionsValidator.Validate` validates every other section (`Authentication`, `Ldap`, `Worker`, `Sessions`, `Events`, `Dashboard`, `Protocol`) but has no `ValidateAlarms` arm — `AlarmsOptions` is silently accepted regardless of contents. The runtime mitigates this by logging a warning when `Enabled = true` but neither `SubscriptionExpression` nor `DefaultArea` is set, then either faulting open-session (`RequireSubscribeOnOpen = true`) or skipping auto-subscribe — a configuration error therefore surfaces per-session at runtime instead of at startup. Other sections fail-fast at `ValidateOnStart()`, so the inconsistency makes alarm misconfiguration discoverable only after a client hits the gateway. A misformatted `SubscriptionExpression` (no `\\\Galaxy!` shape) likewise passes validation; the worker rejects it later. + +**Recommendation:** Add a `ValidateAlarms(options.Alarms, failures)` arm in `GatewayOptionsValidator`. When `Enabled = true`, require either a non-blank `SubscriptionExpression` or a non-blank `DefaultArea`; when `SubscriptionExpression` is provided, sanity-check that it starts with `\\` (the AVEVA UNC subscription shape) — or document that the shape is left to the worker to validate. Either way, treat the configuration as part of the validated surface. 
+ +**Resolution:** 2026-05-20 — Added `ValidateAlarms` to `GatewayOptionsValidator`: when `Enabled = true`, requires a non-blank `SubscriptionExpression` or `DefaultArea`, and when `SubscriptionExpression` is provided, requires it to start with `\\` (canonical UNC subscription shape). Alarm misconfiguration now fails fast at startup instead of per-session. + +### Server-027 + +| Field | Value | +|---|---| +| Severity | Low | +| Category | Design-document adherence | +| Location | `docs/Authorization.md:120-141,176-181` | +| Status | Resolved | + +**Description:** Two parts of `docs/Authorization.md` drifted from `GatewayGrpcScopeResolver.ResolveCommandScope` and from `MxAccessGatewayService.ApplyConstraintsAsync` over the bulk-read/bulk-write series (`f220908`/`5e375f6`/`758aca2`) and were not updated by the Server-017 / Server-021 fixes: + +1. The `ResolveCommandScope` code snippet at lines 120-141 still shows only `Write`/`Write2` against `InvokeWrite` and `WriteSecured`/`WriteSecured2`/`AuthenticateUser` against `InvokeSecure`. The actual resolver also maps `MxCommandKind.WriteBulk`, `MxCommandKind.Write2Bulk`, `MxCommandKind.WriteSecuredBulk`, and `MxCommandKind.WriteSecured2Bulk`. A reader believing the snippet would conclude the bulk-write families inherit the fail-closed admin scope, when in fact they correctly map to `InvokeWrite` / `InvokeSecure` (the Scope Catalog table at lines 199-200 lists them). +2. The Constraint Enforcement section (lines 176-181) says: *"The service checks read constraints for `AddItem`, `AddItem2`, `AddItemBulk`, `SubscribeBulk`, and `AdviseItemBulk`. It checks write constraints for `Write`, `Write2`, `WriteSecured`, and `WriteSecured2`."* The actual `ApplyConstraintsAsync` switch also enforces constraints for `ReadBulk` (read scope), `WriteBulk` / `Write2Bulk` / `WriteSecuredBulk` / `WriteSecured2Bulk` (write scope, per-entry filtering with index-order merge). 
Server-021 added test coverage for all of these without touching the doc. + +**Recommendation:** Update the `ResolveCommandScope` snippet to include the four bulk-write arms. Update the Constraint Enforcement prose to enumerate the bulk read/write commands that are actually filtered, and reference the per-entry index-ordered merge that `BulkConstraintPlan.MergeDeniedInto` performs. Adding `ReadBulk` to the `InvokeRead` row of the Scope Catalog would also be useful — the table currently lists `Register`/`AddItem`/`Advise` against `InvokeRead` but not `ReadBulk`. + +**Resolution:** 2026-05-20 — Updated the `ResolveCommandScope` snippet in `docs/Authorization.md` to enumerate the four bulk-write arms (`WriteBulk`/`Write2Bulk` against `InvokeWrite`, `WriteSecuredBulk`/`WriteSecured2Bulk` against `InvokeSecure`); expanded the Constraint Enforcement prose to list `ReadBulk` and all four bulk-write commands and to call out `BulkConstraintPlan.MergeDeniedInto`'s index-ordered merge; added `ReadBulk` to the `InvokeRead` row of the Scope Catalog. + +### Server-028 + +| Field | Value | +|---|---| +| Severity | Low | +| Category | Testing coverage | +| Location | `src/MxGateway.Tests/Security/Authorization/GatewayGrpcScopeResolverTests.cs:13-20`, `src/MxGateway.Tests/Gateway/Sessions/GatewaySessionTests.cs` | +| Status | Resolved | + +**Description:** Two narrow test gaps were not closed by Server-017 / Server-015: + +1. `GatewayGrpcScopeResolverTests.ResolveRequiredScope_KnownRpcRequest_ReturnsExpectedScope` enumerates `OpenSessionRequest`, `CloseSessionRequest`, `StreamEventsRequest`, `AcknowledgeAlarmRequest`, `QueryActiveAlarmsRequest`, `TestConnectionRequest`, `GetLastDeployTimeRequest`, and `DiscoverHierarchyRequest`. `WatchDeployEventsRequest` is missing even though it is named in the resolver's metadata-read arm and listed in the Scope Catalog. 
Similarly, the `ResolveRequiredScope_InvokeCommand_ReturnsExpectedScope` matrix covers every other write/secure/bulk command but omits `MxCommandKind.ReadBulk`, which is the only bulk family that falls into the `_ => GatewayScopes.InvokeRead` default arm. A regression that drops `WatchDeployEvents` from the request switch or that adds a new mapping for `ReadBulk` would not be caught. +2. `GatewaySessionTests` (added under Server-015 / Server-016) covers the `TransitionTo(Ready)` and `MarkFaulted(post-Close)` cases but does not cover the third edge that Server-015's tightened state machine permits: `MarkFaulted` issued while `CloseAsync` is parked between `TryBeginClose` (Closing) and `MarkClosed` (Closed). The current `MarkFaulted` (`GatewaySession.cs:314-326`) checks only for `Closed`, so it overwrites `Closing` → `Faulted`; the subsequent `MarkClosed` then overwrites `Faulted` → `Closed` while `_finalFault` is preserved. The behaviour is consistent with the docs ("Closing only allows a transition to Closed or Faulted") but the test bundle does not pin it, and a future tightening of `MarkFaulted` could silently regress. + +**Recommendation:** Extend `GatewayGrpcScopeResolverTests.ResolveRequiredScope_KnownRpcRequest_ReturnsExpectedScope` with `[InlineData(typeof(WatchDeployEventsRequest), GatewayScopes.MetadataRead)]` and extend the command theory with `[InlineData(MxCommandKind.ReadBulk, GatewayScopes.InvokeRead)]`. Add a `GatewaySessionTests.MarkFaulted_DuringInFlightClose_PreservesFaultButYieldsToClose` case using `BlockingShutdownWorkerClient` to park `CloseAsync`, call `MarkFaulted` while parked, release the worker, and assert `State == Closed && FinalFault == ""`. 
+ +**Resolution:** 2026-05-20 — Added `[InlineData(typeof(WatchDeployEventsRequest), GatewayScopes.MetadataRead)]` to `GatewayGrpcScopeResolverTests.ResolveRequiredScope_KnownRpcRequest_ReturnsExpectedScope` (the `ReadBulk` arm was already present); added `GatewaySessionTests.MarkFaulted_DuringInFlightClose_PreservesFaultButYieldsToClose` covering the parked-close + `MarkFaulted` interleave and asserting the post-release state is `Closed` with `FinalFault = "concurrent-fault"`. + +### Server-029 + +| Field | Value | +|---|---| +| Severity | Low | +| Category | Documentation & comments | +| Location | `src/MxGateway.Server/Grpc/MxAccessGatewayService.cs:52-58` | +| Status | Resolved | + +**Description:** `OpenSession` advertises capabilities the gateway supports so clients can branch on them. The current list is `unary-open-session`, `unary-close-session`, `unary-invoke`, `server-stream-events`, `bulk-subscribe-commands`, `unary-acknowledge-alarm`, `server-stream-active-alarms`. The `bulk-subscribe-commands` token was added for the `AddItemBulk` / `AdviseItemBulk` / `RemoveItemBulk` / `UnAdviseItemBulk` / `SubscribeBulk` / `UnsubscribeBulk` family. The subsequent `ReadBulk` and `WriteBulk` / `Write2Bulk` / `WriteSecuredBulk` / `WriteSecured2Bulk` families landed without a corresponding capability token — the contract advertises bulk-subscribe support but is silent on bulk-read and bulk-write. A defensive client that gates on `bulk-write-commands` before issuing a `WriteBulk` has no signal that the family is supported; current clients sidestep this by ignoring the list entirely, but that just shifts the failure mode (an old client against a new server, or vice versa, will see `Unimplemented` instead of a structured `Capabilities` mismatch). 
+ +**Recommendation:** Either (a) extend the advertised list with `bulk-read-commands` and `bulk-write-commands` (`WriteBulk` / `Write2Bulk` / `WriteSecuredBulk` / `WriteSecured2Bulk` collectively), or (b) document in `gateway.md` and `docs/Contracts.md` that `Capabilities` is informational only and not the contract version. Option (a) is the simplest forward-compatible fix and keeps the capability token shape clients are already familiar with. + +**Resolution:** 2026-05-20 — Extended the `OpenSession` capabilities list with `bulk-read-commands` and `bulk-write-commands` alongside the existing `bulk-subscribe-commands` token, so clients that gate on capability strings have an explicit signal for the bulk-read and bulk-write families. diff --git a/code-reviews/Tests/findings.md b/code-reviews/Tests/findings.md index 1a0d0b0..1e0ec88 100644 --- a/code-reviews/Tests/findings.md +++ b/code-reviews/Tests/findings.md @@ -5,24 +5,26 @@ | Module | `src/MxGateway.Tests` | | Reviewer | Claude Code | | Review date | 2026-05-20 | -| Commit reviewed | `1cd51bb` | +| Commit reviewed | `a020350` | | Status | Reviewed | | Open findings | 0 | ## Checklist coverage +This pass (commit `a020350`) re-reviews the module after the Tests-013–019 batch was resolved alongside Server-017, Server-021, and Contracts-010. + | # | Category | Result | |---|---|---| -| 1 | Correctness & logic bugs | Issue found: Tests-015 (`FakeWorkerProcess.WaitForExitAsync` mutates `HasExited`, weakening the smoke test assertion). | -| 2 | mxaccessgw conventions | No new issues. Style/convention drift previously filed has been resolved. | -| 3 | Concurrency & thread safety | Issue found: Tests-017 (`HeartbeatMonitor_WhenHeartbeatExpires_FaultsClient` still on real wall-clock). | -| 4 | Error handling & resilience | Strong — timeouts, faults, overflow, kill paths, protocol violations all exercised. No new issues found. | -| 5 | Security | No new issues. 
`Galaxy` adversarial-input safety (Tests-002), dashboard anonymous-localhost negatives (Tests-010), and interceptor composition (Tests-004) all resolved in the prior pass. | -| 6 | Performance & resource management | Issue found: Tests-014 (`WebApplication` instances built by `GatewayApplicationTests` and `DashboardCookieOptionsTests` are never disposed). | -| 7 | Design-document adherence | Tests match `docs/GatewayTesting.md`; no drift found. No issues found. | -| 8 | Code organization & conventions | Issue found: Tests-018 (`DateTimeOffset.Parse` calls without `CultureInfo.InvariantCulture`). | -| 9 | Testing coverage | Issues found: Tests-013 (eight new `GatewaySession.*BulkAsync` methods untested), Tests-016 (a Galaxy cache unit test performs a real network connect attempt). | -| 10 | Documentation & comments | Issue found: Tests-019 (the `Re-triage note` paragraphs added to Tests-002/006/008 only live inside `findings.md` — `docs/GatewayTesting.md` is not updated to describe the in-memory Galaxy filter safety tests added under that finding). | +| 1 | Correctness & logic bugs | Issue found: Tests-023 (the companion `FakeWorkerProcess.WaitForExitAsync` in `SessionWorkerClientFactoryFakeWorkerTests.cs` still uses the Tests-015 cheating pattern — `HasExited = true; ExitCode = 0;` regardless of whether the worker actually exited — and is a latent regression vector if any future exit assertion is added to that file). Tests-015 was only applied to the smoke-test copy. | +| 2 | mxaccessgw conventions | No new issues. Style/convention drift previously filed (Tests-008) remains resolved at `a020350`. | +| 3 | Concurrency & thread safety | No new issues. The remaining wall-clock dependencies (`InvokeAsync_WhenSessionReady_RefreshesLease` uses `UtcNow` at both ends of a ~1 hour delta, dwarfing clock resolution; `CloseExpiredLeasesAsync_*` reads `UtcNow` once and uses it consistently for both sides) are intrinsic to the production paths and not flake sources. 
The Tests-017 fix is in place at `WorkerClientTests.cs:354`. | +| 4 | Error handling & resilience | No new issues. Tests-013 closed the bulk-method coverage gap end-to-end (per-entry failure surfaces, protocol-status failures, and cancellation propagation are all exercised). Pipe-disconnect / worker-fault / kill paths all covered. | +| 5 | Security | No new issues. Adversarial-input safety (Tests-002), anonymous-localhost negatives (Tests-010), interceptor-service composition (Tests-004), constraint partial-denial merging (Server-021 — `PredicateConstraintEnforcer` + `MxAccessGatewayServiceConstraintTests`), and unmapped-RPC fail-closed (Server-017) all covered. | +| 6 | Performance & resource management | No new issues. Tests-014 (`await using WebApplication`) is applied to all seven `GatewayApplication.Build(...)` sites. Tests-003 (`TempDatabaseDirectory`) cleanup is in place. | +| 7 | Design-document adherence | Tests match `docs/GatewayTesting.md`; the new "Galaxy Filter Safety" subsection added under Tests-019 names `GalaxyFilterInputSafetyTests`. No drift found. | +| 8 | Code organization & conventions | Issue found: Tests-021 (`ManualTimeProvider` is duplicated as a `private sealed class` in four test files — `WorkerClientTests`, `FakeWorkerHarnessTests`, `SessionManagerTests`, `GalaxyHierarchyCacheTests` — and should follow the Tests-007 `TestSupport/` consolidation pattern). 
| +| 9 | Testing coverage | Issues found: Tests-020 (`MxAccessGatewayServiceConstraintTests` covers only 2 of 4 `WriteBulkConstraintPlan` switch arms — `Write2Bulk`/`WriteSecured2Bulk` `GetPayload`/`SetPayload` would silently break with no failing test), Tests-022 (the eleven `SessionManagerBulkTests.*_PropagatesCancellation` tests pre-cancel the token, so the fake's first-line `ThrowIfCancellationRequested` handles it before `InvokeBulkInternalAsync` even runs — they do not exercise mid-flight cancellation), Tests-024 (`BulkConstraintPlan.MergeDeniedInto` silently drops or under-fills if the worker reply count diverges from the allowed-count — no test pins this protocol-mismatch edge case). | +| 10 | Documentation & comments | No new issues. Tests-019's `docs/GatewayTesting.md` addition is in place; new test files (`SessionManagerBulkTests`, `MxAccessGatewayServiceConstraintTests`, `PredicateConstraintEnforcer`) all have orienting class-level summaries. | ## Findings @@ -316,3 +318,80 @@ **Recommendation:** Add a short subsection to `docs/GatewayTesting.md` (probably under "Focused Commands" or a new "Galaxy Filter Safety" section) that names `GalaxyFilterInputSafetyTests`, explains that Galaxy filtering happens in memory against the cached hierarchy (so the SQL surface is constant), and lists the adversarial-input invariants the suite pins (`%`, `_`, `'`, `;`, `[abc]` are literals; the glob regex has a 100 ms timeout against pathological input). 
**Resolution:** 2026-05-20 — Added a "Galaxy Filter Safety" section to `docs/GatewayTesting.md` (immediately after "Live Galaxy Repository", before "Live LDAP") that names `GalaxyFilterInputSafetyTests`, re-frames the Tests-002 finding (the Galaxy SQL surface is constant — `HierarchySql`, `AttributesSql`, `SELECT 1`, `SELECT time_of_last_deploy FROM galaxy`), explains that all filters are applied in memory by `GalaxyHierarchyProjector` / `GalaxyGlobMatcher`, lists the adversarial-input matrix (`'`, `' OR '1'='1`, `'; DROP TABLE gobject;--`, `%`, `_`, `100%_off`, `[abc]`, `Pump'001`), and enumerates the invariants the suite pins (SQL metacharacters are opaque literals, only `*`/`?` are glob wildcards, the matcher has a 100 ms regex timeout against pathological input, the projector returns zero matches / `NotFound` rather than the whole hierarchy, and the `DiscoverHierarchy` RPC end-to-end returns zero matches for adversarial globs). + +### Tests-020 + +| Field | Value | +|---|---| +| Severity | Medium | +| Category | Testing coverage | +| Location | `src/MxGateway.Tests/Gateway/Grpc/MxAccessGatewayServiceConstraintTests.cs:275-347`, `src/MxGateway.Server/Grpc/MxAccessGatewayService.cs:803-829` | +| Status | Resolved | + +**Description:** Server-021 added `MxAccessGatewayServiceConstraintTests` to exercise `BulkConstraintPlan.MergeDeniedInto` / `CreateDeniedReply` against a non-allow-all enforcer. The `WriteBulkConstraintPlan` has a four-arm `GetPayload`/`SetPayload` switch covering `WriteBulk`, `Write2Bulk`, `WriteSecuredBulk`, and `WriteSecured2Bulk`, but the new fixtures only cover two of those four arms — `Invoke_WriteBulk_WithDeniedHandle_DropsEntryFromWorkerCallAndMergesDenialIntoReply` (the `WriteBulk` arm) and `Invoke_WriteSecuredBulk_WhenAllHandlesDenied_ShortCircuitsWithDeniedOnlyReply` (the `WriteSecuredBulk` arm). 
The other two arms (`Write2Bulk` and `WriteSecured2Bulk`) and the parallel `SubscribeBulkConstraintPlan` `RemoveItemBulk`/`UnAdviseItemBulk`/`UnsubscribeBulk` cases (the subscribe-bulk plan's `SetPayload` switch in service code lines 742-753 covers only three kinds — `AddItemBulk`, `AdviseItemBulk`, `SubscribeBulk` — and the constraint test covers all three of those, but the *unsubscribe-shaped* bulk routes are also dispatched into denial paths through `FilterHandleBulkAsync` and have no constraint-test coverage either). A regression that wires a new bulk kind to the wrong reply slot, or drops a `case` arm during refactor, would compile clean and pass every existing test. The comment in `Invoke_WriteSecuredBulk_WhenAllHandlesDenied_…` ("The merge logic is shared, so a full denial here is enough to prove the secured-bulk routing") concedes the gap explicitly — but the `_routing_` (the per-kind `SetPayload` switch) is exactly what is *not* shared and not exercised for `Write2Bulk` / `WriteSecured2Bulk`. + +**Recommendation:** Add two short fixtures: `Invoke_Write2Bulk_WhenAllHandlesDenied_ShortCircuitsWithDeniedOnlyReply` and `Invoke_WriteSecured2Bulk_WhenAllHandlesDenied_ShortCircuitsWithDeniedOnlyReply`, mirroring the existing `WriteSecuredBulk` denial test but asserting `reply.Write2Bulk` / `reply.WriteSecured2Bulk` is populated (proving the `SetPayload` arm fires). The all-denied path is enough; the merge-with-allowed path is genuinely shared. Optionally also add denied-tag tests for `RemoveItemBulk` / `UnsubscribeBulk` to cover the handle-input variants of the SubscribeBulkConstraintPlan switch. + +**Resolution:** 2026-05-20 — Added `Invoke_Write2Bulk_WhenAllHandlesDenied_ShortCircuitsWithDeniedOnlyReply` and `Invoke_WriteSecured2Bulk_WhenAllHandlesDenied_ShortCircuitsWithDeniedOnlyReply` to `MxAccessGatewayServiceConstraintTests`, plus matching `CreateWrite2BulkRequest`/`CreateWriteSecured2BulkRequest` helpers. 
Each new fixture asserts the worker is never called (`InvokeCount == 0`), `reply.Kind` matches the requested kind, the matching `reply.{Write2Bulk,WriteSecured2Bulk}.Results` slot is populated with denied entries, and the three sibling reply slots remain empty — pinning that the `SetPayload` switch fired for the correct arm and not for one of the other three `Write*Bulk` kinds. This closes the `Write2Bulk`/`WriteSecured2Bulk` arms of the four-arm `GetPayload`/`SetPayload` switch in `WriteBulkConstraintPlan` (`MxAccessGatewayService.cs:803-829`). + +### Tests-021 + +| Field | Value | +|---|---| +| Severity | Low | +| Category | Code organization & conventions | +| Location | `src/MxGateway.Tests/Galaxy/GalaxyHierarchyCacheTests.cs:159-171`, `src/MxGateway.Tests/Gateway/Workers/FakeWorkerHarnessTests.cs:226-236`, `src/MxGateway.Tests/Gateway/Workers/WorkerClientTests.cs:620-630`, `src/MxGateway.Tests/Gateway/Sessions/SessionManagerTests.cs:766-…` | +| Status | Resolved | + +**Description:** Tests-006 / Tests-017 / Tests-018 introduced an injectable `ManualTimeProvider` to make heartbeat-timestamp / lease / cache tests deterministic. The class is now duplicated as a `private sealed class ManualTimeProvider(DateTimeOffset start...) : TimeProvider` in four test files (`GalaxyHierarchyCacheTests.cs`, `FakeWorkerHarnessTests.cs`, `WorkerClientTests.cs`, `SessionManagerTests.cs`). Each copy has the same three-line implementation (`_now` field, `GetUtcNow()` override, `Advance(TimeSpan)` method). One copy (`GalaxyHierarchyCacheTests.cs:159`) accepts a `default` `DateTimeOffset` and seeds with `UtcNow`; the other three require an explicit start — a small but real semantic divergence. Tests-007 consolidated the same kind of duplication for `TestServerCallContext` / `RecordingServerStreamWriter` / `AllowAllConstraintEnforcer` into `src/MxGateway.Tests/TestSupport/`; this is the same drift pattern. 
+ +**Recommendation:** Add `src/MxGateway.Tests/TestSupport/ManualTimeProvider.cs` with a single implementation (default-arg `DateTimeOffset start = default` resolving to a deterministic seed like `DateTimeOffset.UnixEpoch` or `UtcNow`, plus the `Advance` helper) and delete the four nested copies in favour of `using MxGateway.Tests.TestSupport;`. Same pattern as the Tests-007 resolution. + +**Resolution:** 2026-05-20 — Added `src/MxGateway.Tests/TestSupport/ManualTimeProvider.cs` with the unified signature `ManualTimeProvider(DateTimeOffset start = default)` (a `default` start seeds from `DateTimeOffset.UtcNow` for the `GalaxyHierarchyCacheTests` call site that previously relied on that behaviour) plus the `Advance(TimeSpan)` helper. Deleted the four duplicated `private sealed class ManualTimeProvider` definitions from `GalaxyHierarchyCacheTests.cs`, `FakeWorkerHarnessTests.cs`, `WorkerClientTests.cs`, and `SessionManagerTests.cs`; each file now imports `MxGateway.Tests.TestSupport`. The `SessionManagerTests` copy previously lacked `Advance` — folding it onto the shared type does not regress because that file never called `Advance`. Same consolidation pattern as Tests-007. + +### Tests-022 + +| Field | Value | +|---|---| +| Severity | Low | +| Category | Testing coverage | +| Location | `src/MxGateway.Tests/Gateway/Sessions/SessionManagerBulkTests.cs:52-61,90-99,126-135,163-172,202-211,238-247,282-294,339-360,413-434,484-506,553-567,663-688` | +| Status | Resolved | + +**Description:** Tests-013 added eleven `*_PropagatesCancellation` tests that pre-cancel the token (`cts.CancelAsync()` before calling `session.*BulkAsync(..., cts.Token)`) and assert `OperationCanceledException`. The fakes' `FakeBulkWorkerClient.InvokeAsync` calls `cancellationToken.ThrowIfCancellationRequested()` as the *first* statement — so the exception is thrown synchronously inside the fake before any of `GatewaySession.InvokeBulkInternalAsync` → `InvokeAsync` → bulk-result projection runs. 
This verifies that the token reaches the worker client (a regression that swapped in `CancellationToken.None` between layers would fail the test), but it does not exercise mid-flight cancellation: a token that becomes cancelled while the worker is `await`-suspended waiting on a reply. Mid-flight cancellation is the more interesting path (it's what a real client closing its stream looks like) and is not pinned for any of the eleven bulk methods. + +The cancellation tests for `WorkerClient` in `WorkerClientTests` *do* exercise the mid-flight path (the `FakeWorkerClient` returns `Task.FromCanceled` style via real pipe disconnection); only the gateway-side bulk tests are shallow. + +**Recommendation:** For at least one representative bulk method (e.g. `WriteSecuredBulkAsync` — the highest-value gateway path), replace the pre-cancellation pattern with a fake whose `InvokeAsync` returns a `TaskCompletionSource`-backed task that never completes until cancelled, then `cts.CancelAsync()` *after* `session.WriteSecuredBulkAsync(...)` has been awaited far enough to register a continuation. Assert the resulting `OperationCanceledException`'s `CancellationToken` matches `cts.Token`. The existing pre-cancel pattern is a reasonable cheap-coverage default for the other ten methods. + +**Resolution:** 2026-05-20 — Added `WriteSecuredBulkAsync_WhenCancelledMidFlight_ThrowsOperationCanceledForRequestToken` to `SessionManagerBulkTests` backed by a new `MidFlightBulkWorkerClient` fake whose `InvokeAsync` registers a cancellation continuation on the caller's token, signals `InvokeStarted`, and parks on a `TaskCompletionSource` that completes only when the token fires (or shutdown / kill / dispose tears it down). 
The test awaits `InvokeStarted.Task`, asserts the write task is still incomplete (proving the cancellation lands on an in-flight await rather than the synchronous fast-path), then calls `cts.CancelAsync()` and asserts the resulting `OperationCanceledException.CancellationToken == cts.Token` and `InvokeCount == 1`. The other ten `*_PropagatesCancellation` tests remain on the cheaper pre-cancel pattern per the finding's recommendation. + +### Tests-023 + +| Field | Value | +|---|---| +| Severity | Low | +| Category | Correctness & logic bugs | +| Location | `src/MxGateway.Tests/Gateway/Sessions/SessionWorkerClientFactoryFakeWorkerTests.cs:334-374` | +| Status | Resolved | + +**Description:** Tests-015 corrected the smoke-test `FakeWorkerProcess.WaitForExitAsync` (in `GatewayEndToEndFakeWorkerSmokeTests.cs`) so it now awaits a `TaskCompletionSource` only completed by `Kill`/`MarkExited`, removing the "set `HasExited = true` and return immediately" cheat. The companion `FakeWorkerProcess` in `SessionWorkerClientFactoryFakeWorkerTests.cs:351-356` was *not* updated and still has the same cheat: `WaitForExitAsync` unconditionally sets `HasExited = true; ExitCode = 0; return ValueTask.CompletedTask;`. The original Tests-006 re-triage noted this companion was "fine there because no exit assertion is made"; the file at `a020350` does not yet assert `HasExited` or `ExitCode`, so this is not a current bug — but it is a latent regression vector: a future test in the same file that asserts `Assert.True(launcher.Process.HasExited)` after triggering shutdown would pass spuriously, exactly the failure mode Tests-015 just closed in the smoke-test copy. Two near-identical fakes in the same project with diverging semantics is brittle. 
+ +**Recommendation:** Apply the same `TaskCompletionSource _exited` pattern to `SessionWorkerClientFactoryFakeWorkerTests.FakeWorkerProcess`: `WaitForExitAsync` awaits `_exited.Task`, `Kill` calls `MarkExited(-1)`, and add a `MarkExited(int)` helper that completes the TCS. The scripted launchers in this file already call `Kill()` through the disposal path Tests-011 added, so the change is mechanical and preserves all current behaviour. + +**Resolution:** 2026-05-20 — Brought the companion `FakeWorkerProcess` in `SessionWorkerClientFactoryFakeWorkerTests.cs` into parity with the Tests-015 smoke-test fake. `WaitForExitAsync` now awaits a `TaskCompletionSource _exited` (wrapped in `WaitAsync(cancellationToken)` for cooperative cancel) instead of unconditionally setting `HasExited = true; ExitCode = 0`. `Kill(bool)` increments `KillCount` and delegates to a new `MarkExited(int exitCode)` helper that sets `HasExited`, `ExitCode`, and completes the TCS. `KillCount` is still observable and pre-existing tests that assert `KillCount > 0` continue to pass. The latent regression vector — that a future `Assert.True(launcher.Process.HasExited)` in this file would pass spuriously — is closed. + +### Tests-024 + +| Field | Value | +|---|---| +| Severity | Low | +| Category | Testing coverage | +| Location | `src/MxGateway.Server/Grpc/MxAccessGatewayService.cs:713-730,784-801,859-876`, `src/MxGateway.Tests/Gateway/Grpc/MxAccessGatewayServiceConstraintTests.cs` | +| Status | Resolved | + +**Description:** Every `BulkConstraintPlan.MergeDeniedInto` implementation builds its merged reply by walking `OriginalCount` indices and dequeueing from the worker's `allowedResults` queue at each non-denied slot. 
`TryDequeue` silently returns `false` when the queue is empty, so if the worker returns *fewer* allowed results than the gateway forwarded (because of a protocol mismatch, a worker bug truncating the bulk reply, or a future change to per-entry result reporting), the merged reply will be shorter than `OriginalCount` — the gap is not filled with a synthetic failure result. Conversely, if the worker returns *more* allowed results than requested, the extras are silently dropped. Neither case is covered by `MxAccessGatewayServiceConstraintTests`: every fixture's `sessionManager.InvokeReply` returns exactly the same count as the number of allowed entries forwarded. A regression in worker bulk-reply construction or a contract drift could produce a silently-truncated public reply (clients observing fewer results than entries submitted, with no error) and no gateway-side test would fail. + +**Recommendation:** Add two fixtures to `MxAccessGatewayServiceConstraintTests`: `Invoke_WriteBulk_WhenWorkerReturnsFewerResultsThanAllowed_ProducesPartialReplyOrSyntheticFailure` (worker reply has N-1 results for N allowed entries; assert either the merged reply has `OriginalCount` entries with a synthetic-failure tail, or — if the gateway's current policy is "truncate" — pin that behaviour explicitly and document the expectation in a comment), and `Invoke_WriteBulk_WhenWorkerReturnsExtraResults_IgnoresExtras` (worker returns N+2 for N allowed; assert merged reply has exactly `OriginalCount`). Whichever current behaviour is correct should be made explicit by the test — the goal is preventing a silent change. + +**Resolution:** 2026-05-20 — Pinned the current `BulkConstraintPlan.MergeDeniedInto` behaviour for worker reply-count divergence. 
Added two fixtures to `MxAccessGatewayServiceConstraintTests`: `Invoke_WriteBulk_WhenWorkerReturnsFewerResultsThanAllowed_MergedReplyIsTruncated` (gateway forwards 2 allowed handles, worker returns 1 result; merged reply has 2 entries total — the worker result at the first non-denied slot and the denied entry at its original index — and the trailing under-supplied slot is silently dropped via `Queue.TryDequeue` returning `false`) and `Invoke_WriteBulk_WhenWorkerReturnsExtraResults_IgnoresExtras` (gateway forwards 2 allowed handles, worker returns 4; merged reply has exactly `OriginalCount == 3` entries; the two extras are bounded out by the `for index < OriginalCount` loop). The fixtures explicitly pin "truncate / discard extras" as the current contract — a future change to synthesise failure tails or surface extras must update the test, preventing a silent behavioural change. diff --git a/code-reviews/Worker.Tests/findings.md b/code-reviews/Worker.Tests/findings.md index bc89746..5aae2ab 100644 --- a/code-reviews/Worker.Tests/findings.md +++ b/code-reviews/Worker.Tests/findings.md @@ -5,7 +5,7 @@ | Module | `src/MxGateway.Worker.Tests` | | Reviewer | Claude Code | | Review date | 2026-05-20 | -| Commit reviewed | `1cd51bb` | +| Commit reviewed | `a020350` | | Status | Reviewed | | Open findings | 0 | @@ -41,6 +41,21 @@ | 9 | Testing coverage | Issues found: Worker.Tests-017 (`WorkerCancel` envelope-dispatch path untested), Worker.Tests-022 (`WnWrapAlarmConsumer.PollOnce` transition-delta computation untested at the snapshot-to-transitions level). | | 10 | Documentation & comments | Issues found: Worker.Tests-023 (`AlarmClientWmProbeTests` and `WnWrapConsumerProbeTests` are unit-test classes carrying 1000+ lines of probe-only code; their `[Fact(Skip=...)]` status is documented but the probe scaffolding is mixed into the same test assembly as regression tests). 
| +### 2026-05-20 re-review (commit `a020350`) + +| # | Category | Result | +|---|---|---| +| 1 | Correctness & logic bugs | No new issues — Worker.Tests-018/024 fixes hold; the new `WriteAsync_WithEmptyEnvelope_ThrowsInvalidEnvelopeFromValidator` correctly documents that the writer-side defensive zero-length branch is intercepted by `WorkerEnvelopeValidator.Validate`. | +| 2 | mxaccessgw conventions | Issues found: Worker.Tests-025 (`LiveMxAccessFactAttribute` duplicated in Worker.Tests and IntegrationTests with no shared constant — divergent-by-drift risk). | +| 3 | Concurrency & thread safety | Issues found: Worker.Tests-027 (`FakeRuntimeSession.CancelCommandReturnValue` mutated without the same `gate` lock that protects `cancelledCorrelationIds`/`snapshot`/`events`). | +| 4 | Error handling & resilience | No new issues — Worker.Tests-021 closed all three uncovered protocol branches. | +| 5 | Security | No new issues. | +| 6 | Performance & resource management | No new issues. | +| 7 | Design-document adherence | Issues found: Worker.Tests-028 (Worker.Tests-023 resolution promised a `docs/GatewayTesting.md` paragraph describing the probe surface; the doc was never updated, so the partition is invisible outside the source tree). | +| 8 | Code organization & conventions | Issues found: Worker.Tests-026 (`MxAccessSession.CreateForTesting` has no runtime guard preventing accidental production use — only the `internal` modifier plus `InternalsVisibleTo` separates it from the live `Create` path); Worker.Tests-029 (Probes moved to `Probes/` folder but kept the unit-test `MxGateway.Worker.Tests` namespace, so a namespace-based filter cannot distinguish probes from regression tests). | +| 9 | Testing coverage | No new issues — the five `LiveMxAccessFact`-gated tests in `MxAccessLiveComCreationTests` and the `ComputeTransitions` unit tests close the previously identified gaps. 
| +| 10 | Documentation & comments | Issues found: Worker.Tests-030 (`CreateCancelEnvelope` uses `Sequence = 4` while the immediately-following `CreateShutdownEnvelope` uses `Sequence = 3`; the cancel test writes them in 4-then-3 order, which works because the worker has no inbound sequence-monotonicity check — but the numbering is misleading to a future reader and contradicts the gateway-side monotonic-sequence convention `gateway.md` documents for outbound). | + ## Findings ### Worker.Tests-001 @@ -402,3 +417,93 @@ **Recommendation:** Strengthen to `InvalidOperationException exception = Assert.Throws<InvalidOperationException>(...); Assert.Contains("simulated wnwrap subscribe failure", exception.Message)` — pin both the type and the originating message so a regression that throws a *different* `InvalidOperationException` from inside `AlarmCommandHandler` fails the test. **Resolution:** 2026-05-20 — `Subscribe_WhenUnderlyingSubscribeThrows_DisposesConsumer` now captures the thrown exception and asserts `Assert.Contains("simulated wnwrap subscribe failure", exception.Message)` against the fake's exact thrown message. A regression that throws a *different* `InvalidOperationException` from inside `AlarmCommandHandler` (for example its own "already subscribed" guard at line 73 of `AlarmCommandHandler.cs`) now fails the message-contains assertion — the original test's type-only `Assert.Throws` would have passed silently while hiding the swallowed failure cause. The disposal assertion (`consumer.Disposed == true`) is unchanged; the test now pins both the disposal contract and the origin of the propagated exception. XML doc on the test method documents the regression scenario. 
+ +### Worker.Tests-025 + +| Field | Value | +|---|---| +| Severity | Low | +| Category | mxaccessgw conventions | +| Location | `src/MxGateway.Worker.Tests/TestSupport/LiveMxAccessFactAttribute.cs:23`, `src/MxGateway.IntegrationTests/IntegrationTestEnvironment.cs:5`, `src/MxGateway.IntegrationTests/LiveMxAccessFactAttribute.cs:9-12` | +| Status | Resolved | + +**Description:** Worker.Tests-018 resolved the silent-skip issue by adding a Worker.Tests-local `LiveMxAccessFactAttribute`. The resolution called out that "introducing a cross-project shared assembly was not practical" because Worker.Tests targets net48/x86 and IntegrationTests targets net10.0. The two copies are correct today but the contract is held only by convention — both define `LiveMxAccessVariableName = "MXGATEWAY_RUN_LIVE_MXACCESS_TESTS"` as separate `public const string` literals, with the same `=="1"` `StringComparison.Ordinal` check duplicated. The IntegrationTests copy delegates to `IntegrationTestEnvironment.LiveMxAccessTestsEnabled`/`IsEnabled`, so any future opt-in tweak (e.g. accepting `"true"` as well, or honouring a different env-var name) made in `IntegrationTestEnvironment` will silently leave Worker.Tests behind. The XML doc on the Worker.Tests copy acknowledges this risk in prose but the divergence is invisible at compile time — there's no test or assertion that pins the two opt-in checks return the same answer. + +**Recommendation:** Either (a) lift the env-var-name string into `MxGateway.Contracts` (which already multi-targets `net10.0;net48`) as a `public const string`, then both `LiveMxAccessFactAttribute` copies reference the same constant; (b) add a single unit test in Worker.Tests that pins `LiveMxAccessFactAttribute.LiveMxAccessVariableName == "MXGATEWAY_RUN_LIVE_MXACCESS_TESTS"` to make the contract literal-visible to any reviewer changing the name; (c) document the synchronization requirement in `docs/GatewayTesting.md` alongside the existing live-opt-in section. 
+ +**Resolution:** 2026-05-20 — Added `GatewayContractInfo.LiveMxAccessOptInVariableName` to `MxGateway.Contracts` (net10.0/net48-multi-targeted) and routed both `LiveMxAccessFactAttribute` copies plus `IntegrationTestEnvironment.LiveMxAccessVariableName` through that single constant; the env-var literal now lives in one place. + +### Worker.Tests-026 + +| Field | Value | +|---|---| +| Severity | Low | +| Category | Code organization & conventions | +| Location | `src/MxGateway.Worker/MxAccess/MxAccessSession.cs:74-88` | +| Status | Resolved | + +**Description:** `MxAccessSession.CreateForTesting` (added in Worker.Tests-016) is declared `internal static`, gated only by the `InternalsVisibleTo` item in `MxGateway.Worker.csproj`. The XML doc states "production code must use the `Create` factory", but there is no runtime enforcement. The protection rests on (1) the `internal` modifier — which silently widens if any future `InternalsVisibleTo` directive is added (e.g. for an integration-test shim, a benchmark project, or an `InternalsVisibleTo`-using analyzer); and (2) reviewer attention. Worker.Tests itself contains real STA-running test code (the live tests, the probes), so a future test in Worker.Tests could call `CreateForTesting` from a context that has a real MXAccess COM object and the `new object()` placeholder would silently substitute. The factory hands out a session with `mxAccessComObject = new object()` so any code that later goes through `Marshal.IsComObject` or `Marshal.FinalReleaseComObject` on it would simply return false / no-op, masking lifetime regressions. + +**Recommendation:** Add a one-line conditional guard — e.g. `[Conditional("DEBUG")]` is not appropriate (the worker also ships Release builds), but the factory could check that `eventSink` is *not* an `MxAccessBaseEventSink` (the production sink), throwing `InvalidOperationException("CreateForTesting must not be used with the production MxAccessBaseEventSink")`. 
Production code never passes that sink to a "for testing" factory; the asymmetry is the cheapest signal. Alternatively, gate the factory with `[Obsolete("Test seam — never call from production code", error: false)]` so any production call surfaces as a build warning (and `TreatWarningsAsErrors` would turn that into a build break). + +**Resolution:** 2026-05-20 — Added a runtime guard to `MxAccessSession.CreateForTesting` that throws `ArgumentException` when the supplied `eventSink` is an `MxAccessBaseEventSink` (the production sink), so any future caller wiring the live sink into the test factory fails fast instead of silently bypassing `Marshal.IsComObject` on the `new object()` placeholder. + +### Worker.Tests-027 + +| Field | Value | +|---|---| +| Severity | Low | +| Category | Concurrency & thread safety | +| Location | `src/MxGateway.Worker.Tests/TestSupport/FakeRuntimeSession.cs:174, 179-187` | +| Status | Resolved | + +**Description:** The consolidated `FakeRuntimeSession` (introduced by Worker.Tests-014, extended for Worker.Tests-017) reads/writes `cancelledCorrelationIds`, `snapshot`, and `events` under `lock(gate)`. The new `CancelCommandReturnValue` (a `bool` set by the test) is mutated outside any lock and read inside `CancelCommand` outside the lock as well (`return CancelCommandReturnValue;` after the locked `cancelledCorrelationIds.Add`). For a plain `bool` set before the worker's message-loop runs this is harmless on x86 (atomic-on-aligned-write), but it contradicts the rest of the file's locking convention and a future test that flips `CancelCommandReturnValue` mid-dispatch from a different thread would see an undocumented race. The same applies to `BlockDispatch`, `ThrowAfterDispatchReleased`, `ThrowTimeoutOnShutdown`, and `Disposed` — all are `bool`/auto-property without the `gate` lock — but those existed before Worker.Tests-017 and the finding flags only the consistency drift the new property introduces. 
+ +**Recommendation:** Either (a) hold `lock(gate)` when reading `CancelCommandReturnValue` inside `CancelCommand`, matching the surrounding locked statement; (b) mark `CancelCommandReturnValue` with `volatile` to document the cross-thread visibility; or (c) add an XML-doc note stating the property must be set before `RunAsync` begins and is not safe to mutate mid-test. Option (c) is cheapest and matches how `BlockDispatch` is used today. + +**Resolution:** 2026-05-20 — Converted `CancelCommandReturnValue` to a private-backing-field property whose get/set both hold `lock(gate)`, and folded the return statement of `CancelCommand` inside the existing locked block, so the property now respects the same locking convention as `cancelledCorrelationIds`, `snapshot`, and `events`. + +### Worker.Tests-028 + +| Field | Value | +|---|---| +| Severity | Low | +| Category | Design-document adherence | +| Location | `docs/GatewayTesting.md`, `src/MxGateway.Worker.Tests/Probes/` | +| Status | Resolved | + +**Description:** The Worker.Tests-023 resolution (commit `a020350`) stated that option (b) was taken — moving the three probe files to `Probes/` — but the recommendation for option (b) was "move them into a `Probes/` subfolder inside `MxGateway.Worker.Tests` **and** add a one-paragraph header in `docs/GatewayTesting.md` describing the probe surface." The folder move was made; the documentation addition was not. `docs/GatewayTesting.md` has no mention of `Probes/`, `AlarmClientWmProbeTests`, `WnWrapConsumerProbeTests`, or `AlarmsLiveSmokeTests` (verified with `Grep` against the doc). A reader navigating `docs/GatewayTesting.md` to understand the testing surface cannot tell the probes exist, what they pin, or how to flip `Skip=null` on the dev rig — the only documentation is the in-source `Skip=...` strings and the per-probe XML doc. 
+ +**Recommendation:** Add a `## Dev-rig probes` (or similar) section to `docs/GatewayTesting.md` that names the three probe files, explains the probe contract (live AVEVA COM, `Skip=null` flip, no in-CI coverage), and points to the source location `src/MxGateway.Worker.Tests/Probes/`. One paragraph is enough; the existing `[Fact(Skip=...)]` strings carry the rest of the detail. + +**Resolution:** 2026-05-20 — Added a `## Dev-rig Probes` section to `docs/GatewayTesting.md` between the Live MXAccess Smoke and Live Galaxy Repository sections; the new section names the three probe files (`AlarmsLiveSmokeTests`, `AlarmClientWmProbeTests`, `WnWrapConsumerProbeTests`), explains the probe contract (live AVEVA COM, `Skip=null` flip on the dev rig, not part of the regression contract), and points to the source location `src/MxGateway.Worker.Tests/Probes/`. + +### Worker.Tests-029 + +| Field | Value | +|---|---| +| Severity | Low | +| Category | Code organization & conventions | +| Location | `src/MxGateway.Worker.Tests/Probes/AlarmsLiveSmokeTests.cs:9`, `src/MxGateway.Worker.Tests/Probes/AlarmClientWmProbeTests.cs:14`, `src/MxGateway.Worker.Tests/Probes/WnWrapConsumerProbeTests.cs:10` | +| Status | Resolved | + +**Description:** Worker.Tests-023 partitioned the probes by directory (`Probes/` subfolder) but kept their original namespace `namespace MxGateway.Worker.Tests;` rather than moving them to `namespace MxGateway.Worker.Tests.Probes;`. The folder/namespace mismatch is a minor C# convention drift (the project's other subfolder-grouped tests — `Bootstrap/`, `Conversion/`, `MxAccess/`, `Sta/`, `Ipc/`, `TestSupport/`, `Contracts/`, `ProjectStructure/` — all use a `MxGateway.Worker.Tests.<Folder>` namespace matching the directory). 
It also means an xUnit test filter like `--filter FullyQualifiedName~MxGateway.Worker.Tests.Probes` will discover zero tests, so the partition is invisible to the runner: any CI-side rule that wants to exclude probes still has to enumerate file/class names individually rather than match by namespace. + +**Recommendation:** Move the three probe files to `namespace MxGateway.Worker.Tests.Probes;`. xUnit discovers by attribute, not by namespace, so the rename is behaviour-neutral and lets a `FullyQualifiedName~Probes` filter trivially target them. The two other consolidations introduced in this sweep (`TestSupport/` → `MxGateway.Worker.Tests.TestSupport`) already follow this pattern. + +**Resolution:** 2026-05-20 — Moved `AlarmsLiveSmokeTests`, `AlarmClientWmProbeTests`, and `WnWrapConsumerProbeTests` to `namespace MxGateway.Worker.Tests.Probes;` so the folder and namespace match the project's other subfolder-grouped tests; a `FullyQualifiedName~MxGateway.Worker.Tests.Probes` filter now targets exactly the three probe classes. Verified by xUnit discovery output: the three probes appear under their new namespace as `[SKIP]`. + +### Worker.Tests-030 + +| Field | Value | +|---|---| +| Severity | Low | +| Category | Documentation & comments | +| Location | `src/MxGateway.Worker.Tests/Ipc/WorkerPipeSessionTests.cs:862-890` | +| Status | Resolved | + +**Description:** Within `WorkerPipeSessionTests`, the inbound-envelope helpers assign `Sequence` values that are inconsistent with the order in which the tests send them: `CreateGatewayHelloEnvelope` is `Sequence = 1`, `CreateCommandEnvelope` is `Sequence = 2`, `CreateShutdownEnvelope` is `Sequence = 3`, and `CreateCancelEnvelope` is `Sequence = 4`. The Worker.Tests-017 cancel test sends the cancel (`Sequence = 4`) **before** the shutdown (`Sequence = 3`) — a future reader inspecting the wire trace will see decreasing sequence numbers. 
The test still passes because the worker has no inbound sequence-monotonicity check (verified by `Grep`ing `Ipc/` for `ValidateSequence`/`monotonic`/sequence-comparison patterns — none exist). But `gateway.md` documents monotonic sequence numbers on the outbound side, and the test's literal sequence values suggest a convention that isn't enforced and can mislead a debugger correlating a frame dump to test intent. + +**Recommendation:** Either (a) reassign `CreateCancelEnvelope` to a sequence value `>` shutdown (or pass the sequence as a parameter, matching `CreateGatewayHelloEnvelope`'s parameter style), so the wire trace reads in ascending order; (b) add an XML-doc note on the cancel test stating that the worker has no inbound monotonicity check and the test ignores envelope sequence ordering; (c) parameterise all four helper methods so each test passes its desired sequence and the literal numbers stop carrying implicit meaning. Option (c) is the cleanest because `CreateGatewayHelloEnvelope` is already parameter-driven for nonce/version. + +**Resolution:** 2026-05-20 — Took option (c): parameterised `CreateGatewayHelloEnvelope`/`CreateCommandEnvelope`/`CreateCancelEnvelope`/`CreateShutdownEnvelope` with a `ulong sequence` argument (defaults 1/2/2/3 respectively, matching the typical Hello/Command/Cancel/Shutdown ordering), so the literal sequence values no longer carry implicit meaning. Updated the cancel-correlation test's wire trace to ascend (Hello=1, Cancel=2, Shutdown=3) and added a comment noting that the worker has no inbound monotonicity check — the parameter exists so multi-frame tests can pin the trace ordering explicitly when needed. 
diff --git a/code-reviews/Worker/findings.md b/code-reviews/Worker/findings.md index 25209ed..26cd611 100644 --- a/code-reviews/Worker/findings.md +++ b/code-reviews/Worker/findings.md @@ -5,26 +5,34 @@ | Module | `src/MxGateway.Worker` | | Reviewer | Claude Code | | Review date | 2026-05-20 | -| Commit reviewed | `1cd51bb` | +| Commit reviewed | `a020350` | | Status | Reviewed | | Open findings | 0 | ## Checklist coverage -This row reflects the 2026-05-20 re-review at commit `1cd51bb`. Worker-001..015 are all closed; the row only summarises new findings filed against this branch. +This row reflects the 2026-05-20 re-review at commit `a020350`. Worker-001..022 are all closed; the row only summarises new findings filed against this commit. The prior pass's fixes for Worker-016..022 were verified sound: + +- **Worker-016**: `StaRuntimeShutdownException` exists, `MxAccessStaSession.cs:261` is the only `catch (StaRuntimeShutdownException)` site in the module. No accidental catch elsewhere (grep verified). The graceful-shutdown vs. STA-affinity-violation distinction holds. +- **Worker-017**: `ReportWatchdogFaultIfNeededAsync` returns early when `CurrentCommandCorrelationId` is non-empty. Sound for the slow-but-progressing case; but see **Worker-023** — there is no defensive ceiling, so a truly stuck command (synchronous COM call hung against a dead MXAccess provider) leaves `CurrentCommandCorrelationId` non-empty forever and the worker-side watchdog is permanently suppressed. +- **Worker-018**: `SetXmlAlarmQuery` is now wrapped in `try/catch (COMException)` and re-thrown as `InvalidOperationException` carrying the HRESULT. Sound. +- **Worker-019**: `subscriptionExpression` field is gone. +- **Worker-020**: `_state is not WorkerState.Ready and not WorkerState.ExecutingCommand` simplified to `_state != WorkerState.Ready`. Confirmed `_state` is never assigned `ExecutingCommand`; volatile reads are atomic. 
+- **Worker-021**: `_runtimeSession ??=` in `InitializeMxAccessAsync` preserves a factory-supplied session. Confirmed `RunAsync` path bypasses `InitializeMxAccessAsync` entirely (it passes its own factory-driven lambda), so the `??=` only runs on the legacy parameterless-`CompleteStartupHandshakeAsync` direct-invocation path. +- **Worker-022**: `MxAlarmSnapshot.cs` (now containing only `MxAlarmSnapshotRecord`), `MxAlarmStateKind.cs`, `MxAlarmTransitionEvent.cs` — filenames match their single public type; all three keep the `MxGateway.Worker.MxAccess` namespace. | # | Category | Result | |---|---|---| -| 1 | Correctness & logic bugs | Issues found: Worker-018 (`SetXmlAlarmQuery` return code ignored), Worker-019 (`subscriptionExpression` is write-only dead state), Worker-020 (dead `ExecutingCommand` arm in `ProcessCommandAsync` state check), Worker-021 (`InitializeMxAccessAsync` can overwrite an already-set `_runtimeSession`). | -| 2 | mxaccessgw conventions | Issue found: Worker-022 (`MxAlarmSnapshot.cs` declares three public types in one file). | -| 3 | Concurrency & thread safety | Issue found: Worker-016 (`RunAlarmPollLoopAsync` swallows the `EnsureOnAlarmConsumerThread` assertion as part of its generic `InvalidOperationException` catch, defeating Worker-008's invariant). | -| 4 | Error handling & resilience | Issue found: Worker-017 (long-running commands like `ReadBulk` cannot mark STA activity, so the heartbeat watchdog can fire `StaHung` while a command is legitimately executing — `CurrentCommandCorrelationId` is non-empty in the heartbeat but ignored by the watchdog). | -| 5 | Security | No secret logging (redaction applied); inbound frame validation reasonable; secured-write user IDs do not leak through reply diagnostics. No new issues found. | -| 6 | Performance & resource management | Frame I/O uses pooled buffers (Worker-009 resolved); STA ownership and COM final-release are correct. No new issues found. 
| -| 7 | Design-document adherence | Code matches `gateway.md` / `MxAccessWorkerInstanceDesign.md` / `WorkerFrameProtocol.md`. No new design drift. | -| 8 | Code organization & conventions | Issue found: Worker-022 (see row 2). | -| 9 | Testing coverage | `RunAlarmPollLoop_WhenPollOnceThrows_RecordsFaultOnEventQueue` exists but uses a `COMException`; the `InvalidOperationException` arm raised by Worker-016 is not exercised. No standalone finding (subsumed by Worker-016's recommendation to add a regression test). | -| 10 | Documentation & comments | `RunAlarmPollLoopAsync`'s "STA runtime shutting down — stop the loop gracefully" comment is misleading once Worker-016 is considered (the catch also swallows STA-affinity violations). Noted in Worker-016. | +| 1 | Correctness & logic bugs | Issue found: Worker-025 (`RunAsync` does not null-check the result of `_runtimeSessionFactory()`; a null factory return would NRE on `_runtimeSession.StartAsync(...)` rather than throw a diagnostic exception). | +| 2 | mxaccessgw conventions | No issues found. The split alarm-snapshot files match the one-public-type-per-file convention; namespace consistency verified. | +| 3 | Concurrency & thread safety | Issue found: Worker-024 (the alarm command path — `Subscribe`/`Acknowledge`/`AcknowledgeByName`/`QueryActive`/`Unsubscribe` — has no STA-affinity assertion equivalent to Worker-008's `EnsureOnAlarmConsumerThread` guard; only the alarm *poll* path enforces affinity, leaving a latent gap if a future refactor lets alarm commands run off-STA). | +| 4 | Error handling & resilience | Issue found: Worker-023 (Worker-017's watchdog skip has no defensive ceiling; a truly stuck command — synchronous COM hung against a dead MXAccess provider — keeps `CurrentCommandCorrelationId` non-empty indefinitely, and the worker-side `StaHung` watchdog never fires. Gateway-side `CommandTimeout` is the only safety net). | +| 5 | Security | No issues found. 
No secret logging on the alarm path; the dropped-reply diagnostic Worker-003 added logs only the correlation id and command method, not the command payload. | +| 6 | Performance & resource management | No new issues found. Frame I/O still uses pooled buffers (Worker-009); STA join timeouts in `Dispose` are bounded. | +| 7 | Design-document adherence | No new design drift. The split alarm files preserve the documented public API surface. Worker-017's resolution comment documents the watchdog design intent — though see Worker-023 for the documentation gap on truly-stuck commands. | +| 8 | Code organization & conventions | No issues found. Worker-022 was the last file-organization issue. | +| 9 | Testing coverage | Worker-016 and Worker-017 each have direct regression tests (`RunAlarmPollLoop_WhenPollOnceThrowsInvalidOperation_RecordsFaultOnEventQueue`, `RunAsync_WhenStaActivityIsStaleWithCommandInFlight_DoesNotWriteWatchdogFault`). Worker-018, -020, -021's resolution notes state "no new regression test was added in this agent because Worker.Tests is being modified by a concurrent agent" — Worker-018's `SetXmlAlarmQuery` failure-translation and Worker-020's simplified `_state != Ready` check have no regression test in this branch yet. No standalone finding — these are documented gaps in the resolution notes of the prior pass. | +| 10 | Documentation & comments | No new issues. Worker-017's XML doc on `ReportWatchdogFaultIfNeededAsync` documents the design intent clearly; the `_runtimeSession ??=` reasoning is documented inline; Worker-016's graceful-vs-affinity distinction is documented at both catch sites. | ## Findings @@ -367,3 +375,66 @@ This row reflects the 2026-05-20 re-review at commit `1cd51bb`. 
Worker-001..015 **Recommendation:** Move `MxAlarmStateKind` and `MxAlarmTransitionEvent` into their own files (`MxAlarmStateKind.cs`, `MxAlarmTransitionEvent.cs`) and leave `MxAlarmSnapshotRecord` in `MxAlarmSnapshot.cs` (or rename the file to `MxAlarmSnapshotRecord.cs` to match the surviving type). Pure file-organization change; no behaviour or namespace impact. **Resolution:** 2026-05-20 — Split `MxAlarmSnapshot.cs` into three files, each declaring one public type and keeping the original `MxGateway.Worker.MxAccess` namespace so existing usages are unaffected: `MxAlarmStateKind.cs` (the enum, with its XML doc), `MxAlarmTransitionEvent.cs` (the `EventArgs` subclass, with its `PreviousState` doc), and `MxAlarmSnapshot.cs` (now containing only `MxAlarmSnapshotRecord` plus its XML doc). Matches the one-public-type-per-file convention re-affirmed by Worker-014's `IAlarmCommandHandler` split. Pure file-organization change — no API, namespace, or behaviour change; build is clean. + +### Worker-023 + +| Field | Value | +|---|---| +| Severity | Medium | +| Category | Error handling & resilience | +| Location | `src/MxGateway.Worker/Ipc/WorkerPipeSession.cs:610-668`, `src/MxGateway.Worker/MxAccess/MxAccessCommandExecutor.cs:124-153` | +| Status | Resolved | + +**Description:** Worker-017 (resolved at `a020350`) suppresses the `StaHung` watchdog when `CurrentCommandCorrelationId` is non-empty: "the STA is busy executing a command, not hung." The fix is correct for the motivating case (legitimately slow `ReadBulk` against many uncached tags) — gateway-side per-command timeouts (`WorkerClient.InvokeAsync`'s `timeout` parameter, see `src/MxGateway.Server/Workers/WorkerClient.cs:189-218`) eventually fail the command and may kill the worker. 
**But the suppression has no defensive ceiling.** Most MXAccess commands in `MxAccessCommandExecutor` — `Register`, `AddItem`, `Advise`, `Write`, `WriteSecured`, and their bulk variants — call directly into the MXAccess COM object **with no internal deadline**. If a COM call hangs (e.g. the MXAccess provider crashed and the cross-apartment marshaler is permanently blocked, or a write completion never fires), `StaRuntime.ProcessQueuedCommands` is stuck inside `workItem.Execute()`, `StaCommandDispatcher.currentCommandCorrelationId` stays non-empty forever, and `ReportWatchdogFaultIfNeededAsync` will short-circuit on every heartbeat. The worker-side `StaHung` watchdog — the only signal that distinguishes a hung STA from a slow gateway response from inside the worker — is permanently defeated for that session. Gateway-side `CommandTimeout` is the safety net, but it depends on the gateway operator picking a sensible per-command timeout (some bulk operations legitimately set this to many minutes), and it does not surface a worker-originated diagnostic (`StaHung` fault category, `LastStaActivityUtc` value) to the gateway audit trail. + +**Recommendation:** Add a defensive upper bound, distinct from `HeartbeatGrace`, after which the watchdog fires even when a command is in flight — e.g. `HeartbeatStuckCeiling` (default 5× `HeartbeatGrace` = 75s, or align with the longest reasonable per-command timeout). Pseudocode for the in-flight branch: + +```csharp +if (!string.IsNullOrEmpty(snapshot.CurrentCommandCorrelationId) + && staleFor <= _sessionOptions.HeartbeatStuckCeiling) +{ + return; // slow command — gateway will time out if needed +} +// staleFor > ceiling OR no command in flight — fire StaHung +``` + +Document the ceiling in `MxAccessWorkerInstanceDesign.md`'s watchdog section. Add a regression test that drives `RunAsync` with `CurrentCommandCorrelationId` non-empty and `LastStaActivityUtc` stale beyond the ceiling, asserting `WorkerFaultCategory.StaHung` is emitted. 
+ +**Resolution:** 2026-05-20 — Added `WorkerPipeSessionOptions.HeartbeatStuckCeiling` (default 75s = 5 × `HeartbeatGrace`) and extended `WorkerPipeSession.ReportWatchdogFaultIfNeededAsync` so the in-flight-command suppression is bounded by the ceiling: once `staleFor > HeartbeatStuckCeiling` the watchdog fires `StaHung` even with `CurrentCommandCorrelationId` non-empty. A truly stuck synchronous COM call (dead provider, blocked marshaler) no longer permanently defeats the worker-side watchdog. The ceiling is validated at startup (`> 0` and `> HeartbeatGrace`). Documented in the new XML doc on `HeartbeatStuckCeiling` and in `docs/MxAccessWorkerInstanceDesign.md`'s "Heartbeat And Watchdog" section. Regression test `WorkerPipeSessionTests.RunAsync_WhenStaActivityIsStaleBeyondCeilingWithCommandInFlight_WritesWatchdogFault` drives `RunAsync` with a non-empty current-command id and stale activity beyond the ceiling, asserting `WorkerFaultCategory.StaHung` is emitted. The existing `RunAsync_WhenStaActivityIsStaleWithCommandInFlight_DoesNotWriteWatchdogFault` test (5s stale, default 75s ceiling) continues to pass, confirming the suppression still works within the ceiling. + +### Worker-024 + +| Field | Value | +|---|---| +| Severity | Low | +| Category | Concurrency & thread safety | +| Location | `src/MxGateway.Worker/MxAccess/AlarmCommandHandler.cs:63-187`, `src/MxGateway.Worker/MxAccess/MxAccessStaSession.cs:191-323` | +| Status | Resolved | + +**Description:** Worker-008 (resolved 2026-05-18) introduced `MxAccessStaSession.AssertOnAlarmConsumerThread(int?, int)`, called from `EnsureOnAlarmConsumerThread()` in the marshalled poll lambda at `RunAlarmPollLoopAsync` (`MxAccessStaSession.cs:247`). The assertion catches a regression that runs `IMxAccessAlarmConsumer.PollOnce()` off the STA — exactly the deadlock-on-cross-apartment-marshaling risk the `ThreadingModel=Apartment` wnwrap consumer demands. 
**However, the assertion guards only the poll path.** `AlarmCommandHandler.Subscribe`, `Acknowledge`, `AcknowledgeByName`, `QueryActive`, and `Unsubscribe` — each of which calls into the same `IMxAccessAlarmConsumer` and ultimately the COM object — have no equivalent guard. Today they are reached only through `MxAccessCommandExecutor.Execute` → `StaCommandDispatcher.ExecuteQueuedCommandAsync` → `staRuntime.InvokeAsync(...)`, so they do run on the STA in production. But the invariant is enforced only by *convention* (the same convention Worker-008 made explicit for `PollOnce`); a future refactor that lets a test or a refactored fast-path call into the handler off-STA would silently break the same apartment rule, and the wnwrap COM call would block on marshaling rather than fail loudly. + +**Recommendation:** Add an `EnsureOnAlarmConsumerThread()`-equivalent assertion at the entry of each `AlarmCommandHandler` operation that touches the consumer (`Subscribe` is the highest-value site because it constructs the consumer; `Acknowledge*` and `QueryActive` next). Reuse `MxAccessStaSession.AssertOnAlarmConsumerThread` so the affinity invariant has a single canonical guard. Wire the expected thread id through the handler's constructor (today `AlarmCommandHandler` does not know the STA thread id — `MxAccessStaSession` captures it at line 191 but does not pass it). One implementation shape: hand the handler a small `IThreadAffinityGuard` whose `Verify()` is called at each entry, constructed by `MxAccessStaSession` once `alarmConsumerThreadId` is captured. + +**Resolution:** 2026-05-20 — Extended `AlarmCommandHandler` with a third constructor that takes an optional `Action? threadAffinityCheck`, and invoked the guard at the entry of every method that touches the underlying `IMxAccessAlarmConsumer`: `Subscribe`, `Unsubscribe`, `Acknowledge`, `AcknowledgeByName`, `QueryActive`, and `PollOnce`. 
The factory signature on `MxAccessStaSession` was widened (the factory `Func<…>` now also receives the thread-affinity check delegate as its second argument), so `MxAccessStaSession` (which captures `alarmConsumerThreadId` at the factory call site, already running inside `staRuntime.InvokeAsync`) can pass its existing `EnsureOnAlarmConsumerThread` as the guard — keeping the affinity invariant on a single canonical check, `AssertOnAlarmConsumerThread`. `WorkerPipeSession`'s three factory wiring sites were updated to `(eq, affinity) => new AlarmCommandHandler(eq, () => new WnWrapAlarmConsumer(), affinity)`. The previous two-arg `AlarmCommandHandler` constructor remains (now delegating with `threadAffinityCheck: null`) so existing `AlarmCommandHandlerTests` continue to exercise the handler on a single thread without configuring a guard. Regression tests `AlarmCommandHandlerTests.EveryCommandPathEntry_InvokesThreadAffinityGuard` (counts invocations across all six entry points) and `EveryCommandPathEntry_PropagatesAffinityGuardException` (a throwing guard propagates from every entry point) verify the wiring. + +### Worker-025 + +| Field | Value | +|---|---| +| Severity | Low | +| Category | Correctness & logic bugs | +| Location | `src/MxGateway.Worker/Ipc/WorkerPipeSession.cs:111-117` | +| Status | Resolved | + +**Description:** `RunAsync` assigns `_runtimeSession = _runtimeSessionFactory()` (line 111) and immediately dereferences `_runtimeSession.StartAsync(...)` inside the lambda at line 115. If the supplied factory ever returns `null`, the lambda will throw `NullReferenceException` rather than a diagnostic exception, and the `finally` block at line 128 (`_runtimeSession?.Dispose()`) silently no-ops. The production factories (`() => new MxAccessStaSession(...)` in the two convenience constructors) never return null, but the factory delegate type (a parameterless `Func<T>` returning the runtime session) admits null returns and the constructor's `runtimeSessionFactory ?? throw` null-check at line 102 only validates the delegate itself, not its return value.
The `InitializeMxAccessAsync` direct-invocation path uses `_runtimeSession ??= new MxAccessStaSession(...)` (line 840), so a null factory return there would be replaced with a default instance — different behavior from the `RunAsync` path. + +**Recommendation:** Promote the null check to the call site: + +```csharp +_runtimeSession = _runtimeSessionFactory() + ?? throw new InvalidOperationException("Worker runtime session factory returned null."); +``` + +Match the pattern `AlarmCommandHandler.Subscribe` already uses for `consumerFactory()` (`AlarmCommandHandler.cs:76-77`). + +**Resolution:** 2026-05-20 — `WorkerPipeSession.RunAsync` now uses `_runtimeSession = _runtimeSessionFactory() ?? throw new InvalidOperationException("Worker runtime session factory returned null.");`, matching the pattern `AlarmCommandHandler.Subscribe` uses for its `consumerFactory()`. A null factory return now produces a clear diagnostic exception at the call site instead of NRE-ing on the next dereference (and the `finally` block's `_runtimeSession?.Dispose()` silently no-oping on a half-initialized session). Regression test `WorkerPipeSessionTests.RunAsync_WhenRuntimeSessionFactoryReturnsNull_ThrowsDiagnosticException` drives `RunAsync` with `() => null!` and asserts the diagnostic `InvalidOperationException` is thrown with the expected message. 
diff --git a/docs/Authorization.md b/docs/Authorization.md index 8eddddc..2d2cdb6 100644 --- a/docs/Authorization.md +++ b/docs/Authorization.md @@ -123,10 +123,14 @@ private static string ResolveCommandScope(MxCommandKind kind) return kind switch { MxCommandKind.Write or - MxCommandKind.Write2 => GatewayScopes.InvokeWrite, + MxCommandKind.Write2 or + MxCommandKind.WriteBulk or + MxCommandKind.Write2Bulk => GatewayScopes.InvokeWrite, MxCommandKind.WriteSecured or MxCommandKind.WriteSecured2 or + MxCommandKind.WriteSecuredBulk or + MxCommandKind.WriteSecured2Bulk or MxCommandKind.AuthenticateUser => GatewayScopes.InvokeSecure, MxCommandKind.ArchestraUserToId or @@ -141,7 +145,7 @@ private static string ResolveCommandScope(MxCommandKind kind) } ``` -Reads (`Register`, `AddItem`, `Advise`, and any other unspecified kind) fall through to `InvokeRead`, which keeps the matrix small while still separating reads from writes, secured writes, metadata lookups, event drains, and worker shutdown. +Reads (`Register`, `AddItem`, `Advise`, `ReadBulk`, and any other unspecified kind) fall through to `InvokeRead`, which keeps the matrix small while still separating reads from writes, secured writes, metadata lookups, event drains, and worker shutdown. The four bulk-write families (`WriteBulk`, `Write2Bulk`, `WriteSecuredBulk`, `WriteSecured2Bulk`) are mapped explicitly so a missing arm cannot silently demote a bulk write to a read scope. ## Constraint Enforcement @@ -174,11 +178,18 @@ page create dialog (see dashboard API Keys page also renders each key's effective constraints. The service checks read constraints for `AddItem`, `AddItem2`, `AddItemBulk`, -`SubscribeBulk`, and `AdviseItemBulk`. It checks write constraints for -`Write`, `Write2`, `WriteSecured`, and `WriteSecured2`. Successful item -registrations are tracked per session so later item-handle commands resolve -back to the original tag address. 
If a constrained key presents an unknown item -handle, the gateway fails closed. +`SubscribeBulk`, `AdviseItemBulk`, and `ReadBulk`. It checks write constraints +for `Write`, `Write2`, `WriteSecured`, `WriteSecured2`, `WriteBulk`, +`Write2Bulk`, `WriteSecuredBulk`, and `WriteSecured2Bulk`. Bulk commands run +through `BulkConstraintPlan` (`ReadBulkConstraintPlan`, +`WriteBulkConstraintPlan`, `SubscribeBulkConstraintPlan`), which preserves the +caller's input order: each entry is evaluated against the constraint surface, +and `BulkConstraintPlan.MergeDeniedInto` re-merges denied entries back into +their original index positions so the reply slot at `entries[i]` always +corresponds to the request slot at `entries[i]`. Successful item registrations +are tracked per session so later item-handle commands resolve back to the +original tag address. If a constrained key presents an unknown item handle, +the gateway fails closed. Non-bulk constraint failures return gRPC `PermissionDenied`. Bulk read commands preserve input order and return a failed `SubscribeResult` for each @@ -195,7 +206,7 @@ blocking constraint; secured values and raw credentials are never logged. 
| `SessionOpen` | `session:open` | `OpenSessionRequest` | | `SessionClose` | `session:close` | `CloseSessionRequest` | | `EventsRead` | `events:read` | `StreamEventsRequest`, `QueryActiveAlarmsRequest`, `MxCommandKind.DrainEvents` | -| `InvokeRead` | `invoke:read` | `MxCommandRequest` for read-style command kinds (`Register`, `AddItem`, `Advise`, and any kind not otherwise mapped) | +| `InvokeRead` | `invoke:read` | `MxCommandRequest` for read-style command kinds (`Register`, `AddItem`, `Advise`, `ReadBulk`, and any kind not otherwise mapped) | | `InvokeWrite` | `invoke:write` | `AcknowledgeAlarmRequest`, `MxCommandKind.Write`, `MxCommandKind.Write2`, `MxCommandKind.WriteBulk`, `MxCommandKind.Write2Bulk` | | `InvokeSecure` | `invoke:secure` | `MxCommandKind.WriteSecured`, `MxCommandKind.WriteSecured2`, `MxCommandKind.WriteSecuredBulk`, `MxCommandKind.WriteSecured2Bulk`, `MxCommandKind.AuthenticateUser` | | `MetadataRead` | `metadata:read` | `MxCommandKind.ArchestraUserToId`, `MxCommandKind.GetSessionState`, `MxCommandKind.GetWorkerInfo`, `GalaxyRepository.TestConnection`, `GalaxyRepository.GetLastDeployTime`, `GalaxyRepository.DiscoverHierarchy`, `GalaxyRepository.WatchDeployEvents` | diff --git a/docs/GatewayTesting.md b/docs/GatewayTesting.md index 27dfcc1..901d13e 100644 --- a/docs/GatewayTesting.md +++ b/docs/GatewayTesting.md @@ -104,6 +104,36 @@ The test output includes session id, worker process id, command status, HRESULT/status diagnostics, event sequence and handles, close status, and worker stdout/stderr lines emitted during the run. +## Dev-rig Probes + +`src/MxGateway.Worker.Tests/Probes/` partitions runtime probes from the regular +Worker.Tests regression suite. The folder is its own +`MxGateway.Worker.Tests.Probes` namespace so a discovery filter (e.g. `dotnet +test --filter FullyQualifiedName~MxGateway.Worker.Tests.Probes`) can target or +exclude them without enumerating individual class names. 
The probes are +`[Fact(Skip = "...")]` by default and exist to characterize live AVEVA +behavior on the dev rig, not to gate CI — flip `Skip = null` on the dev box +with installed MXAccess + a running Galaxy provider when running them: + +- `AlarmsLiveSmokeTests` — end-to-end smoke for the alarms-over-gateway + pipeline (`WnWrapAlarmConsumer` + `AlarmDispatcher` + + `MxAccessAlarmEventSink`) against `\\\Galaxy!DEV` with the dev rig's + 10-second flip script writing `TestMachine_001.TestAlarm001`. +- `AlarmClientWmProbeTests` — registers as an `AlarmClient` consumer on a real + hidden message-only window and logs every Win32 message that arrives during + a fixed pump window. Used to identify the `WM_APP` / + `RegisterWindowMessage` IDs alarm callbacks use. +- `WnWrapConsumerProbeTests` — instantiates AVEVA's standalone `wnwrapConsumer` + COM class, subscribes to the dev rig's `\\\Galaxy!DEV` provider, + and polls `GetXmlCurrentAlarms2`. The XML payload bypasses the + `FILETIME→DateTime` auto-marshaling that crashes + `aaAlarmManagedClient.AlarmClient.GetHighPriAlarm` on this rig. + +The probes share the Worker.Tests project (so they can use its `net48`/`x86` +configuration and the installed `ArchestrA.MxAccess` / `aaAlarmManagedClient` +references), but they are not part of the regression contract — a Worker.Tests +run with `Skip` left in place passes them as skipped. + ## Live Galaxy Repository `GalaxyRepositoryLiveTests` in `src/MxGateway.IntegrationTests/Galaxy/` exercises diff --git a/docs/MxAccessWorkerInstanceDesign.md b/docs/MxAccessWorkerInstanceDesign.md index f2eeafc..9d61235 100644 --- a/docs/MxAccessWorkerInstanceDesign.md +++ b/docs/MxAccessWorkerInstanceDesign.md @@ -672,6 +672,23 @@ heartbeat fields until dedicated thresholds own those warnings. The worker reports stale STA activity, but the gateway owns the final kill decision through its existing heartbeat and worker lifecycle policy. 
+The in-flight-command suppression itself is bounded by +`WorkerPipeSessionOptions.HeartbeatStuckCeiling` (default 75 seconds = 5 × +`HeartbeatGrace`). The motivating case for the suppression is a legitimately +slow synchronous command — but a genuinely stuck COM call (for example +against a dead MXAccess provider whose cross-apartment marshaler is +permanently blocked, or a write completion that never fires) leaves +`CurrentCommandCorrelationId` non-empty indefinitely. Without an upper bound +the worker-side `StaHung` watchdog would be permanently defeated for that +session and only the gateway's per-command timeout would catch the hang — +losing the worker-originated diagnostic (`StaHung` fault category, the +stale-by interval) from the gateway audit trail. Once `LastStaActivityUtc` +has been stale for longer than `HeartbeatStuckCeiling`, the watchdog fires +`StaHung` regardless of whether a command is in flight, on the assumption +that no legitimate STA command should run that long without periodically +refreshing activity. Deployments that legitimately run very long bulk +operations should raise the ceiling rather than disable it. + ## Shutdown Graceful shutdown sequence: diff --git a/src/MxGateway.Contracts/GatewayContractInfo.cs b/src/MxGateway.Contracts/GatewayContractInfo.cs index d903ae0..8127e67 100644 --- a/src/MxGateway.Contracts/GatewayContractInfo.cs +++ b/src/MxGateway.Contracts/GatewayContractInfo.cs @@ -13,4 +13,14 @@ public static class GatewayContractInfo public const uint WorkerProtocolVersion = 1; public const string DefaultBackendName = "mxaccess-worker"; + + /// + /// Environment variable name that opts an xUnit suite into running live + /// MXAccess COM tests. Single source of truth shared by both + /// MxGateway.IntegrationTests.LiveMxAccessFactAttribute and + /// MxGateway.Worker.Tests.TestSupport.LiveMxAccessFactAttribute + /// so any future opt-in tweak does not silently leave one project + /// behind — see Worker.Tests-025. 
+ /// + public const string LiveMxAccessOptInVariableName = "MXGATEWAY_RUN_LIVE_MXACCESS_TESTS"; } diff --git a/src/MxGateway.Contracts/Generated/MxaccessGateway.cs b/src/MxGateway.Contracts/Generated/MxaccessGateway.cs index dbf85d4..8ba521a 100644 --- a/src/MxGateway.Contracts/Generated/MxaccessGateway.cs +++ b/src/MxGateway.Contracts/Generated/MxaccessGateway.cs @@ -19753,9 +19753,11 @@ namespace MxGateway.Contracts.Proto { /// /// Per-item result for the four bulk write families. `item_handle` mirrors the /// request entry's item_handle so callers can correlate inputs to outputs even - /// when the gateway's tag-allowlist filter dropped some entries before reaching - /// the worker. Per-item failures populate `error_message` + `hresult` and never - /// raise — callers iterate and inspect each entry. + /// when the gateway's per-entry `IConstraintEnforcer.CheckWriteHandleAsync` + /// filter (see `MxAccessGatewayService.ReplaceWriteBulkEntries` and + /// `docs/Authorization.md`) dropped some entries before reaching the worker. + /// Per-item failures populate `error_message` + `hresult` and never raise — + /// callers iterate and inspect each entry. /// [global::System.Diagnostics.DebuggerDisplayAttribute("{ToString(),nq}")] public sealed partial class BulkWriteResult : pb::IMessage @@ -20338,6 +20340,20 @@ namespace MxGateway.Contracts.Proto { /// an existing live subscription's last OnDataChange (the worker did not touch /// the subscription); false when the worker took the AddItem + Advise + wait + /// UnAdvise + RemoveItem snapshot lifecycle itself. + /// + /// On `was_successful = true`, `value`, `quality`, `source_timestamp`, and + /// `statuses` carry the read data (from the cached subscription or the snapshot + /// lifecycle, depending on `was_cached`) and `error_message` is empty. 
On + /// `was_successful = false`, only `server_handle`, `tag_address`, `item_handle` + /// (when allocated), `was_cached`, and `error_message` are populated; `value`, + /// `quality`, `source_timestamp`, and `statuses` are left at their proto3 + /// defaults (null / 0 / null / empty) and must not be read as data — they are + /// wire-indistinguishable from "value is null with quality bad" data and serve + /// only as absent markers. ReadBulk has no `hresult` field by design (its + /// outcomes are timeout / cache / lifecycle states, not MXAccess COM return + /// codes — see `docs/DesignDecisions.md` "Bulk Command Family"). Per-tag + /// failures populate `error_message` and never raise — callers iterate and + /// inspect each entry. /// [global::System.Diagnostics.DebuggerDisplayAttribute("{ToString(),nq}")] public sealed partial class BulkReadResult : pb::IMessage diff --git a/src/MxGateway.Contracts/Protos/mxaccess_gateway.proto b/src/MxGateway.Contracts/Protos/mxaccess_gateway.proto index 36f9d73..9a74078 100644 --- a/src/MxGateway.Contracts/Protos/mxaccess_gateway.proto +++ b/src/MxGateway.Contracts/Protos/mxaccess_gateway.proto @@ -548,9 +548,11 @@ message BulkSubscribeReply { // Per-item result for the four bulk write families. `item_handle` mirrors the // request entry's item_handle so callers can correlate inputs to outputs even -// when the gateway's tag-allowlist filter dropped some entries before reaching -// the worker. Per-item failures populate `error_message` + `hresult` and never -// raise — callers iterate and inspect each entry. +// when the gateway's per-entry `IConstraintEnforcer.CheckWriteHandleAsync` +// filter (see `MxAccessGatewayService.ReplaceWriteBulkEntries` and +// `docs/Authorization.md`) dropped some entries before reaching the worker. +// Per-item failures populate `error_message` + `hresult` and never raise — +// callers iterate and inspect each entry. 
message BulkWriteResult { int32 server_handle = 1; int32 item_handle = 2; @@ -568,6 +570,20 @@ message BulkWriteReply { // an existing live subscription's last OnDataChange (the worker did not touch // the subscription); false when the worker took the AddItem + Advise + wait + // UnAdvise + RemoveItem snapshot lifecycle itself. +// +// On `was_successful = true`, `value`, `quality`, `source_timestamp`, and +// `statuses` carry the read data (from the cached subscription or the snapshot +// lifecycle, depending on `was_cached`) and `error_message` is empty. On +// `was_successful = false`, only `server_handle`, `tag_address`, `item_handle` +// (when allocated), `was_cached`, and `error_message` are populated; `value`, +// `quality`, `source_timestamp`, and `statuses` are left at their proto3 +// defaults (null / 0 / null / empty) and must not be read as data — they are +// wire-indistinguishable from "value is null with quality bad" data and serve +// only as absent markers. ReadBulk has no `hresult` field by design (its +// outcomes are timeout / cache / lifecycle states, not MXAccess COM return +// codes — see `docs/DesignDecisions.md` "Bulk Command Family"). Per-tag +// failures populate `error_message` and never raise — callers iterate and +// inspect each entry. 
message BulkReadResult { int32 server_handle = 1; string tag_address = 2; diff --git a/src/MxGateway.IntegrationTests/IntegrationTestEnvironment.cs b/src/MxGateway.IntegrationTests/IntegrationTestEnvironment.cs index 5e52cc5..db0c0cd 100644 --- a/src/MxGateway.IntegrationTests/IntegrationTestEnvironment.cs +++ b/src/MxGateway.IntegrationTests/IntegrationTestEnvironment.cs @@ -1,8 +1,16 @@ +using MxGateway.Contracts; + namespace MxGateway.IntegrationTests; public static class IntegrationTestEnvironment { - public const string LiveMxAccessVariableName = "MXGATEWAY_RUN_LIVE_MXACCESS_TESTS"; + /// <summary> + /// Sourced from <see cref="GatewayContractInfo.LiveMxAccessOptInVariableName"/> + /// so the env-var literal is shared with + /// MxGateway.Worker.Tests.TestSupport.LiveMxAccessFactAttribute + /// (Worker.Tests-025). + /// </summary> + public const string LiveMxAccessVariableName = GatewayContractInfo.LiveMxAccessOptInVariableName; public const string LiveMxAccessWorkerExecutableVariableName = "MXGATEWAY_LIVE_MXACCESS_WORKER_EXE"; public const string LiveMxAccessItemVariableName = "MXGATEWAY_LIVE_MXACCESS_ITEM"; public const string LiveMxAccessClientNameVariableName = "MXGATEWAY_LIVE_MXACCESS_CLIENT_NAME"; diff --git a/src/MxGateway.IntegrationTests/WorkerLiveMxAccessSmokeTests.cs b/src/MxGateway.IntegrationTests/WorkerLiveMxAccessSmokeTests.cs index d7735d9..b0b1311 100644 --- a/src/MxGateway.IntegrationTests/WorkerLiveMxAccessSmokeTests.cs +++ b/src/MxGateway.IntegrationTests/WorkerLiveMxAccessSmokeTests.cs @@ -1,5 +1,7 @@ using System.Collections.Concurrent; using System.Diagnostics; +using System.Diagnostics.CodeAnalysis; +using System.Text; using Google.Protobuf.WellKnownTypes; using Grpc.Core; using Microsoft.Extensions.Logging; @@ -357,14 +359,6 @@ public sealed class WorkerLiveMxAccessSmokeTests(ITestOutputHelper output) .ConfigureAwait(false); LogEvent(firstDataChange); - // RecordingServerStreamWriter.Messages returns a snapshot copy under its own - // lock, so iterating after each teardown step is safe without external sync.
- int dataChangeCountBeforeUnadvise = CountMatchingEvents( - eventWriter, - e => e.Family == MxEventFamily.OnDataChange - && e.ServerHandle == serverHandle - && e.ItemHandle == itemHandle); - // 1) UnAdvise — must reply Ok; the worker must stop emitting OnDataChange // for this (server, item) pair after this returns. MxCommandReply unadviseReply = await fixture.Service.Invoke( @@ -390,21 +384,33 @@ public sealed class WorkerLiveMxAccessSmokeTests(ITestOutputHelper output) Assert.Equal(ProtocolStatusCode.Ok, unregisterReply.ProtocolStatus.Code); Assert.Equal(MxCommandKind.Unregister, unregisterReply.Kind); - // Allow a short settle window for any in-flight OnDataChange to drain, then - // assert no further events arrived for the un-advised (serverHandle, itemHandle). - // MXAccess parity: after UnAdvise the provider must stop publishing OnDataChange - // for this item — a regression that left a stale subscription alive would surface - // as additional events after this delay. + // Parity rule: after UnAdvise returns Ok the worker must stop emitting + // OnDataChange for this (server, item) pair. Events the provider already + // published before that ack are in-flight and not a regression — the rule + // only constrains events generated AFTER the teardown returned. So the + // "before" baseline is taken *after* a first settle window drains those + // in-flight events, not before UnAdvise was issued (which races against + // the round-trip + STA dispatch + pipe send window — see IntegrationTests-017). + // + // RecordingServerStreamWriter.Messages returns a snapshot copy under its + // own lock, so iterating after each settle window is safe without external + // sync. 
await Task.Delay(TimeSpan.FromMilliseconds(500)).ConfigureAwait(false); + int dataChangeCountAfterFirstSettle = CountMatchingEvents( + eventWriter, + e => e.Family == MxEventFamily.OnDataChange + && e.ServerHandle == serverHandle + && e.ItemHandle == itemHandle); - int dataChangeCountAfterTeardown = CountMatchingEvents( + await Task.Delay(TimeSpan.FromMilliseconds(500)).ConfigureAwait(false); + int dataChangeCountAfterSecondSettle = CountMatchingEvents( eventWriter, e => e.Family == MxEventFamily.OnDataChange && e.ServerHandle == serverHandle && e.ItemHandle == itemHandle); output.WriteLine( - $"DataChange count before UnAdvise={dataChangeCountBeforeUnadvise} after teardown+settle={dataChangeCountAfterTeardown}"); - Assert.Equal(dataChangeCountBeforeUnadvise, dataChangeCountAfterTeardown); + $"DataChange count after first settle={dataChangeCountAfterFirstSettle} after second settle={dataChangeCountAfterSecondSettle}"); + Assert.Equal(dataChangeCountAfterFirstSettle, dataChangeCountAfterSecondSettle); // A RemoveItem against the just-freed item handle must not silently succeed — // the worker has to relay MXAccess's invalid-handle response. Closing the @@ -438,8 +444,16 @@ public sealed class WorkerLiveMxAccessSmokeTests(ITestOutputHelper output) File.Exists(workerExecutablePath), $"Live MXAccess worker executable was not found at {workerExecutablePath}. Build the worker or set {IntegrationTestEnvironment.LiveMxAccessWorkerExecutableVariableName}."); - TestWorkerProcessFactory processFactory = new(output); - await using GatewayServiceFixture fixture = new(workerExecutablePath, processFactory, output); + // IntegrationTests-019: CLAUDE.md's credential-redaction rule covers every log + // surface the test sees, not just the reply's DiagnosticMessage. Wire a buffering + // wrapper around output and route the worker stdout/stderr echo and the gateway + // ILogger sink through it so the post-run assertion covers the accumulated test + // output. 
A regression that logged the request body, the WorkerCommandRequest + // envelope, or printed the credential from inside the worker is caught here + // even if the bare DiagnosticMessage check still passes. + RecordingTestOutputHelper recordedOutput = new(output); + TestWorkerProcessFactory processFactory = new(recordedOutput); + await using GatewayServiceFixture fixture = new(workerExecutablePath, processFactory, recordedOutput); // Stream events so a regression that emitted an OperationComplete or // OnWriteComplete with wrong handles would still be observable via the test // output (we don't assert a specific event here — the docs note successful @@ -450,6 +464,7 @@ public sealed class WorkerLiveMxAccessSmokeTests(ITestOutputHelper output) string? sessionId = null; Task? streamTask = null; using CancellationTokenSource streamCancellation = new(); + (string verifyUser, string verifyPassword) = ResolveLiveMxAccessSecuredCredentials(); try { @@ -473,32 +488,31 @@ public sealed class WorkerLiveMxAccessSmokeTests(ITestOutputHelper output) MxCommandReply registerReply = await fixture.Service.Invoke( CreateRegisterRequest(sessionId), new TestServerCallContext()).ConfigureAwait(false); - LogReply("Register", registerReply); + LogReplyTo(recordedOutput, "Register", registerReply); Assert.Equal(ProtocolStatusCode.Ok, registerReply.ProtocolStatus.Code); int serverHandle = registerReply.Register.ServerHandle; MxCommandReply addItemReply = await fixture.Service.Invoke( CreateAddItemRequest(sessionId, serverHandle), new TestServerCallContext()).ConfigureAwait(false); - LogReply("AddItem", addItemReply); + LogReplyTo(recordedOutput, "AddItem", addItemReply); Assert.Equal(ProtocolStatusCode.Ok, addItemReply.ProtocolStatus.Code); int itemHandle = addItemReply.AddItem.ItemHandle; MxCommandReply adviseReply = await fixture.Service.Invoke( CreateAdviseRequest(sessionId, serverHandle, itemHandle), new TestServerCallContext()).ConfigureAwait(false); - LogReply("Advise", adviseReply); 
+ LogReplyTo(recordedOutput, "Advise", adviseReply); Assert.Equal(ProtocolStatusCode.Ok, adviseReply.ProtocolStatus.Code); // AuthenticateUser resolves an ArchestrA user id for the WriteSecured call. // Credentials are env-overridable so the test honors the gateway's "do not // log secrets" rule and works against either MXAccess's own user store or // the LmxOpcUa-baseline GLAuth-bridged ArchestrA identity (admin/admin123). - (string verifyUser, string verifyPassword) = ResolveLiveMxAccessSecuredCredentials(); MxCommandReply authReply = await fixture.Service.Invoke( CreateAuthenticateUserRequest(sessionId, serverHandle, verifyUser, verifyPassword), new TestServerCallContext()).ConfigureAwait(false); - output.WriteLine( + recordedOutput.WriteLine( $"AuthenticateUser status={authReply.ProtocolStatus.Code} hresult={authReply.Hresult} user_id={authReply.AuthenticateUser?.UserId}"); // AuthenticateUser is allowed to fail (the underlying provider may reject @@ -518,7 +532,7 @@ public sealed class WorkerLiveMxAccessSmokeTests(ITestOutputHelper output) currentUserId, verifierUserId: 0), new TestServerCallContext()).ConfigureAwait(false); - LogReply("WriteSecured", writeSecuredReply); + LogReplyTo(recordedOutput, "WriteSecured", writeSecuredReply); // Parity: the command itself completed its round-trip — the reply kind is // WriteSecured and the gateway protocol status is set. The MXAccess outcome @@ -538,6 +552,13 @@ public sealed class WorkerLiveMxAccessSmokeTests(ITestOutputHelper output) streamCancellation.Cancel(); await ShutDownAsync(fixture, processFactory, sessionId, streamTask).ConfigureAwait(false); } + + // CLAUDE.md credential contract: passwords and WriteSecured payloads must never + // reach logs. The buffered output covers the gateway ILogger sink, worker + // stdout/stderr, and every direct WriteLine the test body issued. 
A regression + // that dumped the request envelope, the AuthenticateUserCommand body, or any + // command-level WriteSecured payload would land here and trip this assertion. + Assert.DoesNotContain(verifyPassword, recordedOutput.Captured, StringComparison.Ordinal); } /// @@ -611,15 +632,50 @@ public sealed class WorkerLiveMxAccessSmokeTests(ITestOutputHelper output) // The fault classification must come from a known worker-client error code so // operators get an actionable cause string rather than an opaque exception - // trace. We accept any of the abnormal-exit classifications WorkerClient - // routes through SetFaulted on a killed worker. + // trace. We accept the classifications WorkerClient actually drives on an + // abnormal exit (kill-the-process path): the read loop hits EndOfStream and + // calls SetFaulted with WorkerClientErrorCode.PipeDisconnected and the + // message "Worker pipe disconnected." (see WorkerClient.cs:378-381). The + // earlier broad list (including "worker") matched every WorkerClient fault + // message (they all begin with "Worker"); tighten to the pipe/disconnect/ + // end-of-stream classifications that match THIS path, so a regression that + // routed an unrelated fault here would surface as a test failure rather + // than silently passing (see IntegrationTests-020). "heartbeat" is dropped + // because HeartbeatGraceSeconds (15s) exceeds the StreamShutdownTimeout + // (10s) poll window, so a heartbeat-expired transition can never be + // observed inside this test. 
Assert.True( - observedFault!.Contains("disconnect", StringComparison.OrdinalIgnoreCase) - || observedFault.Contains("pipe", StringComparison.OrdinalIgnoreCase) - || observedFault.Contains("heartbeat", StringComparison.OrdinalIgnoreCase) - || observedFault.Contains("worker", StringComparison.OrdinalIgnoreCase) + observedFault!.Contains("pipe disconnected", StringComparison.OrdinalIgnoreCase) || observedFault.Contains("end of stream", StringComparison.OrdinalIgnoreCase), - $"Fault description '{observedFault}' did not match a known worker-exit classification."); + $"Fault description '{observedFault}' did not match a known abnormal-exit classification " + + "(expected 'pipe disconnected' or 'end of stream' from WorkerClient's EndOfStream path)."); + + // IntegrationTests-021: also assert the StreamEvents call observed the fault + // — the chain that puts the session into Faulted goes through ReadEventsAsync + // propagating a WorkerClientException into EventStreamService, which calls + // session.MarkFaulted. The gateway then maps the WorkerClientException to an + // RpcException at the public boundary (MxAccessGatewayService.MapException → + // MapWorkerClientException). Polling session.State alone would silently pass + // if a future refactor moved MarkFaulted off the stream-consumption path — + // assert the streamTask itself terminated with a fault so the test couples + // to the actual fault-propagation path. Compare to the inverse assertion in + // the Write parity test (line 217: Assert.False(streamTask.IsFaulted, ...)). 
+ try + { + await streamTask.WaitAsync(StreamShutdownTimeout).ConfigureAwait(false); + } + catch (Exception streamException) + { + output.WriteLine($"StreamEvents task terminated with: {streamException.GetType().Name}: {streamException.Message}"); + } + + Assert.True( + streamTask.IsCompleted, + "StreamEvents task did not complete within the shutdown timeout after the worker was killed."); + Assert.True( + streamTask.IsFaulted, + "StreamEvents task must fault on abnormal worker exit, not complete cleanly — " + + "the fault-propagation path from WorkerClient.SetFaulted through ReadEventsAsync is the contract."); } finally { @@ -948,12 +1004,20 @@ public sealed class WorkerLiveMxAccessSmokeTests(ITestOutputHelper output) string method, MxCommandReply reply) { - output.WriteLine( + LogReplyTo(output, method, reply); + } + + private static void LogReplyTo( + ITestOutputHelper sink, + string method, + MxCommandReply reply) + { + sink.WriteLine( $"{method} status={reply.ProtocolStatus.Code} hresult={reply.Hresult} diagnostic={reply.DiagnosticMessage}"); foreach (MxStatusProxy status in reply.Statuses) { - output.WriteLine( + sink.WriteLine( $"{method} mxstatus success={status.Success} category={status.Category} detail={status.Detail} text={status.DiagnosticText}"); } } @@ -1034,7 +1098,7 @@ public sealed class WorkerLiveMxAccessSmokeTests(ITestOutputHelper output) /// transitions it to Faulted, which the public gRPC API only exposes indirectly via /// CloseSession's reply (and not before a graceful close completes). /// - public bool TryGetSession(string sessionId, out GatewaySession session) + public bool TryGetSession(string sessionId, [MaybeNullWhen(false)] out GatewaySession session) { return _registry.TryGet(sessionId, out session); } @@ -1439,6 +1503,56 @@ public sealed class WorkerLiveMxAccessSmokeTests(ITestOutputHelper output) } } + /// + /// Buffering wrapper around an that mirrors every line + /// written through it into a the test owns. 
The WriteSecured + /// parity test (IntegrationTests-019) uses this to make CLAUDE.md's "passwords and + /// WriteSecured payloads must never reach logs" rule a property of the entire + /// test output stream — gateway entries (echoed via + /// ), worker stdout/stderr (echoed via + /// ), and direct + /// output.WriteLine calls all land in the same buffer, so a future maintenance + /// change that prints a credential through any of those channels is caught by the + /// assertion rather than slipping past the existing DiagnosticMessage check. + /// + private sealed class RecordingTestOutputHelper(ITestOutputHelper inner) : ITestOutputHelper + { + private readonly StringBuilder buffer = new(); + private readonly object syncRoot = new(); + + public string Captured + { + get + { + lock (syncRoot) + { + return buffer.ToString(); + } + } + } + + public void WriteLine(string message) + { + lock (syncRoot) + { + buffer.AppendLine(message); + } + + inner.WriteLine(message); + } + + public void WriteLine(string format, params object[] args) + { + string formatted = string.Format(System.Globalization.CultureInfo.InvariantCulture, format, args); + lock (syncRoot) + { + buffer.AppendLine(formatted); + } + + inner.WriteLine(format, args); + } + } + private sealed class AllowAllConstraintEnforcer : IConstraintEnforcer { public Task CheckReadTagAsync( diff --git a/src/MxGateway.Server/Configuration/GatewayOptionsValidator.cs b/src/MxGateway.Server/Configuration/GatewayOptionsValidator.cs index 3c1b459..3421607 100644 --- a/src/MxGateway.Server/Configuration/GatewayOptionsValidator.cs +++ b/src/MxGateway.Server/Configuration/GatewayOptionsValidator.cs @@ -25,6 +25,7 @@ public sealed class GatewayOptionsValidator : IValidateOptions ValidateEvents(options.Events, failures); ValidateDashboard(options.Dashboard, failures); ValidateProtocol(options.Protocol, failures); + ValidateAlarms(options.Alarms, failures); return failures.Count == 0 ? 
ValidateOptionsResult.Success @@ -228,6 +229,33 @@ public sealed class GatewayOptionsValidator : IValidateOptions failures); } + private static void ValidateAlarms(AlarmsOptions options, List failures) + { + if (!options.Enabled) + { + return; + } + + // When the alarm auto-subscribe hook is enabled, the gateway needs either a + // canonical SubscriptionExpression or a DefaultArea to compose one from. Both + // empty is the configuration mistake SessionManager.TryAutoSubscribeAlarmsAsync + // currently surfaces per-session — pulling it up to startup validation makes + // the misconfiguration fail-fast at boot, in line with every other section. + if (string.IsNullOrWhiteSpace(options.SubscriptionExpression) + && string.IsNullOrWhiteSpace(options.DefaultArea)) + { + failures.Add( + "MxGateway:Alarms requires either a non-blank SubscriptionExpression or a non-blank DefaultArea when Enabled is true."); + } + + if (!string.IsNullOrWhiteSpace(options.SubscriptionExpression) + && !options.SubscriptionExpression.StartsWith(@"\\", StringComparison.Ordinal)) + { + failures.Add( + @"MxGateway:Alarms:SubscriptionExpression must start with '\\' (canonical \\\Galaxy! 
shape)."); + } + } + private static void ValidateProtocol(ProtocolOptions options, List failures) { if (options.WorkerProtocolVersion != GatewayContractInfo.WorkerProtocolVersion) diff --git a/src/MxGateway.Server/Galaxy/GalaxyGlobMatcher.cs b/src/MxGateway.Server/Galaxy/GalaxyGlobMatcher.cs index a5c6f5c..4750bcd 100644 --- a/src/MxGateway.Server/Galaxy/GalaxyGlobMatcher.cs +++ b/src/MxGateway.Server/Galaxy/GalaxyGlobMatcher.cs @@ -65,15 +65,20 @@ public static class GalaxyGlobMatcher RegexOptions.CultureInvariant | RegexOptions.IgnoreCase | RegexOptions.Compiled, TimeSpan.FromMilliseconds(100)); - if (RegexCache.TryAdd(glob, compiled)) + // GetOrAdd atomically returns whichever instance is in the cache after the + // call — either the locally-compiled regex (we won the race) or the regex + // another thread inserted (we lost). It also avoids the TryAdd-then-indexer + // pattern where the key could be evicted between the failed TryAdd and the + // indexer read, producing a KeyNotFoundException under contention near the + // cap (Server-024). + Regex result = RegexCache.GetOrAdd(glob, compiled); + if (ReferenceEquals(result, compiled)) { + // We were the inserter — track for FIFO eviction and bound the cache. InsertionOrder.Enqueue(glob); EvictIfOverCapacity(); - return compiled; } - - // Another thread won the race — use its compiled regex. 
- return RegexCache[glob]; + return result; } private static void EvictIfOverCapacity() diff --git a/src/MxGateway.Server/Grpc/EventStreamService.cs b/src/MxGateway.Server/Grpc/EventStreamService.cs index 11ae968..85db01f 100644 --- a/src/MxGateway.Server/Grpc/EventStreamService.cs +++ b/src/MxGateway.Server/Grpc/EventStreamService.cs @@ -26,7 +26,7 @@ public sealed class EventStreamService( StreamEventsRequest request, [EnumeratorCancellation] CancellationToken cancellationToken) { - if (!sessionManager.TryGetSession(request.SessionId, out GatewaySession session)) + if (!sessionManager.TryGetSession(request.SessionId, out GatewaySession? session) || session is null) { throw new SessionManagerException( SessionManagerErrorCode.SessionNotFound, diff --git a/src/MxGateway.Server/Grpc/GalaxyRepositoryGrpcService.cs b/src/MxGateway.Server/Grpc/GalaxyRepositoryGrpcService.cs index a3fd2e2..7418735 100644 --- a/src/MxGateway.Server/Grpc/GalaxyRepositoryGrpcService.cs +++ b/src/MxGateway.Server/Grpc/GalaxyRepositoryGrpcService.cs @@ -17,7 +17,7 @@ namespace MxGateway.Server.Grpc; /// direct SQL probe since callers use it as a health check. 
/// public sealed class GalaxyRepositoryGrpcService( - GalaxyDb.GalaxyRepository repository, + GalaxyDb.IGalaxyRepository repository, GalaxyDb.IGalaxyHierarchyCache cache, GalaxyDb.IGalaxyDeployNotifier notifier, IGatewayRequestIdentityAccessor identityAccessor, diff --git a/src/MxGateway.Server/Grpc/MxAccessGatewayService.cs b/src/MxGateway.Server/Grpc/MxAccessGatewayService.cs index c824639..c59bde7 100644 --- a/src/MxGateway.Server/Grpc/MxAccessGatewayService.cs +++ b/src/MxGateway.Server/Grpc/MxAccessGatewayService.cs @@ -54,6 +54,8 @@ public sealed class MxAccessGatewayService( reply.Capabilities.Add("unary-invoke"); reply.Capabilities.Add("server-stream-events"); reply.Capabilities.Add("bulk-subscribe-commands"); + reply.Capabilities.Add("bulk-read-commands"); + reply.Capabilities.Add("bulk-write-commands"); reply.Capabilities.Add("unary-acknowledge-alarm"); reply.Capabilities.Add("server-stream-active-alarms"); @@ -253,7 +255,7 @@ public sealed class MxAccessGatewayService( private GatewaySession ResolveSession(string sessionId) { - if (!sessionManager.TryGetSession(sessionId, out GatewaySession session)) + if (!sessionManager.TryGetSession(sessionId, out GatewaySession? session) || session is null) { throw new SessionManagerException( SessionManagerErrorCode.SessionNotFound, diff --git a/src/MxGateway.Server/Sessions/ISessionManager.cs b/src/MxGateway.Server/Sessions/ISessionManager.cs index d9743f5..e20b831 100644 --- a/src/MxGateway.Server/Sessions/ISessionManager.cs +++ b/src/MxGateway.Server/Sessions/ISessionManager.cs @@ -1,3 +1,4 @@ +using System.Diagnostics.CodeAnalysis; using MxGateway.Contracts.Proto; namespace MxGateway.Server.Sessions; @@ -20,7 +21,7 @@ public interface ISessionManager /// True if the session exists; otherwise false. bool TryGetSession( string sessionId, - out GatewaySession session); + [MaybeNullWhen(false)] out GatewaySession session); /// Invokes a command on the worker for the specified session. /// Identifier of the session. 
diff --git a/src/MxGateway.Server/Sessions/ISessionRegistry.cs b/src/MxGateway.Server/Sessions/ISessionRegistry.cs index 294e03a..82a24e5 100644 --- a/src/MxGateway.Server/Sessions/ISessionRegistry.cs +++ b/src/MxGateway.Server/Sessions/ISessionRegistry.cs @@ -1,3 +1,5 @@ +using System.Diagnostics.CodeAnalysis; + namespace MxGateway.Server.Sessions; /// @@ -28,7 +30,7 @@ public interface ISessionRegistry /// Identifier of the session. /// The retrieved session, if found. /// True if found; false otherwise. - bool TryGet(string sessionId, out GatewaySession session); + bool TryGet(string sessionId, [MaybeNullWhen(false)] out GatewaySession session); /// /// Attempts to remove a session by ID; returns false if not found. @@ -36,7 +38,7 @@ public interface ISessionRegistry /// Identifier of the session to remove. /// The removed session, if found. /// True if removed; false if not found. - bool TryRemove(string sessionId, out GatewaySession session); + bool TryRemove(string sessionId, [MaybeNullWhen(false)] out GatewaySession session); /// /// Returns a snapshot of all sessions in the registry. diff --git a/src/MxGateway.Server/Sessions/NotWiredAlarmRpcDispatcher.cs b/src/MxGateway.Server/Sessions/NotWiredAlarmRpcDispatcher.cs index 78389b4..2e5306d 100644 --- a/src/MxGateway.Server/Sessions/NotWiredAlarmRpcDispatcher.cs +++ b/src/MxGateway.Server/Sessions/NotWiredAlarmRpcDispatcher.cs @@ -8,20 +8,19 @@ using MxGateway.Server.Grpc; namespace MxGateway.Server.Sessions; /// -/// PR A.6 / A.7 — default shipped while -/// the worker-side AlarmClient event subscription is gated on dev-rig -/// validation. Acknowledges with a structured "worker-pending" +/// Null fallback used when no dispatcher +/// is registered in the DI container (DI omission or standalone tests). +/// Acknowledges with a structured "alarm dispatcher not registered" /// diagnostic and yields an empty active-alarm stream. 
/// /// /// -/// Replaces the inline diagnostic strings in -/// MxAccessGatewayService.AcknowledgeAlarm / -/// QueryActiveAlarms from PR A.3 with an injectable seam. -/// When the worker dispatcher (PR A.6/A.7 dev-rig follow-up) lands, -/// WorkerAlarmRpcDispatcher replaces this implementation in -/// the DI container and the same handler shape comes alive without -/// further changes to the public RPC surface. +/// Production wires as the +/// default via +/// SessionServiceCollectionExtensions.AddGatewaySessions, so +/// clients that hit this fallback are running against an +/// intentionally minimal service composition rather than the full +/// gateway. /// /// public sealed class NotWiredAlarmRpcDispatcher : IAlarmRpcDispatcher @@ -35,8 +34,8 @@ public sealed class NotWiredAlarmRpcDispatcher : IAlarmRpcDispatcher { SessionId = request.SessionId, CorrelationId = request.ClientCorrelationId, - ProtocolStatus = MxAccessGrpcMapper.Ok("AcknowledgeAlarm accepted; worker dispatch pending dev-rig wiring."), - DiagnosticMessage = "Gateway-side AcknowledgeAlarm accepted; the worker-side AlarmClient consumer (PR A.5) is in place but the dispatcher hookup is gated on validating the AVEVA alarm-provider event subscription on the dev rig.", + ProtocolStatus = MxAccessGrpcMapper.Ok("AcknowledgeAlarm accepted; alarm dispatcher is not registered."), + DiagnosticMessage = "Alarm dispatcher is not registered.", }); } diff --git a/src/MxGateway.Server/Sessions/SessionManager.cs b/src/MxGateway.Server/Sessions/SessionManager.cs index 7fbd22d..488c2f8 100644 --- a/src/MxGateway.Server/Sessions/SessionManager.cs +++ b/src/MxGateway.Server/Sessions/SessionManager.cs @@ -1,3 +1,4 @@ +using System.Diagnostics.CodeAnalysis; using System.Security.Cryptography; using Google.Protobuf.WellKnownTypes; using Microsoft.Extensions.Logging; @@ -132,7 +133,7 @@ public sealed class SessionManager : ISessionManager /// True if session found; otherwise false. 
public bool TryGetSession( string sessionId, - out GatewaySession session) + [MaybeNullWhen(false)] out GatewaySession session) { return _registry.TryGet(sessionId, out session); } @@ -297,7 +298,7 @@ public sealed class SessionManager : ISessionManager private GatewaySession GetRequiredSession(string sessionId) { - if (!_registry.TryGet(sessionId, out GatewaySession session)) + if (!_registry.TryGet(sessionId, out GatewaySession? session) || session is null) { throw new SessionManagerException( SessionManagerErrorCode.SessionNotFound, diff --git a/src/MxGateway.Server/Sessions/SessionRegistry.cs b/src/MxGateway.Server/Sessions/SessionRegistry.cs index fbeb842..d6f46d3 100644 --- a/src/MxGateway.Server/Sessions/SessionRegistry.cs +++ b/src/MxGateway.Server/Sessions/SessionRegistry.cs @@ -1,4 +1,5 @@ using System.Collections.Concurrent; +using System.Diagnostics.CodeAnalysis; using MxGateway.Contracts.Proto; namespace MxGateway.Server.Sessions; @@ -38,9 +39,9 @@ public sealed class SessionRegistry : ISessionRegistry /// The retrieved session if found. public bool TryGet( string sessionId, - out GatewaySession session) + [MaybeNullWhen(false)] out GatewaySession session) { - return _sessions.TryGetValue(sessionId, out session!); + return _sessions.TryGetValue(sessionId, out session); } /// @@ -50,9 +51,9 @@ public sealed class SessionRegistry : ISessionRegistry /// The removed session if found. 
public bool TryRemove( string sessionId, - out GatewaySession session) + [MaybeNullWhen(false)] out GatewaySession session) { - return _sessions.TryRemove(sessionId, out session!); + return _sessions.TryRemove(sessionId, out session); } /// diff --git a/src/MxGateway.Server/Sessions/WorkerAlarmRpcDispatcher.cs b/src/MxGateway.Server/Sessions/WorkerAlarmRpcDispatcher.cs index 40aa622..ef19f52 100644 --- a/src/MxGateway.Server/Sessions/WorkerAlarmRpcDispatcher.cs +++ b/src/MxGateway.Server/Sessions/WorkerAlarmRpcDispatcher.cs @@ -76,7 +76,7 @@ public sealed class WorkerAlarmRpcDispatcher( { ArgumentNullException.ThrowIfNull(request); - if (!sessionRegistry.TryGet(request.SessionId, out GatewaySession session)) + if (!sessionRegistry.TryGet(request.SessionId, out GatewaySession? session) || session is null) { return new AcknowledgeAlarmReply { @@ -186,7 +186,7 @@ public sealed class WorkerAlarmRpcDispatcher( { ArgumentNullException.ThrowIfNull(request); - if (!sessionRegistry.TryGet(request.SessionId, out GatewaySession session)) + if (!sessionRegistry.TryGet(request.SessionId, out GatewaySession? session) || session is null) { // Server-019: align with AcknowledgeAsync's missing-session handling and // surface a SessionNotFound error rather than yielding an empty stream. diff --git a/src/MxGateway.Tests/Galaxy/GalaxyHierarchyCacheTests.cs b/src/MxGateway.Tests/Galaxy/GalaxyHierarchyCacheTests.cs index dabc0b0..9ba783e 100644 --- a/src/MxGateway.Tests/Galaxy/GalaxyHierarchyCacheTests.cs +++ b/src/MxGateway.Tests/Galaxy/GalaxyHierarchyCacheTests.cs @@ -1,5 +1,6 @@ using MxGateway.Server.Galaxy; using MxGateway.Contracts.Proto.Galaxy; +using MxGateway.Tests.TestSupport; namespace MxGateway.Tests.Galaxy; @@ -156,17 +157,4 @@ public sealed class GalaxyHierarchyCacheTests } } - private sealed class ManualTimeProvider(DateTimeOffset start = default) : TimeProvider - { - private DateTimeOffset _now = start == default ? 
DateTimeOffset.UtcNow : start; - - /// - public override DateTimeOffset GetUtcNow() => _now; - - /// - /// Advances the current time by the specified duration. - /// - /// Time duration to advance. - public void Advance(TimeSpan duration) => _now += duration; - } } diff --git a/src/MxGateway.Tests/Gateway/Grpc/MxAccessGatewayServiceConstraintTests.cs b/src/MxGateway.Tests/Gateway/Grpc/MxAccessGatewayServiceConstraintTests.cs index 8eca572..fcb276e 100644 --- a/src/MxGateway.Tests/Gateway/Grpc/MxAccessGatewayServiceConstraintTests.cs +++ b/src/MxGateway.Tests/Gateway/Grpc/MxAccessGatewayServiceConstraintTests.cs @@ -346,6 +346,180 @@ public sealed class MxAccessGatewayServiceConstraintTests Assert.All(reply.WriteSecuredBulk.Results, r => Assert.False(r.WasSuccessful)); } + /// + /// Tests-020: Write2Bulk takes the third GetPayload/SetPayload + /// switch arm in WriteBulkConstraintPlan. The merge logic is shared with + /// WriteBulk, but a full denial through the CreateDeniedReply path + /// proves the Write2Bulk arm of the per-kind SetPayload switch fires + /// (and not, say, WriteBulk by mistake) — guarding against a refactor that + /// drops or misroutes the Write2Bulk case. 
+ /// + [Fact] + public async Task Invoke_Write2Bulk_WhenAllHandlesDenied_ShortCircuitsWithDeniedOnlyReply() + { + PredicateConstraintEnforcer enforcer = new() { DenyWriteHandle = (_, _) => true }; + FakeSessionManager sessionManager = CreateSessionManagerWithSeed(); + MxAccessGatewayService service = CreateService(sessionManager, enforcer); + + MxCommandReply reply = await service.Invoke( + CreateWrite2BulkRequest(7, [10, 11]), + new TestServerCallContext()); + + Assert.Equal(0, sessionManager.InvokeCount); + Assert.Equal(MxCommandKind.Write2Bulk, reply.Kind); + Assert.Equal(2, reply.Write2Bulk.Results.Count); + Assert.All(reply.Write2Bulk.Results, r => Assert.False(r.WasSuccessful)); + // Sibling reply slots must remain empty — pin the SetPayload arm fired + // for Write2Bulk and not for one of the other three Write*Bulk kinds. + Assert.Empty(reply.WriteBulk?.Results ?? new Google.Protobuf.Collections.RepeatedField()); + Assert.Empty(reply.WriteSecuredBulk?.Results ?? new Google.Protobuf.Collections.RepeatedField()); + Assert.Empty(reply.WriteSecured2Bulk?.Results ?? new Google.Protobuf.Collections.RepeatedField()); + } + + /// + /// Tests-020: WriteSecured2Bulk takes the fourth GetPayload/SetPayload + /// switch arm in WriteBulkConstraintPlan. Same reasoning as + /// Write2Bulk — assert the WriteSecured2Bulk reply slot is populated + /// to prove that arm of the switch fires. 
+ /// + [Fact] + public async Task Invoke_WriteSecured2Bulk_WhenAllHandlesDenied_ShortCircuitsWithDeniedOnlyReply() + { + PredicateConstraintEnforcer enforcer = new() { DenyWriteHandle = (_, _) => true }; + FakeSessionManager sessionManager = CreateSessionManagerWithSeed(); + MxAccessGatewayService service = CreateService(sessionManager, enforcer); + + MxCommandReply reply = await service.Invoke( + CreateWriteSecured2BulkRequest(7, [10, 11]), + new TestServerCallContext()); + + Assert.Equal(0, sessionManager.InvokeCount); + Assert.Equal(MxCommandKind.WriteSecured2Bulk, reply.Kind); + Assert.Equal(2, reply.WriteSecured2Bulk.Results.Count); + Assert.All(reply.WriteSecured2Bulk.Results, r => Assert.False(r.WasSuccessful)); + // Sibling reply slots must remain empty — pin the SetPayload arm fired + // for WriteSecured2Bulk and not for one of the other three Write*Bulk kinds. + Assert.Empty(reply.WriteBulk?.Results ?? new Google.Protobuf.Collections.RepeatedField()); + Assert.Empty(reply.Write2Bulk?.Results ?? new Google.Protobuf.Collections.RepeatedField()); + Assert.Empty(reply.WriteSecuredBulk?.Results ?? new Google.Protobuf.Collections.RepeatedField()); + } + + // === Worker reply-count divergence (Tests-024) === + + /// + /// Tests-024: WriteBulkConstraintPlan.MergeDeniedInto dequeues from + /// allowedResults per non-denied slot via Queue.TryDequeue, + /// which silently returns false when the queue is empty. Pin the + /// observable behaviour when the worker returns FEWER allowed results than + /// the gateway forwarded: the merged reply is truncated — denied entries + /// keep their slots, but the trailing allowed slot for which no worker + /// result arrived is dropped (no synthetic failure result is fabricated). + /// This fixture makes that "silent truncate" behaviour explicit so a future + /// change either fills the gap with a synthetic failure or fails this test. 
+ /// + [Fact] + public async Task Invoke_WriteBulk_WhenWorkerReturnsFewerResultsThanAllowed_MergedReplyIsTruncated() + { + PredicateConstraintEnforcer enforcer = new() + { + DenyWriteHandle = (_, itemHandle) => itemHandle == 902, + }; + FakeSessionManager sessionManager = CreateSessionManagerWithSeed(); + // Gateway forwards 2 allowed handles (901, 903) but the worker returns only + // 1 result. The merge logic should keep denied entry 902 at index 1, place + // the single worker result at index 0, and leave index 2 empty (truncate). + sessionManager.InvokeReply = new WorkerCommandReply + { + Reply = new MxCommandReply + { + SessionId = SessionId, + Kind = MxCommandKind.WriteBulk, + ProtocolStatus = MxAccessGrpcMapper.Ok(), + WriteBulk = new BulkWriteReply + { + Results = + { + new BulkWriteResult { ServerHandle = 7, ItemHandle = 901, WasSuccessful = true }, + }, + }, + }, + }; + MxAccessGatewayService service = CreateService(sessionManager, enforcer); + + MxCommandReply reply = await service.Invoke( + CreateWriteBulkRequest(7, [901, 902, 903]), + new TestServerCallContext()); + + Assert.Equal(1, sessionManager.InvokeCount); + BulkWriteReply merged = reply.WriteBulk; + // Current behaviour: the merged reply is shorter than OriginalCount when + // the worker under-supplies. Two slots survive — the worker result at + // index 0 and the denied entry at index 1 — and the trailing slot is + // silently dropped via Queue.TryDequeue returning false. + Assert.Equal(2, merged.Results.Count); + Assert.True(merged.Results[0].WasSuccessful); + Assert.Equal(901, merged.Results[0].ItemHandle); + Assert.False(merged.Results[1].WasSuccessful); + Assert.Equal(902, merged.Results[1].ItemHandle); + } + + /// + /// Tests-024: when the worker returns MORE allowed results than the + /// gateway forwarded, the extras must be silently ignored — the merged + /// reply length stays at OriginalCount. 
This pins the + /// for index < OriginalCount loop bound so a regression that + /// accidentally surfaces extras as trailing results is caught. + /// + [Fact] + public async Task Invoke_WriteBulk_WhenWorkerReturnsExtraResults_IgnoresExtras() + { + PredicateConstraintEnforcer enforcer = new() + { + DenyWriteHandle = (_, itemHandle) => itemHandle == 902, + }; + FakeSessionManager sessionManager = CreateSessionManagerWithSeed(); + // Gateway forwards 2 allowed handles (901, 903) but the worker returns 4. + sessionManager.InvokeReply = new WorkerCommandReply + { + Reply = new MxCommandReply + { + SessionId = SessionId, + Kind = MxCommandKind.WriteBulk, + ProtocolStatus = MxAccessGrpcMapper.Ok(), + WriteBulk = new BulkWriteReply + { + Results = + { + new BulkWriteResult { ServerHandle = 7, ItemHandle = 901, WasSuccessful = true }, + new BulkWriteResult { ServerHandle = 7, ItemHandle = 903, WasSuccessful = true }, + new BulkWriteResult { ServerHandle = 7, ItemHandle = 999, WasSuccessful = true }, + new BulkWriteResult { ServerHandle = 7, ItemHandle = 1000, WasSuccessful = true }, + }, + }, + }, + }; + MxAccessGatewayService service = CreateService(sessionManager, enforcer); + + MxCommandReply reply = await service.Invoke( + CreateWriteBulkRequest(7, [901, 902, 903]), + new TestServerCallContext()); + + Assert.Equal(1, sessionManager.InvokeCount); + BulkWriteReply merged = reply.WriteBulk; + // Merged reply length stays at OriginalCount (3); the two extra worker + // results (item handles 999, 1000) are silently discarded by the + // OriginalCount-bounded loop. 
+ Assert.Equal(3, merged.Results.Count); + Assert.Equal(901, merged.Results[0].ItemHandle); + Assert.True(merged.Results[0].WasSuccessful); + Assert.Equal(902, merged.Results[1].ItemHandle); + Assert.False(merged.Results[1].WasSuccessful); + Assert.Equal(903, merged.Results[2].ItemHandle); + Assert.True(merged.Results[2].WasSuccessful); + Assert.DoesNotContain(merged.Results, r => r.ItemHandle == 999); + Assert.DoesNotContain(merged.Results, r => r.ItemHandle == 1000); + } + // === Unary write-handle enforcement (EnforceWriteHandleAsync) === /// @@ -547,6 +721,48 @@ public sealed class MxAccessGatewayServiceConstraintTests }; } + private static MxCommandRequest CreateWrite2BulkRequest(int serverHandle, IReadOnlyList itemHandles) + { + Write2BulkCommand cmd = new() { ServerHandle = serverHandle }; + foreach (int handle in itemHandles) + { + cmd.Entries.Add(new Write2BulkEntry + { + ItemHandle = handle, + Value = new MxValue { StringValue = "v" }, + TimestampValue = new MxValue { Int64Value = 1234567890L }, + }); + } + + return new MxCommandRequest + { + SessionId = SessionId, + Command = new MxCommand { Kind = MxCommandKind.Write2Bulk, Write2Bulk = cmd }, + }; + } + + private static MxCommandRequest CreateWriteSecured2BulkRequest(int serverHandle, IReadOnlyList itemHandles) + { + WriteSecured2BulkCommand cmd = new() { ServerHandle = serverHandle }; + foreach (int handle in itemHandles) + { + cmd.Entries.Add(new WriteSecured2BulkEntry + { + ItemHandle = handle, + CurrentUserId = 1, + VerifierUserId = 2, + Value = new MxValue { StringValue = "v" }, + TimestampValue = new MxValue { Int64Value = 1234567890L }, + }); + } + + return new MxCommandRequest + { + SessionId = SessionId, + Command = new MxCommand { Kind = MxCommandKind.WriteSecured2Bulk, WriteSecured2Bulk = cmd }, + }; + } + private static MxCommandRequest CreateWriteRequest(int serverHandle, int itemHandle) { return new MxCommandRequest diff --git 
a/src/MxGateway.Tests/Gateway/Grpc/MxAccessGatewayServiceTests.cs b/src/MxGateway.Tests/Gateway/Grpc/MxAccessGatewayServiceTests.cs index 7b71097..97be331 100644 --- a/src/MxGateway.Tests/Gateway/Grpc/MxAccessGatewayServiceTests.cs +++ b/src/MxGateway.Tests/Gateway/Grpc/MxAccessGatewayServiceTests.cs @@ -344,9 +344,9 @@ public sealed class MxAccessGatewayServiceTests Assert.Equal(StatusCode.InvalidArgument, exception.StatusCode); } - /// Verifies AcknowledgeAlarm returns OK with a worker-pending diagnostic for valid input. + /// Verifies AcknowledgeAlarm returns OK with a "dispatcher not registered" diagnostic when DI omits the dispatcher. [Fact] - public async Task AcknowledgeAlarm_WithValidRequest_ReturnsOkWithWorkerPendingDiagnostic() + public async Task AcknowledgeAlarm_WithValidRequest_ReturnsOkWithNotRegisteredDiagnostic() { MxAccessGatewayService service = CreateService(new FakeSessionManager()); @@ -364,7 +364,7 @@ public sealed class MxAccessGatewayServiceTests Assert.Equal(ProtocolStatusCode.Ok, reply.ProtocolStatus.Code); Assert.Equal("session-1", reply.SessionId); Assert.Equal("corr-1", reply.CorrelationId); - Assert.Contains("worker", reply.DiagnosticMessage, StringComparison.OrdinalIgnoreCase); + Assert.Contains("not registered", reply.DiagnosticMessage, StringComparison.OrdinalIgnoreCase); } /// Verifies QueryActiveAlarms rejects empty session_id. diff --git a/src/MxGateway.Tests/Gateway/Sessions/GatewaySessionTests.cs b/src/MxGateway.Tests/Gateway/Sessions/GatewaySessionTests.cs index de37c1c..cae115f 100644 --- a/src/MxGateway.Tests/Gateway/Sessions/GatewaySessionTests.cs +++ b/src/MxGateway.Tests/Gateway/Sessions/GatewaySessionTests.cs @@ -68,6 +68,43 @@ public sealed class GatewaySessionTests await session.DisposeAsync(); } + /// + /// Server-028 regression. 
A issued + /// while is parked between its + /// Closing and Closed writes must not break the close path's + /// terminal contract: the in-flight close runs to Closed, the fault + /// reason is preserved on , and the + /// session does not get stuck in . The + /// state machine documents "Closing only allows a transition to Closed or + /// Faulted" — this test pins the resolved end state so a future tightening + /// of MarkFaulted cannot silently regress it. + /// + [Fact] + public async Task MarkFaulted_DuringInFlightClose_PreservesFaultButYieldsToClose() + { + BlockingShutdownWorkerClient workerClient = new(); + GatewaySession session = CreateReadySession(workerClient); + + Task closeTask = session.CloseAsync("test-close", CancellationToken.None); + await workerClient.WaitForShutdownStartAsync(); + + // Close has set _state = Closing under _syncRoot and is parked inside + // worker.ShutdownAsync. Fault the session from another thread while parked. + Assert.Equal(SessionState.Closing, session.State); + session.MarkFaulted("concurrent-fault"); + + workerClient.ReleaseShutdown(); + SessionCloseResult result = await closeTask; + + // Close still wins — Closed is terminal — but the fault reason is preserved + // so observers see the original cause once the session settles. + Assert.Equal(SessionState.Closed, result.FinalState); + Assert.Equal(SessionState.Closed, session.State); + Assert.Equal("concurrent-fault", session.FinalFault); + + await session.DisposeAsync(); + } + /// /// Server-016 regression. 
must wait /// for an in-flight before disposing diff --git a/src/MxGateway.Tests/Gateway/Sessions/NotWiredAlarmRpcDispatcherTests.cs b/src/MxGateway.Tests/Gateway/Sessions/NotWiredAlarmRpcDispatcherTests.cs index 18c0e8c..4b5f2e8 100644 --- a/src/MxGateway.Tests/Gateway/Sessions/NotWiredAlarmRpcDispatcherTests.cs +++ b/src/MxGateway.Tests/Gateway/Sessions/NotWiredAlarmRpcDispatcherTests.cs @@ -4,16 +4,16 @@ using MxGateway.Server.Sessions; namespace MxGateway.Tests.Gateway.Sessions; /// -/// PR A.6 / A.7 — pins the not-yet-wired dispatcher's behaviour: -/// AcknowledgeAsync returns OK with a worker-pending diagnostic and -/// QueryActiveAlarmsAsync yields an empty stream. Production -/// WorkerAlarmRpcDispatcher (dev-rig follow-up) replaces this -/// impl in DI without changing the gateway handler shape. +/// Pins the null-fallback dispatcher's behaviour: AcknowledgeAsync +/// returns OK with a "dispatcher not registered" diagnostic and +/// QueryActiveAlarmsAsync yields an empty stream. Production binds +/// WorkerAlarmRpcDispatcher in DI; this fallback is only used +/// when no dispatcher is registered (DI omission / standalone tests). 
/// public sealed class NotWiredAlarmRpcDispatcherTests { [Fact] - public async Task AcknowledgeAsync_WhenNotWired_ReturnsOkWithWorkerPendingDiagnostic() + public async Task AcknowledgeAsync_WhenNotWired_ReturnsOkWithNotRegisteredDiagnostic() { IAlarmRpcDispatcher dispatcher = new NotWiredAlarmRpcDispatcher(); @@ -31,7 +31,7 @@ public sealed class NotWiredAlarmRpcDispatcherTests Assert.Equal(ProtocolStatusCode.Ok, reply.ProtocolStatus.Code); Assert.Equal("session-1", reply.SessionId); Assert.Equal("corr-1", reply.CorrelationId); - Assert.Contains("worker", reply.DiagnosticMessage, StringComparison.OrdinalIgnoreCase); + Assert.Contains("not registered", reply.DiagnosticMessage, StringComparison.OrdinalIgnoreCase); } [Fact] diff --git a/src/MxGateway.Tests/Gateway/Sessions/SessionManagerBulkTests.cs b/src/MxGateway.Tests/Gateway/Sessions/SessionManagerBulkTests.cs index ccf8b1e..de371c3 100644 --- a/src/MxGateway.Tests/Gateway/Sessions/SessionManagerBulkTests.cs +++ b/src/MxGateway.Tests/Gateway/Sessions/SessionManagerBulkTests.cs @@ -433,6 +433,53 @@ public sealed class SessionManagerBulkTests cts.Token)); } + /// + /// Tests-022: Pin mid-flight cancellation behaviour for at least one bulk + /// path. Unlike the pre-cancel WriteSecuredBulkAsync_PropagatesCancellation + /// above, this fake's + /// returns a -backed task that does NOT + /// complete until the registered token fires. The session call therefore + /// reaches InvokeBulkInternalAsyncInvokeAsync → + /// workerClient.InvokeAsync and parks on an in-flight await; only + /// after that does cts.CancelAsync() fire. This is the path a real + /// client closing its stream would hit, which the pre-cancel pattern can't + /// exercise. 
+ /// + [Fact] + public async Task WriteSecuredBulkAsync_WhenCancelledMidFlight_ThrowsOperationCanceledForRequestToken() + { + MidFlightBulkWorkerClient workerClient = new(); + GatewaySession session = await OpenSessionAsync(workerClient); + using CancellationTokenSource cts = new(); + + Task> writeTask = session.WriteSecuredBulkAsync( + 12, + new[] + { + new WriteSecuredBulkEntry + { + ItemHandle = 1, + CurrentUserId = 7, + VerifierUserId = 8, + Value = new MxValue { DataType = MxDataType.Integer, Int32Value = 0 }, + }, + }, + cts.Token); + + // Wait until the gateway has descended into the worker's InvokeAsync and + // registered its cancellation continuation — only then is this a true + // mid-flight cancel. + await workerClient.InvokeStarted.Task.WaitAsync(TimeSpan.FromSeconds(5)); + Assert.False(writeTask.IsCompleted); + + await cts.CancelAsync(); + + OperationCanceledException exception = await Assert.ThrowsAnyAsync( + async () => await writeTask); + Assert.Equal(cts.Token, exception.CancellationToken); + Assert.Equal(1, workerClient.InvokeCount); + } + [Fact] public async Task WriteSecured2BulkAsync_ForwardsOneWriteSecured2BulkCommandAndPreservesCredentialAndTimestampPayload() { @@ -587,12 +634,17 @@ public sealed class SessionManagerBulkTests } private static async Task OpenSessionAsync(FakeBulkWorkerClient workerClient) + { + return await OpenSessionAsync((IWorkerClient)workerClient); + } + + private static async Task OpenSessionAsync(IWorkerClient workerClient) { SessionManager manager = CreateManager(workerClient); return await manager.OpenSessionAsync(CreateOpenRequest(), "client-1", CancellationToken.None); } - private static SessionManager CreateManager(FakeBulkWorkerClient workerClient) + private static SessionManager CreateManager(IWorkerClient workerClient) { return new SessionManager( new SessionRegistry(), @@ -708,4 +760,87 @@ public sealed class SessionManagerBulkTests /// public ValueTask DisposeAsync() => ValueTask.CompletedTask; } + + /// + 
/// Mid-flight cancellation fake for Tests-022. + /// signals , registers + /// a cancellation continuation on the caller's , + /// and parks on a that completes + /// only when the token fires or the fake is shut down. This is the only + /// way to land an on the async + /// continuation rather than the synchronous fast-path inside + /// ThrowIfCancellationRequested. + /// + private sealed class MidFlightBulkWorkerClient : IWorkerClient + { + private readonly TaskCompletionSource _invokeCompletion = + new(TaskCreationOptions.RunContinuationsAsynchronously); + + /// + public string SessionId { get; init; } = "session-1"; + + /// + public int? ProcessId { get; init; } = 1234; + + /// + public WorkerClientState State { get; set; } = WorkerClientState.Ready; + + /// + public DateTimeOffset LastHeartbeatAt { get; init; } = DateTimeOffset.UtcNow; + + /// Gets the number of times was entered. + public int InvokeCount { get; private set; } + + /// Signals when first enters — the test + /// awaits this before triggering mid-flight cancellation. + public TaskCompletionSource InvokeStarted { get; } = + new(TaskCreationOptions.RunContinuationsAsynchronously); + + /// + public Task StartAsync(CancellationToken cancellationToken) => Task.CompletedTask; + + /// + public Task InvokeAsync( + WorkerCommand command, + TimeSpan timeout, + CancellationToken cancellationToken) + { + InvokeCount++; + // Register cancellation BEFORE signalling start so the test can be + // certain the continuation is wired the moment InvokeStarted resolves. 
+ cancellationToken.Register(() => _invokeCompletion.TrySetCanceled(cancellationToken)); + InvokeStarted.TrySetResult(); + return _invokeCompletion.Task; + } + + /// + public async IAsyncEnumerable ReadEventsAsync( + [System.Runtime.CompilerServices.EnumeratorCancellation] CancellationToken cancellationToken) + { + await Task.CompletedTask; + yield break; + } + + /// + public Task ShutdownAsync(TimeSpan timeout, CancellationToken cancellationToken) + { + State = WorkerClientState.Closed; + _invokeCompletion.TrySetCanceled(cancellationToken); + return Task.CompletedTask; + } + + /// + public void Kill(string reason) + { + State = WorkerClientState.Faulted; + _invokeCompletion.TrySetCanceled(); + } + + /// + public ValueTask DisposeAsync() + { + _invokeCompletion.TrySetCanceled(); + return ValueTask.CompletedTask; + } + } } diff --git a/src/MxGateway.Tests/Gateway/Sessions/SessionManagerTests.cs b/src/MxGateway.Tests/Gateway/Sessions/SessionManagerTests.cs index 58c6faf..2bdff8a 100644 --- a/src/MxGateway.Tests/Gateway/Sessions/SessionManagerTests.cs +++ b/src/MxGateway.Tests/Gateway/Sessions/SessionManagerTests.cs @@ -5,6 +5,7 @@ using MxGateway.Server.Configuration; using MxGateway.Server.Metrics; using MxGateway.Server.Sessions; using MxGateway.Server.Workers; +using MxGateway.Tests.TestSupport; namespace MxGateway.Tests.Gateway.Sessions; @@ -24,7 +25,7 @@ public sealed class SessionManagerTests GatewaySession session = await manager.OpenSessionAsync(CreateOpenRequest(), "client-1", CancellationToken.None); - Assert.True(manager.TryGetSession(session.SessionId, out GatewaySession registered)); + Assert.True(manager.TryGetSession(session.SessionId, out GatewaySession? 
registered)); Assert.Same(session, registered); Assert.Equal(SessionState.Ready, session.State); Assert.Equal("client-1", session.ClientIdentity); @@ -763,10 +764,4 @@ public sealed class SessionManagerTests } } - private sealed class ManualTimeProvider(DateTimeOffset start) : TimeProvider - { - private DateTimeOffset _now = start; - - public override DateTimeOffset GetUtcNow() => _now; - } } diff --git a/src/MxGateway.Tests/Gateway/Sessions/SessionWorkerClientFactoryFakeWorkerTests.cs b/src/MxGateway.Tests/Gateway/Sessions/SessionWorkerClientFactoryFakeWorkerTests.cs index ceb75b3..10604f6 100644 --- a/src/MxGateway.Tests/Gateway/Sessions/SessionWorkerClientFactoryFakeWorkerTests.cs +++ b/src/MxGateway.Tests/Gateway/Sessions/SessionWorkerClientFactoryFakeWorkerTests.cs @@ -330,18 +330,28 @@ public sealed class SessionWorkerClientFactoryFakeWorkerTests : IAsyncDisposable DateTimeOffset.UtcNow); } - /// Fake worker process for testing process lifecycle. + /// + /// Fake worker process for testing process lifecycle. + /// awaits a completed only by + /// or , so a caller observing + /// completion can trust that exit actually happened — bringing this fake into + /// parity with the smoke-test variant in GatewayEndToEndFakeWorkerSmokeTests + /// (Tests-015 / Tests-023). This removes the latent regression vector where a + /// future Assert.True(launcher.Process.HasExited) in this file would + /// pass spuriously regardless of whether the worker truly exited. + /// private sealed class FakeWorkerProcess(int processId) : IWorkerProcess { + private readonly TaskCompletionSource _exited = new(TaskCreationOptions.RunContinuationsAsynchronously); private bool _disposed; /// public int Id { get; } = processId; - /// Gets or sets a value indicating whether the process has exited. + /// Gets a value indicating whether the process has exited. public bool HasExited { get; private set; } - /// Gets or sets the process exit code. 
+ /// Gets the process exit code, or null if the process has not exited. public int? ExitCode { get; private set; } /// Gets the number of times the Kill method was called. @@ -350,17 +360,14 @@ public sealed class SessionWorkerClientFactoryFakeWorkerTests : IAsyncDisposable /// public ValueTask WaitForExitAsync(CancellationToken cancellationToken) { - HasExited = true; - ExitCode = 0; - return ValueTask.CompletedTask; + return new ValueTask(_exited.Task.WaitAsync(cancellationToken)); } /// public void Kill(bool entireProcessTree) { KillCount++; - HasExited = true; - ExitCode = -1; + MarkExited(-1); } /// @@ -371,5 +378,14 @@ public sealed class SessionWorkerClientFactoryFakeWorkerTests : IAsyncDisposable /// Gets a value indicating whether this process has been disposed. public bool IsDisposed => _disposed; + + /// Marks the process as exited with the specified exit code. + /// The process exit code. + public void MarkExited(int exitCode) + { + HasExited = true; + ExitCode = exitCode; + _exited.TrySetResult(); + } } } diff --git a/src/MxGateway.Tests/Gateway/Workers/FakeWorkerHarnessTests.cs b/src/MxGateway.Tests/Gateway/Workers/FakeWorkerHarnessTests.cs index 5da5caa..4a2aa34 100644 --- a/src/MxGateway.Tests/Gateway/Workers/FakeWorkerHarnessTests.cs +++ b/src/MxGateway.Tests/Gateway/Workers/FakeWorkerHarnessTests.cs @@ -2,6 +2,7 @@ using MxGateway.Contracts; using MxGateway.Contracts.Proto; using MxGateway.Server.Workers; using MxGateway.Tests.Gateway.Workers.Fakes; +using MxGateway.Tests.TestSupport; namespace MxGateway.Tests.Gateway.Workers; @@ -222,16 +223,4 @@ public sealed class FakeWorkerHarnessTests } } - /// Time provider with a manually advanced clock for deterministic timestamp tests. - private sealed class ManualTimeProvider(DateTimeOffset start) : TimeProvider - { - private DateTimeOffset _now = start; - - /// - public override DateTimeOffset GetUtcNow() => _now; - - /// Advances the manual clock by the given amount. 
- /// Amount of time to add to the current clock value. - public void Advance(TimeSpan delta) => _now += delta; - } } diff --git a/src/MxGateway.Tests/Gateway/Workers/WorkerClientTests.cs b/src/MxGateway.Tests/Gateway/Workers/WorkerClientTests.cs index 8faa038..f834163 100644 --- a/src/MxGateway.Tests/Gateway/Workers/WorkerClientTests.cs +++ b/src/MxGateway.Tests/Gateway/Workers/WorkerClientTests.cs @@ -4,6 +4,7 @@ using MxGateway.Contracts; using MxGateway.Contracts.Proto; using MxGateway.Server.Metrics; using MxGateway.Server.Workers; +using MxGateway.Tests.TestSupport; namespace MxGateway.Tests.Gateway.Workers; @@ -616,19 +617,6 @@ public sealed class WorkerClientTests } } - /// Time provider with a manually advanced clock for deterministic timestamp tests. - private sealed class ManualTimeProvider(DateTimeOffset start) : TimeProvider - { - private DateTimeOffset _now = start; - - /// - public override DateTimeOffset GetUtcNow() => _now; - - /// Advances the manual clock by the given amount. - /// Amount of time to add to the current clock value. 
- public void Advance(TimeSpan delta) => _now += delta; - } - private sealed class FakeWorkerProcess : IWorkerProcess { private readonly TaskCompletionSource _exited = new(TaskCreationOptions.RunContinuationsAsynchronously); diff --git a/src/MxGateway.Tests/Security/Authorization/GatewayGrpcScopeResolverTests.cs b/src/MxGateway.Tests/Security/Authorization/GatewayGrpcScopeResolverTests.cs index 60d9dfc..137f146 100644 --- a/src/MxGateway.Tests/Security/Authorization/GatewayGrpcScopeResolverTests.cs +++ b/src/MxGateway.Tests/Security/Authorization/GatewayGrpcScopeResolverTests.cs @@ -18,6 +18,7 @@ public sealed class GatewayGrpcScopeResolverTests [InlineData(typeof(TestConnectionRequest), GatewayScopes.MetadataRead)] [InlineData(typeof(GetLastDeployTimeRequest), GatewayScopes.MetadataRead)] [InlineData(typeof(DiscoverHierarchyRequest), GatewayScopes.MetadataRead)] + [InlineData(typeof(WatchDeployEventsRequest), GatewayScopes.MetadataRead)] public void ResolveRequiredScope_KnownRpcRequest_ReturnsExpectedScope( Type requestType, string expectedScope) diff --git a/src/MxGateway.Tests/TestSupport/ManualTimeProvider.cs b/src/MxGateway.Tests/TestSupport/ManualTimeProvider.cs new file mode 100644 index 0000000..901c89c --- /dev/null +++ b/src/MxGateway.Tests/TestSupport/ManualTimeProvider.cs @@ -0,0 +1,22 @@ +namespace MxGateway.Tests.TestSupport; + +/// +/// with a manually advanced clock for deterministic +/// timestamp / heartbeat / lease tests. Tests inject one of these instead of +/// so timing assertions don't depend on the +/// wall clock. Constructed without arguments (or with default) it seeds +/// from ; for fully deterministic tests pass +/// an explicit start instant. +/// +/// Initial clock value. When default, the clock seeds from . +public sealed class ManualTimeProvider(DateTimeOffset start = default) : TimeProvider +{ + private DateTimeOffset _now = start == default ? 
DateTimeOffset.UtcNow : start; + + /// + public override DateTimeOffset GetUtcNow() => _now; + + /// Advances the manual clock by the given amount. + /// Amount of time to add to the current clock value. + public void Advance(TimeSpan delta) => _now += delta; +} diff --git a/src/MxGateway.Worker.Tests/Ipc/WorkerPipeSessionTests.cs b/src/MxGateway.Worker.Tests/Ipc/WorkerPipeSessionTests.cs index 762c13a..c11e53d 100644 --- a/src/MxGateway.Worker.Tests/Ipc/WorkerPipeSessionTests.cs +++ b/src/MxGateway.Worker.Tests/Ipc/WorkerPipeSessionTests.cs @@ -442,6 +442,78 @@ public sealed class WorkerPipeSessionTests await SendShutdownAndWaitAsync(pipePair, runTask, cancellation.Token); } + /// + /// Worker-023 regression: the in-flight-command suppression on the + /// StaHung watchdog (Worker-017) is bounded by + /// WorkerPipeSessionOptions.HeartbeatStuckCeiling. A truly + /// stuck synchronous STA command (e.g. a dead MXAccess provider) would + /// otherwise keep CurrentCommandCorrelationId non-empty forever + /// and permanently defeat the watchdog. Once LastStaActivityUtc + /// has been stale for longer than HeartbeatStuckCeiling the + /// watchdog DOES fire StaHung even with a command in flight. + /// + [Fact] + public async Task RunAsync_WhenStaActivityIsStaleBeyondCeilingWithCommandInFlight_WritesWatchdogFault() + { + using CancellationTokenSource cancellation = new(TimeSpan.FromSeconds(5)); + using PipePair pipePair = await PipePair.CreateAsync(cancellation.Token); + FakeRuntimeSession runtime = new(); + // Stale by 5s, which exceeds the configured 200 ms ceiling — the + // watchdog must fire even with a command in flight. 
+ runtime.SetSnapshot(new WorkerRuntimeHeartbeatSnapshot( + DateTimeOffset.UtcNow - TimeSpan.FromSeconds(5), + pendingCommandCount: 0, + outboundEventQueueDepth: 0, + lastEventSequence: 0, + currentCommandCorrelationId: "stuck-command")); + WorkerPipeSession session = CreatePipeSession( + pipePair.WorkerStream, + runtime, + new WorkerPipeSessionOptions + { + HeartbeatInterval = TimeSpan.FromMilliseconds(20), + HeartbeatGrace = TimeSpan.FromMilliseconds(50), + HeartbeatStuckCeiling = TimeSpan.FromMilliseconds(200), + }); + Task runTask = session.RunAsync(cancellation.Token); + await CompleteGatewayHandshakeAsync(pipePair, cancellation.Token); + + WorkerEnvelope fault = await ReadUntilAsync( + pipePair.GatewayReader, + WorkerEnvelope.BodyOneofCase.WorkerFault, + cancellation.Token); + + Assert.Equal(WorkerFaultCategory.StaHung, fault.WorkerFault.Category); + Assert.Contains("STA activity is stale", fault.WorkerFault.DiagnosticMessage); + + await SendShutdownAndWaitAsync(pipePair, runTask, cancellation.Token); + } + + /// + /// Worker-025 regression: RunAsync must throw a diagnostic + /// exception if the runtime-session factory returns null, rather than + /// deferring the failure to an NRE on the next dereference. 
+ /// + [Fact] + public async Task RunAsync_WhenRuntimeSessionFactoryReturnsNull_ThrowsDiagnosticException() + { + using CancellationTokenSource cancellation = new(TimeSpan.FromSeconds(5)); + using PipePair pipePair = await PipePair.CreateAsync(cancellation.Token); + WorkerFrameProtocolOptions options = CreateOptions(); + WorkerPipeSession session = new( + new WorkerFrameReader(pipePair.WorkerStream, options), + new WorkerFrameWriter(pipePair.WorkerStream, options), + options, + () => 1234, + new WorkerPipeSessionOptions(), + () => null!); + + InvalidOperationException exception = await Assert.ThrowsAsync( + () => session.RunAsync(cancellation.Token)); + + Assert.Contains("factory returned null", exception.Message); + } + /// /// Worker-006 regression: when graceful shutdown times out, RunAsync /// must still dispose the runtime session in its finally block. @@ -818,15 +890,23 @@ public sealed class WorkerPipeSessionTests Nonce); } + // Inbound-envelope sequence numbers below are documentation-only: the + // worker has no inbound monotonicity check, so the literal values do + // not affect dispatch. Each helper exposes a sequence parameter + // (default = position in the typical Hello/Command/Cancel/Shutdown + // ordering) so a multi-frame test that interleaves the helpers can + // assign monotonically increasing values and produce a wire trace + // that reads in ascending order — see Worker.Tests-030. 
private static WorkerEnvelope CreateGatewayHelloEnvelope( string nonce = Nonce, - uint supportedProtocolVersion = GatewayContractInfo.WorkerProtocolVersion) + uint supportedProtocolVersion = GatewayContractInfo.WorkerProtocolVersion, + ulong sequence = 1) { return new WorkerEnvelope { ProtocolVersion = GatewayContractInfo.WorkerProtocolVersion, SessionId = SessionId, - Sequence = 1, + Sequence = sequence, GatewayHello = new GatewayHello { SupportedProtocolVersion = supportedProtocolVersion, @@ -836,13 +916,13 @@ public sealed class WorkerPipeSessionTests }; } - private static WorkerEnvelope CreateCommandEnvelope(string correlationId) + private static WorkerEnvelope CreateCommandEnvelope(string correlationId, ulong sequence = 2) { return new WorkerEnvelope { ProtocolVersion = GatewayContractInfo.WorkerProtocolVersion, SessionId = SessionId, - Sequence = 2, + Sequence = sequence, CorrelationId = correlationId, WorkerCommand = new WorkerCommand { @@ -859,13 +939,13 @@ public sealed class WorkerPipeSessionTests }; } - private static WorkerEnvelope CreateCancelEnvelope(string correlationId) + private static WorkerEnvelope CreateCancelEnvelope(string correlationId, ulong sequence = 2) { return new WorkerEnvelope { ProtocolVersion = GatewayContractInfo.WorkerProtocolVersion, SessionId = SessionId, - Sequence = 4, + Sequence = sequence, CorrelationId = correlationId, WorkerCancel = new WorkerCancel { @@ -874,13 +954,13 @@ public sealed class WorkerPipeSessionTests }; } - private static WorkerEnvelope CreateShutdownEnvelope() + private static WorkerEnvelope CreateShutdownEnvelope(ulong sequence = 3) { return new WorkerEnvelope { ProtocolVersion = GatewayContractInfo.WorkerProtocolVersion, SessionId = SessionId, - Sequence = 3, + Sequence = sequence, WorkerShutdown = new WorkerShutdown { GracePeriod = Duration.FromTimeSpan(TimeSpan.FromSeconds(1)), diff --git a/src/MxGateway.Worker.Tests/MxAccess/AlarmCommandHandlerTests.cs 
b/src/MxGateway.Worker.Tests/MxAccess/AlarmCommandHandlerTests.cs index 597a0b4..04a01c2 100644 --- a/src/MxGateway.Worker.Tests/MxAccess/AlarmCommandHandlerTests.cs +++ b/src/MxGateway.Worker.Tests/MxAccess/AlarmCommandHandlerTests.cs @@ -189,6 +189,81 @@ public sealed class AlarmCommandHandlerTests () => handler.Subscribe("x", "y")); } + /// + /// Worker-024 regression: every method that touches the underlying + /// must invoke the configured + /// STA-affinity guard. A guard that throws (simulating an off-STA + /// call) must propagate from every command-path entry point. + /// + [Fact] + public void EveryCommandPathEntry_InvokesThreadAffinityGuard() + { + FakeConsumer consumer = new FakeConsumer(); + int guardInvocations = 0; + AlarmCommandHandler handler = new AlarmCommandHandler( + new MxAccessEventQueue(), + () => consumer, + () => guardInvocations++); + + // Subscribe is the first call — guard must run before the consumer + // factory is invoked. We tally invocation counts after each call so + // that a missed guard surfaces as the diagnostic count, not a generic + // "Subscribe should have failed". + handler.Subscribe(@"\\HOST\Galaxy!A", "s1"); + Assert.Equal(1, guardInvocations); + + handler.Acknowledge(Guid.NewGuid(), "c", "u", "n", "d", "F"); + Assert.Equal(2, guardInvocations); + + handler.AcknowledgeByName("a", "p", "g", "c", "u", "n", "d", "F"); + Assert.Equal(3, guardInvocations); + + _ = handler.QueryActive(null); + Assert.Equal(4, guardInvocations); + + handler.PollOnce(); + Assert.Equal(5, guardInvocations); + + handler.Unsubscribe(); + Assert.Equal(6, guardInvocations); + } + + /// + /// Worker-024 regression: a guard that throws must propagate from + /// every command-path entry point — proving the guard is not + /// swallowed by an inner try/catch. 
+ /// + [Fact] + public void EveryCommandPathEntry_PropagatesAffinityGuardException() + { + FakeConsumer consumer = new FakeConsumer(); + AlarmCommandHandler handler = new AlarmCommandHandler( + new MxAccessEventQueue(), + () => consumer, + threadAffinityCheck: () => + throw new InvalidOperationException("off-STA")); + + // Subscribe: guard runs before the dispatcher is constructed. + Assert.Throws( + () => handler.Subscribe(@"\\HOST\Galaxy!A", "s1")); + + // To exercise the other entry points we need a subscribed handler. + // Construct a parallel handler with a passing guard, then swap in a + // throwing one — but the existing handler is the simpler vehicle: + // re-build the handler with the guard initially silent, subscribe, + // then verify each remaining entry by passing a guard that throws + // through a second handler instance — actually the cleaner way is to + // assert each independently with a fresh handler. Below we reuse + // the same throwing handler for the not-subscribed-yet entries: + Assert.Throws( + () => handler.Acknowledge(Guid.Empty, "", "", "", "", "")); + Assert.Throws( + () => handler.AcknowledgeByName("", "", "", "", "", "", "", "")); + Assert.Throws(() => handler.QueryActive(null)); + Assert.Throws(() => handler.PollOnce()); + Assert.Throws(() => handler.Unsubscribe()); + } + private static MxAlarmSnapshotRecord NewRecord(string provider, string group, string tag) { return new MxAlarmSnapshotRecord diff --git a/src/MxGateway.Worker.Tests/MxAccess/MxAccessStaSessionTests.cs b/src/MxGateway.Worker.Tests/MxAccess/MxAccessStaSessionTests.cs index 0d4ea6f..19b3d59 100644 --- a/src/MxGateway.Worker.Tests/MxAccess/MxAccessStaSessionTests.cs +++ b/src/MxGateway.Worker.Tests/MxAccess/MxAccessStaSessionTests.cs @@ -200,7 +200,7 @@ public sealed class MxAccessStaSessionTests factory, eventSink, new MxAccessEventQueue(), - _eq => handler); + (_eq, _affinity) => handler); await session.StartAsync("session-1", workerProcessId: 1); @@ -279,7 +279,7 @@ 
public sealed class MxAccessStaSessionTests factory, eventSink, new MxAccessEventQueue(), - _eq => handler); + (_eq, _affinity) => handler); await session.StartAsync("session-1", workerProcessId: 1); @@ -320,7 +320,7 @@ public sealed class MxAccessStaSessionTests factory, eventSink, new MxAccessEventQueue(), - _eq => handler); + (_eq, _affinity) => handler); await session.StartAsync("session-1", workerProcessId: 1); @@ -369,7 +369,7 @@ public sealed class MxAccessStaSessionTests factory, eventSink, eventQueue, - _eq => handler); + (_eq, _affinity) => handler); await session.StartAsync("session-1", workerProcessId: 1); @@ -416,7 +416,7 @@ public sealed class MxAccessStaSessionTests factory, eventSink, eventQueue, - _eq => handler); + (_eq, _affinity) => handler); await session.StartAsync("session-1", workerProcessId: 1); diff --git a/src/MxGateway.Worker.Tests/Probes/AlarmClientWmProbeTests.cs b/src/MxGateway.Worker.Tests/Probes/AlarmClientWmProbeTests.cs index 39432fc..0453025 100644 --- a/src/MxGateway.Worker.Tests/Probes/AlarmClientWmProbeTests.cs +++ b/src/MxGateway.Worker.Tests/Probes/AlarmClientWmProbeTests.cs @@ -11,7 +11,7 @@ using aaAlarmManagedClient; using ArchestrA.MxAccess; using Xunit.Abstractions; -namespace MxGateway.Worker.Tests; +namespace MxGateway.Worker.Tests.Probes; /// /// Runtime probe — registers as an AlarmClient consumer with a real diff --git a/src/MxGateway.Worker.Tests/Probes/AlarmsLiveSmokeTests.cs b/src/MxGateway.Worker.Tests/Probes/AlarmsLiveSmokeTests.cs index 77c1aba..bb3f7c7 100644 --- a/src/MxGateway.Worker.Tests/Probes/AlarmsLiveSmokeTests.cs +++ b/src/MxGateway.Worker.Tests/Probes/AlarmsLiveSmokeTests.cs @@ -6,7 +6,7 @@ using MxGateway.Contracts.Proto; using MxGateway.Worker.MxAccess; using Xunit.Abstractions; -namespace MxGateway.Worker.Tests; +namespace MxGateway.Worker.Tests.Probes; /// /// Live dev-rig smoke test for the alarms-over-gateway pipeline. 
diff --git a/src/MxGateway.Worker.Tests/Probes/WnWrapConsumerProbeTests.cs b/src/MxGateway.Worker.Tests/Probes/WnWrapConsumerProbeTests.cs index e12d6fe..b287683 100644 --- a/src/MxGateway.Worker.Tests/Probes/WnWrapConsumerProbeTests.cs +++ b/src/MxGateway.Worker.Tests/Probes/WnWrapConsumerProbeTests.cs @@ -7,7 +7,7 @@ using System.Threading; using WNWRAPCONSUMERLib; using Xunit.Abstractions; -namespace MxGateway.Worker.Tests; +namespace MxGateway.Worker.Tests.Probes; /// /// Runtime probe — instantiate AVEVA's standalone wnwrapConsumer COM diff --git a/src/MxGateway.Worker.Tests/TestSupport/FakeRuntimeSession.cs b/src/MxGateway.Worker.Tests/TestSupport/FakeRuntimeSession.cs index 2926381..46b56ff 100644 --- a/src/MxGateway.Worker.Tests/TestSupport/FakeRuntimeSession.cs +++ b/src/MxGateway.Worker.Tests/TestSupport/FakeRuntimeSession.cs @@ -166,12 +166,34 @@ internal sealed class FakeRuntimeSession : IWorkerRuntimeSession } } + private bool cancelCommandReturnValue; + /// /// Optional return value yielded by . /// Defaults to false (the runtime had no matching in-flight - /// command), matching the previous test-double behaviour. + /// command), matching the previous test-double behaviour. Mutated + /// and read under lock(gate) to match the locking convention + /// the rest of this fake uses for cancelledCorrelationIds, + /// snapshot, and events (Worker.Tests-027). /// - public bool CancelCommandReturnValue { get; set; } + public bool CancelCommandReturnValue + { + get + { + lock (gate) + { + return cancelCommandReturnValue; + } + } + + set + { + lock (gate) + { + cancelCommandReturnValue = value; + } + } + } /// Cancels command by correlation ID. /// The command correlation ID. @@ -181,9 +203,8 @@ internal sealed class FakeRuntimeSession : IWorkerRuntimeSession lock (gate) { cancelledCorrelationIds.Add(correlationId); + return cancelCommandReturnValue; } - - return CancelCommandReturnValue; } /// Requests graceful shutdown. 
diff --git a/src/MxGateway.Worker.Tests/TestSupport/LiveMxAccessFactAttribute.cs b/src/MxGateway.Worker.Tests/TestSupport/LiveMxAccessFactAttribute.cs index bf1030c..066edcf 100644 --- a/src/MxGateway.Worker.Tests/TestSupport/LiveMxAccessFactAttribute.cs +++ b/src/MxGateway.Worker.Tests/TestSupport/LiveMxAccessFactAttribute.cs @@ -1,17 +1,18 @@ using System; +using MxGateway.Contracts; namespace MxGateway.Worker.Tests.TestSupport; /// /// Marks an xUnit test as requiring installed MXAccess COM and live -/// provider state. When the opt-in environment variable -/// MXGATEWAY_RUN_LIVE_MXACCESS_TESTS is not set to 1, the -/// test is reported as Skipped by xUnit rather than silently -/// returning early (which xUnit would otherwise report as -/// Passed). Mirrors -/// MxGateway.IntegrationTests.LiveMxAccessFactAttribute; the -/// copy avoids a cross-project reference and keeps the Worker.Tests -/// net48/x86 build self-contained. +/// provider state. When the opt-in environment variable named by +/// is +/// not set to 1, the test is reported as Skipped by +/// xUnit rather than silently returning early (which xUnit would +/// otherwise report as Passed). Mirrors +/// MxGateway.IntegrationTests.LiveMxAccessFactAttribute; both +/// copies bind to the same GatewayContractInfo constant so the +/// env-var name has a single literal source of truth (Worker.Tests-025). /// public sealed class LiveMxAccessFactAttribute : FactAttribute { @@ -19,8 +20,10 @@ public sealed class LiveMxAccessFactAttribute : FactAttribute /// The environment variable that opts the suite into running live /// MXAccess COM tests. Must be set to 1 on a machine with the /// installed MXAccess runtime and a reachable Galaxy provider. + /// Sourced from + /// so a single constant gates both Worker.Tests and IntegrationTests. 
/// - public const string LiveMxAccessVariableName = "MXGATEWAY_RUN_LIVE_MXACCESS_TESTS"; + public const string LiveMxAccessVariableName = GatewayContractInfo.LiveMxAccessOptInVariableName; /// Initializes the attribute, skipping the test unless the env var is set. public LiveMxAccessFactAttribute() diff --git a/src/MxGateway.Worker/Ipc/WorkerPipeSession.cs b/src/MxGateway.Worker/Ipc/WorkerPipeSession.cs index 7adc38e..5f04145 100644 --- a/src/MxGateway.Worker/Ipc/WorkerPipeSession.cs +++ b/src/MxGateway.Worker/Ipc/WorkerPipeSession.cs @@ -51,7 +51,7 @@ public sealed class WorkerPipeSession options, () => Process.GetCurrentProcess().Id, new WorkerPipeSessionOptions(), - () => new MxAccessStaSession(eq => new AlarmCommandHandler(eq)), + () => new MxAccessStaSession((eq, affinity) => new AlarmCommandHandler(eq, () => new WnWrapAlarmConsumer(), affinity)), logger) { } @@ -72,7 +72,7 @@ public sealed class WorkerPipeSession options, processIdProvider, new WorkerPipeSessionOptions(), - () => new MxAccessStaSession(eq => new AlarmCommandHandler(eq)), + () => new MxAccessStaSession((eq, affinity) => new AlarmCommandHandler(eq, () => new WnWrapAlarmConsumer(), affinity)), logger: null) { } @@ -108,7 +108,16 @@ public sealed class WorkerPipeSession /// Token to cancel the asynchronous operation. public async Task RunAsync(CancellationToken cancellationToken = default) { - _runtimeSession = _runtimeSessionFactory(); + // Worker-025: the factory delegate itself is null-checked in the + // constructor, but its return value is not — a factory that returned + // null would NRE on the StartAsync lambda below. Throw a diagnostic + // exception instead so the failure is unambiguous (and so the + // finally block's _runtimeSession?.Dispose() can't silently no-op + // on a torn half-initialized session). Mirrors the same pattern + // AlarmCommandHandler.Subscribe uses for its consumerFactory(). + _runtimeSession = _runtimeSessionFactory() + ?? 
throw new InvalidOperationException( + "Worker runtime session factory returned null."); try { await CompleteStartupHandshakeAsync( @@ -625,6 +634,18 @@ public sealed class WorkerPipeSession /// STA (no command in flight and no activity), which is the only case /// the watchdog can usefully distinguish from a slow command. /// + /// + /// Worker-023: the in-flight-command suppression is itself bounded by + /// WorkerPipeSessionOptions.HeartbeatStuckCeiling. A truly stuck + /// synchronous COM call (e.g. against a dead MXAccess provider whose + /// cross-apartment marshaler is permanently blocked) leaves + /// CurrentCommandCorrelationId non-empty forever; without an + /// upper bound the worker-side StaHung watchdog would be + /// permanently defeated and only the gateway's per-command timeout + /// would catch the hang. Once LastActivityUtc has been stale + /// for longer than HeartbeatStuckCeiling the watchdog fires + /// StaHung regardless of whether a command is in flight. + /// private async Task ReportWatchdogFaultIfNeededAsync( WorkerRuntimeHeartbeatSnapshot snapshot, CancellationToken cancellationToken) @@ -636,14 +657,22 @@ public sealed class WorkerPipeSession return; } - if (!string.IsNullOrEmpty(snapshot.CurrentCommandCorrelationId)) + if (!string.IsNullOrEmpty(snapshot.CurrentCommandCorrelationId) + && staleFor <= _sessionOptions.HeartbeatStuckCeiling) { - // A command is in flight — the STA is busy executing it, not + // A command is in flight and we are still within the defensive + // suppression ceiling — the STA is busy executing it, not // hung. The next MarkActivity() in StaRuntime.ProcessQueuedCommands // will refresh LastActivityUtc once the command returns, at which // point this branch stops being taken. The heartbeat already // surfaces the in-flight correlation id so the gateway can apply // its own per-command timeout if it considers the command too slow. 
+ // + // Worker-023: once staleFor exceeds HeartbeatStuckCeiling we fall + // through to the fault path even with a command in flight — a + // truly stuck synchronous COM call would otherwise keep + // CurrentCommandCorrelationId non-empty indefinitely and the + // worker-side watchdog would never fire. return; } @@ -837,7 +866,8 @@ public sealed class WorkerPipeSession // is preserved for the legacy direct-invocation path where the // parameterless CompleteStartupHandshakeAsync is used without a // prior factory call. - _runtimeSession ??= new MxAccessStaSession(eq => new AlarmCommandHandler(eq)); + _runtimeSession ??= new MxAccessStaSession( + (eq, affinity) => new AlarmCommandHandler(eq, () => new WnWrapAlarmConsumer(), affinity)); IWorkerRuntimeSession session = _runtimeSession; try { diff --git a/src/MxGateway.Worker/Ipc/WorkerPipeSessionOptions.cs b/src/MxGateway.Worker/Ipc/WorkerPipeSessionOptions.cs index 4b82ff0..ce07008 100644 --- a/src/MxGateway.Worker/Ipc/WorkerPipeSessionOptions.cs +++ b/src/MxGateway.Worker/Ipc/WorkerPipeSessionOptions.cs @@ -9,12 +9,21 @@ public sealed class WorkerPipeSessionOptions public static readonly TimeSpan DefaultHeartbeatInterval = TimeSpan.FromSeconds(5); /// Default heartbeat grace period (15 seconds). public static readonly TimeSpan DefaultHeartbeatGrace = TimeSpan.FromSeconds(15); + /// + /// Default defensive ceiling beyond which the watchdog fires + /// + /// even while a command is in flight (75 seconds = 5 × + /// ). See + /// for the rationale. + /// + public static readonly TimeSpan DefaultHeartbeatStuckCeiling = TimeSpan.FromSeconds(75); /// Initializes a new instance of the WorkerPipeSessionOptions class with default values. public WorkerPipeSessionOptions() { HeartbeatInterval = DefaultHeartbeatInterval; HeartbeatGrace = DefaultHeartbeatGrace; + HeartbeatStuckCeiling = DefaultHeartbeatStuckCeiling; } /// Gets or sets the heartbeat interval. 
@@ -23,6 +32,27 @@ public sealed class WorkerPipeSessionOptions /// Gets or sets the heartbeat grace period. public TimeSpan HeartbeatGrace { get; set; } + /// + /// Gets or sets the defensive upper bound on how long the watchdog + /// will suppress its StaHung fault while a command is in + /// flight. Worker-017 suppresses the watchdog when the heartbeat + /// snapshot's CurrentCommandCorrelationId is non-empty so a + /// legitimately slow command (e.g. ReadBulk against many + /// uncached tags) does not self-fault — but a truly stuck + /// synchronous COM call against a dead MXAccess provider leaves + /// CurrentCommandCorrelationId non-empty forever and would + /// permanently defeat the watchdog. HeartbeatStuckCeiling is + /// the upper bound on that suppression: once + /// LastStaActivityUtc has been stale for longer than this + /// ceiling, the watchdog DOES fire StaHung even with a + /// command in flight, on the assumption that no legitimate STA + /// command should run that long without periodically refreshing + /// activity. Default is + /// (75 seconds = 5 × ); raise + /// for deployments that run very long bulk operations. + /// + public TimeSpan HeartbeatStuckCeiling { get; set; } + /// Validates the session options. 
public void Validate() { @@ -39,5 +69,20 @@ public sealed class WorkerPipeSessionOptions nameof(HeartbeatGrace), "Worker heartbeat grace must be greater than zero."); } + + if (HeartbeatStuckCeiling <= TimeSpan.Zero) + { + throw new ArgumentOutOfRangeException( + nameof(HeartbeatStuckCeiling), + "Worker heartbeat stuck ceiling must be greater than zero."); + } + + if (HeartbeatStuckCeiling <= HeartbeatGrace) + { + throw new ArgumentOutOfRangeException( + nameof(HeartbeatStuckCeiling), + "Worker heartbeat stuck ceiling must be greater than HeartbeatGrace; " + + "otherwise it would fire before the in-flight-command suppression had any effect."); + } } } diff --git a/src/MxGateway.Worker/MxAccess/AlarmCommandHandler.cs b/src/MxGateway.Worker/MxAccess/AlarmCommandHandler.cs index 3a97839..1f7db81 100644 --- a/src/MxGateway.Worker/MxAccess/AlarmCommandHandler.cs +++ b/src/MxGateway.Worker/MxAccess/AlarmCommandHandler.cs @@ -36,12 +36,13 @@ public sealed class AlarmCommandHandler : IAlarmCommandHandler { private readonly MxAccessEventQueue eventQueue; private readonly Func consumerFactory; + private readonly Action? threadAffinityCheck; private readonly object syncRoot = new object(); private AlarmDispatcher? dispatcher; private bool disposed; public AlarmCommandHandler(MxAccessEventQueue eventQueue) - : this(eventQueue, () => new WnWrapAlarmConsumer()) + : this(eventQueue, () => new WnWrapAlarmConsumer(), threadAffinityCheck: null) { } @@ -49,9 +50,32 @@ public sealed class AlarmCommandHandler : IAlarmCommandHandler public AlarmCommandHandler( MxAccessEventQueue eventQueue, Func consumerFactory) + : this(eventQueue, consumerFactory, threadAffinityCheck: null) + { + } + + /// + /// Worker-024: production constructor that also injects an + /// STA-affinity guard. 
is + /// invoked at the entry of every method that touches the underlying + /// (or the wnwrap COM object + /// through it) — , , + /// , , + /// , — so an + /// off-STA call raises a programming-error diagnostic instead of + /// deadlocking on cross-apartment marshaling to the + /// ThreadingModel=Apartment wnwrap CLSID. The guard is + /// optional: tests that already drive the handler on a single + /// thread can pass null. + /// + public AlarmCommandHandler( + MxAccessEventQueue eventQueue, + Func consumerFactory, + Action? threadAffinityCheck) { this.eventQueue = eventQueue ?? throw new ArgumentNullException(nameof(eventQueue)); this.consumerFactory = consumerFactory ?? throw new ArgumentNullException(nameof(consumerFactory)); + this.threadAffinityCheck = threadAffinityCheck; } public bool IsSubscribed @@ -64,6 +88,7 @@ public sealed class AlarmCommandHandler : IAlarmCommandHandler { if (disposed) throw new ObjectDisposedException(nameof(AlarmCommandHandler)); if (subscription is null) throw new ArgumentNullException(nameof(subscription)); + threadAffinityCheck?.Invoke(); lock (syncRoot) { @@ -94,6 +119,7 @@ public sealed class AlarmCommandHandler : IAlarmCommandHandler /// public void Unsubscribe() { + threadAffinityCheck?.Invoke(); AlarmDispatcher? toDispose; lock (syncRoot) { @@ -112,6 +138,7 @@ public sealed class AlarmCommandHandler : IAlarmCommandHandler string operatorDomain, string operatorFullName) { + threadAffinityCheck?.Invoke(); AlarmDispatcher? d = GetDispatcherOrThrow(); return d.Acknowledge( alarmGuid, @@ -133,6 +160,7 @@ public sealed class AlarmCommandHandler : IAlarmCommandHandler string operatorDomain, string operatorFullName) { + threadAffinityCheck?.Invoke(); AlarmDispatcher? d = GetDispatcherOrThrow(); return d.AcknowledgeByName( alarmName ?? string.Empty, @@ -148,6 +176,7 @@ public sealed class AlarmCommandHandler : IAlarmCommandHandler /// public IReadOnlyList QueryActive(string? 
alarmFilterPrefix) { + threadAffinityCheck?.Invoke(); AlarmDispatcher? d = GetDispatcherOrThrow(); IReadOnlyList all = d.SnapshotActiveAlarms(); if (string.IsNullOrEmpty(alarmFilterPrefix)) return all; @@ -165,6 +194,7 @@ public sealed class AlarmCommandHandler : IAlarmCommandHandler /// public void PollOnce() { + threadAffinityCheck?.Invoke(); AlarmDispatcher? d; lock (syncRoot) d = dispatcher; // No-op when not yet subscribed or already disposed. diff --git a/src/MxGateway.Worker/MxAccess/MxAccessSession.cs b/src/MxGateway.Worker/MxAccess/MxAccessSession.cs index 6758616..7f74f3f 100644 --- a/src/MxGateway.Worker/MxAccess/MxAccessSession.cs +++ b/src/MxGateway.Worker/MxAccess/MxAccessSession.cs @@ -65,12 +65,23 @@ public sealed class MxAccessSession : IDisposable /// session methods without touching MXAccess COM. This is exposed via /// InternalsVisibleTo("MxGateway.Worker.Tests"); production code /// must use the factory. + /// + /// A runtime guard rejects an — + /// the production sink wired by — because the + /// new object() stand-in this factory uses for the COM object + /// would silently bypass + /// during disposal and mask lifetime regressions (Worker.Tests-026). /// /// The server abstraction to drive. /// The event sink to attach to the session. /// Optional handle registry; a fresh one is created when null. /// Optional value cache; a fresh one is created when null. /// Optional creation thread id; defaults to the current managed thread id. + /// + /// Thrown when is the production + /// . Tests must pass a test + /// double sink — production code must use . + /// internal static MxAccessSession CreateForTesting( IMxAccessServer mxAccessServer, IMxAccessEventSink eventSink, @@ -78,6 +89,14 @@ public sealed class MxAccessSession : IDisposable MxAccessValueCache? valueCache = null, int? 
creationThreadId = null) { + if (eventSink is MxAccessBaseEventSink) + { + throw new ArgumentException( + "CreateForTesting must not be used with the production MxAccessBaseEventSink. " + + "Use MxAccessSession.Create for production code; pass a test-double IMxAccessEventSink here.", + nameof(eventSink)); + } + return new MxAccessSession( new object(), mxAccessServer, diff --git a/src/MxGateway.Worker/MxAccess/MxAccessStaSession.cs b/src/MxGateway.Worker/MxAccess/MxAccessStaSession.cs index adbf5b2..2561842 100644 --- a/src/MxGateway.Worker/MxAccess/MxAccessStaSession.cs +++ b/src/MxGateway.Worker/MxAccess/MxAccessStaSession.cs @@ -17,7 +17,13 @@ public sealed class MxAccessStaSession : IWorkerRuntimeSession private readonly IMxAccessEventSink eventSink; private readonly MxAccessEventQueue eventQueue; private readonly StaRuntime staRuntime; - private readonly Func? alarmCommandHandlerFactory; + // Worker-024: the factory takes an Action so MxAccessStaSession can hand + // the alarm handler its STA-affinity guard (a closure over + // alarmConsumerThreadId captured at the factory call site). The handler + // then invokes the guard at the entry of every method that touches the + // wnwrap consumer, matching the STA-affinity invariant already enforced + // for the poll path via EnsureOnAlarmConsumerThread. + private readonly Func? alarmCommandHandlerFactory; private StaCommandDispatcher? commandDispatcher; private MxAccessSession? session; private IAlarmCommandHandler? alarmCommandHandler; @@ -44,7 +50,7 @@ public sealed class MxAccessStaSession : IWorkerRuntimeSession /// ; pass null to opt out /// of alarm-side commands. /// - internal MxAccessStaSession(Func? alarmCommandHandlerFactory) + internal MxAccessStaSession(Func? 
alarmCommandHandlerFactory) : this( new StaRuntime(), new MxAccessComObjectFactory(), @@ -96,7 +102,7 @@ public sealed class MxAccessStaSession : IWorkerRuntimeSession StaRuntime staRuntime, IMxAccessComObjectFactory factory, MxAccessEventQueue eventQueue, - Func? alarmCommandHandlerFactory) + Func? alarmCommandHandlerFactory) : this(staRuntime, factory, new MxAccessBaseEventSink(eventQueue), eventQueue, alarmCommandHandlerFactory) { } @@ -129,7 +135,7 @@ public sealed class MxAccessStaSession : IWorkerRuntimeSession IMxAccessComObjectFactory factory, IMxAccessEventSink eventSink, MxAccessEventQueue eventQueue, - Func? alarmCommandHandlerFactory) + Func? alarmCommandHandlerFactory) { this.staRuntime = staRuntime ?? throw new ArgumentNullException(nameof(staRuntime)); this.factory = factory ?? throw new ArgumentNullException(nameof(factory)); @@ -189,7 +195,17 @@ public sealed class MxAccessStaSession : IWorkerRuntimeSession // thread id; RunAlarmPollLoopAsync then asserts each // PollOnce executes on the same thread. alarmConsumerThreadId = Environment.CurrentManagedThreadId; - alarmCommandHandler = alarmCommandHandlerFactory(eventQueue); + // Worker-024: hand the handler an affinity guard so each + // of its command-path entries (Subscribe / Acknowledge / + // AcknowledgeByName / QueryActive / Unsubscribe / PollOnce) + // asserts the same STA-affinity invariant the poll path + // already enforced. Without this the command path relied + // on convention alone; a future refactor that let a + // command run off-STA would silently deadlock on + // cross-apartment marshaling against the wnwrap consumer. + alarmCommandHandler = alarmCommandHandlerFactory( + eventQueue, + EnsureOnAlarmConsumerThread); } commandDispatcher = new StaCommandDispatcher( staRuntime,