diff --git a/docs/GatewayTesting.md b/docs/GatewayTesting.md index 901d13e..45c1eca 100644 --- a/docs/GatewayTesting.md +++ b/docs/GatewayTesting.md @@ -261,7 +261,14 @@ path and writes a JSON report under `artifacts/e2e/`: 1. **Session + register** — opens one session and registers. 2. **Bulk** — verifies `SubscribeBulk` / `UnsubscribeBulk` on a bounded tag subset (skip with `-SkipBulk`). -3. **Add-item / advise** — adds and advises every discovered test tag. +3. **Add-item / advise** — adds and advises every discovered test tag. The + loop has no `StreamEvents` consumer attached, so advised tags accumulate + MXAccess change events in the worker event channel + (`MxGateway:Events:QueueCapacity`); left undrained, it overflows under + `FailFast` backpressure and faults the worker. Every `-DrainEveryTags` + advised tags (default 15) the loop connects a short-lived `StreamEvents` + drain so the gateway pumps that channel empty. `-DrainEveryTags 0` disables + the drain. 4. **Stream** — asserts a bounded event stream delivers at least one event (skip with `-SkipStream`). 5. **Parity** — asserts MXAccess error paths are rejected rather than silently @@ -287,6 +294,16 @@ path and writes a JSON report under `artifacts/e2e/`: write support (`MxAccessCommandExecutor` returning `InvalidRequest` for `Write`/`Write2`/`WriteSecured`/`WriteSecured2`). +Before the per-client phases run, the script builds the .NET CLI +(`dotnet build`) and installs the Java CLI (`gradle :mxgateway-cli:installDist`) +once, then invokes the compiled artifacts directly. The matrix issues several +hundred CLI calls per client; invoking `dotnet run` / `gradle +:mxgateway-cli:run` per call rebuilds and cold-starts the toolchain every time, +which stretches the add-item/advise loop long enough for the worker event +channel to overflow under `FailFast` backpressure. 
The Go, Rust, and Python +clients still build on demand (`go run` / `cargo run` / `python -m`) because +their per-call startup is already sub-second. + Build the gateway and worker, start the gateway, and provide a valid API key before running the client e2e script: diff --git a/scripts/run-client-e2e-tests.ps1 b/scripts/run-client-e2e-tests.ps1 index 7c93436..b1acc69 100644 --- a/scripts/run-client-e2e-tests.ps1 +++ b/scripts/run-client-e2e-tests.ps1 @@ -31,6 +31,14 @@ param( [string]$Database = "ZB", [int]$EventLimit = 5, [int]$BulkTagCount = 6, + # The per-tag advise loop advises every discovered tag with no StreamEvents + # consumer attached, so MXAccess change events accumulate in the worker + # event channel (MxGateway:Events:QueueCapacity). Left undrained, the channel + # overflows under FailFast backpressure and faults the worker — slow, + # process-per-call clients (the Java CLI) hit this before the loop ends. + # Every DrainEveryTags advised tags the loop connects a short-lived + # StreamEvents drain so the gateway pumps that channel empty. 0 disables it. + [int]$DrainEveryTags = 15, [switch]$SkipStream, [switch]$SkipBulk, # Skip the bulk read+write coverage that runs alongside the existing @@ -97,6 +105,10 @@ if ($BulkTagCount -lt 1) { throw "BulkTagCount must be greater than zero." } +if ($DrainEveryTags -lt 0) { + throw "DrainEveryTags cannot be negative." +} + if ($WriteEchoMaxEvents -lt 1) { throw "WriteEchoMaxEvents must be greater than zero." } @@ -454,6 +466,49 @@ function Assert-BulkResults { } } +# Builds the dotnet and Java client CLIs once up front and records the path to +# each compiled artifact. The e2e matrix issues ~250 CLI calls per client; +# invoking `dotnet run` / `gradle :mxgateway-cli:run` per call rebuilds and +# cold-starts the toolchain every time, stretching the per-tag advise loop long +# enough for the worker event channel to overflow under the FailFast +# backpressure policy. 
Running the compiled artifact keeps per-call latency +# sub-second, matching the Go/Rust/Python paths. +function Initialize-ClientBuilds { + if ($Clients -contains "dotnet") { + $cliProject = Join-Path $repoRoot "clients/dotnet/MxGateway.Client.Cli/MxGateway.Client.Cli.csproj" + $script:dotnetCliExe = Join-Path $repoRoot ` + "clients/dotnet/MxGateway.Client.Cli/bin/Debug/net10.0/MxGateway.Client.Cli.exe" + if (-not $DryRun) { + Write-Host "Building the .NET client CLI once: $cliProject" + Invoke-NativeCommand -FilePath "dotnet" ` + -Arguments @("build", $cliProject, "-c", "Debug", "--nologo", "-v", "quiet") ` + -WorkingDirectory $repoRoot | Out-Null + if (-not (Test-Path $script:dotnetCliExe)) { + throw "The .NET client CLI build did not produce '$script:dotnetCliExe'." + } + } + } + + if ($Clients -contains "java") { + $script:javaCliBat = Join-Path $repoRoot ` + "clients/java/mxgateway-cli/build/install/mxgateway-cli/bin/mxgateway-cli.bat" + if (-not $DryRun) { + $gradleCommand = Get-Command "gradle.bat", "gradle.cmd", "gradle.exe", "gradle" ` + -ErrorAction SilentlyContinue | Select-Object -First 1 + if ($null -eq $gradleCommand) { + throw "The 'gradle' command was not found on PATH; the Java client e2e flow requires Gradle." + } + Write-Host "Installing the Java client CLI once via :mxgateway-cli:installDist" + Invoke-NativeCommand -FilePath "cmd.exe" ` + -Arguments @("/c", $gradleCommand.Source, "--quiet", ":mxgateway-cli:installDist") ` + -WorkingDirectory (Join-Path $repoRoot "clients/java") | Out-Null + if (-not (Test-Path $script:javaCliBat)) { + throw "The Java client CLI install did not produce '$script:javaCliBat'." 
+ } + } + } +} + function Get-ClientCommand { param( [string]$Client, @@ -476,7 +531,6 @@ function Get-ClientCommand { switch ($Client) { "dotnet" { $arguments = @( - "run", "--project", "clients/dotnet/MxGateway.Client.Cli", "--", $Operation, "--endpoint", $httpEndpoint, "--api-key-env", $ApiKeyEnvName, @@ -509,7 +563,7 @@ function Get-ClientCommand { } elseif ($Operation -eq "close-session") { $arguments += @("--session-id", $Values.sessionId) } - return [pscustomobject]@{ file = "dotnet"; args = $arguments; cwd = $repoRoot; env = @{} } + return [pscustomobject]@{ file = $script:dotnetCliExe; args = $arguments; cwd = $repoRoot; env = @{} } } "go" { $arguments = @( @@ -657,18 +711,15 @@ function Get-ClientCommand { } elseif ($Operation -eq "close-session") { $cliArgs += @("--session-id", $Values.sessionId) } - $arguments = @("--quiet", ":mxgateway-cli:run", "--args=$($cliArgs -join ' ')") - # Gradle ships as gradle.bat on Windows; .NET's Process.Start - # (UseShellExecute=false) cannot launch a batch file directly, so - # resolve the launcher and run it through cmd.exe. - $gradleCommand = Get-Command "gradle.bat", "gradle.cmd", "gradle.exe", "gradle" ` - -ErrorAction SilentlyContinue | Select-Object -First 1 - if ($null -eq $gradleCommand) { - throw "The 'gradle' command was not found on PATH; the Java client e2e flow requires Gradle." - } + # The Java CLI is installed once up front (gradle + # :mxgateway-cli:installDist) so each call runs the generated + # launcher script directly instead of paying Gradle configuration + # plus a JVM cold-start per invocation. .NET's Process.Start + # (UseShellExecute=false) cannot launch a .bat directly, so the + # launcher runs through cmd.exe. 
return [pscustomobject]@{ file = "cmd.exe" - args = @("/c", $gradleCommand.Source) + $arguments + args = @("/c", $script:javaCliBat) + $cliArgs cwd = (Join-Path $repoRoot "clients/java") env = @{} } @@ -796,6 +847,30 @@ function Invoke-ClientOperationExpectingFailure { -AllowFailure } +# Connects a short-lived StreamEvents consumer so the gateway empties the worker +# event channel. The per-tag advise loop advises every discovered tag with no +# consumer attached; without periodic draining the worker event channel +# (MxGateway:Events:QueueCapacity) overflows under FailFast backpressure and +# faults the worker. +# +# A small bounded read is enough: the gateway's per-stream producer +# (EventStreamService.ProduceEventsAsync) races ahead of the CLI and pulls the +# entire worker event channel into its own buffer the instant a subscriber +# attaches, so the channel is emptied long before the CLI finishes reading +# these events. Run via the expecting-failure path so the drain's exit code is +# ignored — its purpose is the side effect (emptying the channel), not output. +function Invoke-EventDrain { + param( + [string]$Client, + [string]$SessionId + ) + + Invoke-ClientOperationExpectingFailure -Client $Client -Operation "stream-events" -Values @{ + sessionId = $SessionId + maxEvents = 200 + } | Out-Null +} + # Runs the full e2e flow for a single language client and returns the result # record. Discovered tags are passed in so the (slow) SQL discovery runs once. 
function Invoke-ClientFlow { @@ -1000,6 +1075,7 @@ function Invoke-ClientFlow { } } + $advisedSinceDrain = 0 foreach ($tag in $Tags) { $addJson = Invoke-ClientOperation -Client $Client -Operation "add-item" -Values @{ sessionId = $sessionId @@ -1020,6 +1096,15 @@ function Invoke-ClientFlow { itemHandle = $itemHandle protectedWriteRequired = $tag.attributeName -eq "ProtectedValue" } + + # Drain the worker event channel every DrainEveryTags advised tags + # so this unbounded advise loop cannot overflow it and fault the + # worker before the loop completes. + $advisedSinceDrain++ + if ($DrainEveryTags -gt 0 -and $advisedSinceDrain -ge $DrainEveryTags) { + Invoke-EventDrain -Client $Client -SessionId $sessionId + $advisedSinceDrain = 0 + } } # --- Event streaming ---------------------------------------------- @@ -1131,6 +1216,7 @@ function Get-ChildArgumentList { "-Database", $Database, "-EventLimit", "$EventLimit", "-BulkTagCount", "$BulkTagCount", + "-DrainEveryTags", "$DrainEveryTags", "-WriteAttribute", $WriteAttribute, "-WriteType", $WriteType, "-WriteValueBase", "$WriteValueBase", @@ -1219,6 +1305,7 @@ if ($Parallel -and $Clients.Count -gt 1) { attributes = $Attributes eventLimit = $EventLimit bulkTagCount = $BulkTagCount + drainEveryTags = $DrainEveryTags skipStream = [bool]$SkipStream skipBulk = [bool]$SkipBulk verifyWrite = [bool]$VerifyWrite @@ -1247,6 +1334,8 @@ if ($Parallel -and $Clients.Count -gt 1) { } # --- Serial mode ----------------------------------------------------------- +Initialize-ClientBuilds + $discoveryJson = & $discoveryScript ` -MachineStart $MachineStart ` -MachineEnd $MachineEnd ` @@ -1277,6 +1366,7 @@ $run = [ordered]@{ attributes = $Attributes eventLimit = $EventLimit bulkTagCount = $BulkTagCount + drainEveryTags = $DrainEveryTags skipStream = [bool]$SkipStream skipBulk = [bool]$SkipBulk verifyWrite = [bool]$VerifyWrite