e2e: build client CLIs once and drain events so dotnet/java pass

The cross-language client e2e matrix failed for dotnet and Java. Both
failures were in the harness, not the client code.

1. Per-call toolchain cold-start. The matrix issues ~250 CLI calls per
   client; it invoked `dotnet run` / `gradle :mxgateway-cli:run` every
   time, rebuilding and cold-starting the toolchain per call. Build each
   CLI once up front (`dotnet build`, `gradle :mxgateway-cli:installDist`)
   and invoke the compiled artifact directly. This alone fixes dotnet.

2. Worker event-channel overflow. The per-tag advise loop advises every
   discovered tag with no StreamEvents consumer attached, so change
   events accumulate in the worker event channel
   (MxGateway:Events:QueueCapacity) until FailFast faults the worker.
   dotnet's faster loop slipped under the window; the Java CLI's
   process-per-call JVM cold-start did not. After every -DrainEveryTags
   advised tags (default 15) the loop now connects a short-lived
   StreamEvents drain; the gateway's per-stream producer empties the
   channel the instant a subscriber attaches, so a small bounded read
   suffices.

Full 5-client matrix (dotnet, go, rust, python, java) now passes with
-VerifyWrite against a live gateway.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Joseph Doherty
2026-05-21 05:24:24 -04:00
parent b794c46bc7
commit c1ff8c94e8
2 changed files with 120 additions and 13 deletions
+102 -12
View File
@@ -31,6 +31,14 @@ param(
[string]$Database = "ZB",
[int]$EventLimit = 5,
[int]$BulkTagCount = 6,
# The per-tag advise loop advises every discovered tag with no StreamEvents
# consumer attached, so MXAccess change events accumulate in the worker
# event channel (MxGateway:Events:QueueCapacity). Left undrained, the channel
# overflows under FailFast backpressure and faults the worker — slow,
# process-per-call clients (the Java CLI) hit this before the loop ends.
# Every DrainEveryTags advised tags the loop connects a short-lived
# StreamEvents drain so the gateway pumps that channel empty. 0 disables it.
[int]$DrainEveryTags = 15,
[switch]$SkipStream,
[switch]$SkipBulk,
# Skip the bulk read+write coverage that runs alongside the existing
@@ -97,6 +105,10 @@ if ($BulkTagCount -lt 1) {
throw "BulkTagCount must be greater than zero."
}
if ($DrainEveryTags -lt 0) {
throw "DrainEveryTags cannot be negative."
}
if ($WriteEchoMaxEvents -lt 1) {
throw "WriteEchoMaxEvents must be greater than zero."
}
@@ -454,6 +466,49 @@ function Assert-BulkResults {
}
}
# Builds the dotnet and Java client CLIs once up front and records the path to
# each compiled artifact. The e2e matrix issues ~250 CLI calls per client;
# invoking `dotnet run` / `gradle :mxgateway-cli:run` per call rebuilds and
# cold-starts the toolchain every time, stretching the per-tag advise loop long
# enough for the worker event channel to overflow under the FailFast
# backpressure policy. Running the compiled artifact keeps per-call latency
# sub-second, matching the Go/Rust/Python paths.
#
# Inputs (read from the enclosing script): $Clients, $repoRoot, $DryRun.
# Outputs (script scope):
#   $script:dotnetCliExe - expected path of the built .NET CLI executable,
#                          set when "dotnet" is among $Clients.
#   $script:javaCliBat   - expected path of the installed Java CLI launcher,
#                          set when "java" is among $Clients.
# Under -DryRun the paths are still recorded, but nothing is built and the
# artifacts are not checked for existence.
# Throws when Gradle cannot be found on PATH or when an expected artifact is
# missing after its build step.
function Initialize-ClientBuilds {
    if ($Clients -contains "dotnet") {
        $cliProject = Join-Path $repoRoot "clients/dotnet/MxGateway.Client.Cli/MxGateway.Client.Cli.csproj"
        $script:dotnetCliExe = Join-Path $repoRoot `
            "clients/dotnet/MxGateway.Client.Cli/bin/Debug/net10.0/MxGateway.Client.Cli.exe"
        if (-not $DryRun) {
            Write-Host "Building the .NET client CLI once: $cliProject"
            Invoke-NativeCommand -FilePath "dotnet" `
                -Arguments @("build", $cliProject, "-c", "Debug", "--nologo", "-v", "quiet") `
                -WorkingDirectory $repoRoot | Out-Null
            # -LiteralPath: a repo root containing wildcard characters such as
            # '[' or ']' must not be interpreted as a pattern. -PathType Leaf:
            # a directory of the same name must not satisfy the check.
            if (-not (Test-Path -LiteralPath $script:dotnetCliExe -PathType Leaf)) {
                throw "The .NET client CLI build did not produce '$script:dotnetCliExe'."
            }
        }
    }
    if ($Clients -contains "java") {
        $script:javaCliBat = Join-Path $repoRoot `
            "clients/java/mxgateway-cli/build/install/mxgateway-cli/bin/mxgateway-cli.bat"
        if (-not $DryRun) {
            # Take the first Gradle launcher that resolves on PATH, trying the
            # candidate names in the order listed.
            $gradleCommand = Get-Command "gradle.bat", "gradle.cmd", "gradle.exe", "gradle" `
                -ErrorAction SilentlyContinue | Select-Object -First 1
            if ($null -eq $gradleCommand) {
                throw "The 'gradle' command was not found on PATH; the Java client e2e flow requires Gradle."
            }
            Write-Host "Installing the Java client CLI once via :mxgateway-cli:installDist"
            # The resolved launcher is run through cmd.exe /c rather than
            # started directly.
            Invoke-NativeCommand -FilePath "cmd.exe" `
                -Arguments @("/c", $gradleCommand.Source, "--quiet", ":mxgateway-cli:installDist") `
                -WorkingDirectory (Join-Path $repoRoot "clients/java") | Out-Null
            # Same literal, file-only existence check as for the .NET artifact.
            if (-not (Test-Path -LiteralPath $script:javaCliBat -PathType Leaf)) {
                throw "The Java client CLI install did not produce '$script:javaCliBat'."
            }
        }
    }
}
function Get-ClientCommand {
param(
[string]$Client,
@@ -476,7 +531,6 @@ function Get-ClientCommand {
switch ($Client) {
"dotnet" {
$arguments = @(
"run", "--project", "clients/dotnet/MxGateway.Client.Cli", "--",
$Operation,
"--endpoint", $httpEndpoint,
"--api-key-env", $ApiKeyEnvName,
@@ -509,7 +563,7 @@ function Get-ClientCommand {
} elseif ($Operation -eq "close-session") {
$arguments += @("--session-id", $Values.sessionId)
}
return [pscustomobject]@{ file = "dotnet"; args = $arguments; cwd = $repoRoot; env = @{} }
return [pscustomobject]@{ file = $script:dotnetCliExe; args = $arguments; cwd = $repoRoot; env = @{} }
}
"go" {
$arguments = @(
@@ -657,18 +711,15 @@ function Get-ClientCommand {
} elseif ($Operation -eq "close-session") {
$cliArgs += @("--session-id", $Values.sessionId)
}
$arguments = @("--quiet", ":mxgateway-cli:run", "--args=$($cliArgs -join ' ')")
# Gradle ships as gradle.bat on Windows; .NET's Process.Start
# (UseShellExecute=false) cannot launch a batch file directly, so
# resolve the launcher and run it through cmd.exe.
$gradleCommand = Get-Command "gradle.bat", "gradle.cmd", "gradle.exe", "gradle" `
-ErrorAction SilentlyContinue | Select-Object -First 1
if ($null -eq $gradleCommand) {
throw "The 'gradle' command was not found on PATH; the Java client e2e flow requires Gradle."
}
# The Java CLI is installed once up front (gradle
# :mxgateway-cli:installDist) so each call runs the generated
# launcher script directly instead of paying Gradle configuration
# plus a JVM cold-start per invocation. .NET's Process.Start
# (UseShellExecute=false) cannot launch a .bat directly, so the
# launcher runs through cmd.exe.
return [pscustomobject]@{
file = "cmd.exe"
args = @("/c", $gradleCommand.Source) + $arguments
args = @("/c", $script:javaCliBat) + $cliArgs
cwd = (Join-Path $repoRoot "clients/java")
env = @{}
}
@@ -796,6 +847,30 @@ function Invoke-ClientOperationExpectingFailure {
-AllowFailure
}
# Attaches a short-lived StreamEvents consumer for the given session so the
# gateway empties the worker event channel
# (MxGateway:Events:QueueCapacity). The per-tag advise loop advises every
# discovered tag with no consumer attached; unless the channel is drained
# periodically it overflows under FailFast backpressure and faults the worker.
#
# The read is deliberately small and bounded: the gateway's per-stream producer
# (EventStreamService.ProduceEventsAsync) pulls the entire worker event channel
# into its own buffer as soon as a subscriber attaches, so the channel is empty
# long before the CLI has finished reading these events.
#
# Parameters:
#   Client    - language client whose CLI performs the drain.
#   SessionId - gateway session whose events are drained.
# Output is discarded, and the call goes through the expecting-failure path so
# the drain's exit code is ignored — only the side effect (emptying the
# channel) matters.
function Invoke-EventDrain {
    param(
        [string]$Client,
        [string]$SessionId
    )
    $drainValues = @{
        sessionId = $SessionId
        maxEvents = 200
    }
    $drainCall = @{
        Client    = $Client
        Operation = "stream-events"
        Values    = $drainValues
    }
    $null = Invoke-ClientOperationExpectingFailure @drainCall
}
# Runs the full e2e flow for a single language client and returns the result
# record. Discovered tags are passed in so the (slow) SQL discovery runs once.
function Invoke-ClientFlow {
@@ -1000,6 +1075,7 @@ function Invoke-ClientFlow {
}
}
$advisedSinceDrain = 0
foreach ($tag in $Tags) {
$addJson = Invoke-ClientOperation -Client $Client -Operation "add-item" -Values @{
sessionId = $sessionId
@@ -1020,6 +1096,15 @@ function Invoke-ClientFlow {
itemHandle = $itemHandle
protectedWriteRequired = $tag.attributeName -eq "ProtectedValue"
}
# Drain the worker event channel every DrainEveryTags advised tags
# so this unbounded advise loop cannot overflow it and fault the
# worker before the loop completes.
$advisedSinceDrain++
if ($DrainEveryTags -gt 0 -and $advisedSinceDrain -ge $DrainEveryTags) {
Invoke-EventDrain -Client $Client -SessionId $sessionId
$advisedSinceDrain = 0
}
}
# --- Event streaming ----------------------------------------------
@@ -1131,6 +1216,7 @@ function Get-ChildArgumentList {
"-Database", $Database,
"-EventLimit", "$EventLimit",
"-BulkTagCount", "$BulkTagCount",
"-DrainEveryTags", "$DrainEveryTags",
"-WriteAttribute", $WriteAttribute,
"-WriteType", $WriteType,
"-WriteValueBase", "$WriteValueBase",
@@ -1219,6 +1305,7 @@ if ($Parallel -and $Clients.Count -gt 1) {
attributes = $Attributes
eventLimit = $EventLimit
bulkTagCount = $BulkTagCount
drainEveryTags = $DrainEveryTags
skipStream = [bool]$SkipStream
skipBulk = [bool]$SkipBulk
verifyWrite = [bool]$VerifyWrite
@@ -1247,6 +1334,8 @@ if ($Parallel -and $Clients.Count -gt 1) {
}
# --- Serial mode -----------------------------------------------------------
Initialize-ClientBuilds
$discoveryJson = & $discoveryScript `
-MachineStart $MachineStart `
-MachineEnd $MachineEnd `
@@ -1277,6 +1366,7 @@ $run = [ordered]@{
attributes = $Attributes
eventLimit = $EventLimit
bulkTagCount = $BulkTagCount
drainEveryTags = $DrainEveryTags
skipStream = [bool]$SkipStream
skipBulk = [bool]$SkipBulk
verifyWrite = [bool]$VerifyWrite