Cross-language ReadBulk stress benchmark

Adds a bench-read-bulk subcommand to every client CLI (.NET, Go, Rust,
Python, Java) and a PowerShell driver that runs all five concurrently
against the deployed gateway and prints a side-by-side comparison.

Each CLI''s bench:

  - Opens its own session, registers, subscribes to bulk-size tags so the
    worker''s MxAccessValueCache populates from real OnDataChange events.
  - Runs a warmup-seconds-long pre-loop with identical calls so JIT /
    connection-pool / first-call overhead is amortised before the
    measurement window.
  - Runs ReadBulk in a tight in-process loop for duration-seconds with
    per-call high-resolution latency capture (Stopwatch in .NET,
    time.Now in Go, std::time::Instant in Rust, time.perf_counter in
    Python, System.nanoTime in Java).
  - Unsubscribes + closes the session, then emits one JSON object with
    the shared schema: { language, durationMs, totalCalls, successfulCalls,
    failedCalls, totalReadResults, cachedReadResults, callsPerSecond,
    latencyMs: { p50, p95, p99, max, mean } }.

The PS driver (scripts/bench-read-bulk.ps1) launches one detached process
per client, waits for all to finish, parses the trailing JSON object from
each stdout, prints a comparison table, and persists the combined report
under artifacts/bench/. Quoting around Java''s `gradle --args="..."` is
handled by writing a one-shot .bat that cmd.exe runs; the .NET CLI''s
per-call gRPC timeout is auto-scaled to (Duration + Warmup + 30s) so the
channel-wide timeout doesn''t cancel the bench mid-loop.

Live 30-second steady-state run against the deployed gateway, all five
clients hitting the same six TestMachine_001..006.TestChangingInt tags:

  client    calls/sec  cached/total    p50 ms  p95 ms  p99 ms  max ms
  dotnet      171.78   30924/30924      3.84   14.06   40.41  542.48
  go          175.46   31590/31590      3.93   13.52   41.26  243.00
  rust        123.26   22188/22188      5.52   15.78   48.11  544.41
  python      145.79   26244/26244      4.86   14.85   41.65  645.84
  java        181.12   32604/32604      3.80   10.59   33.37  344.27

143,550 ReadBulk results across all five clients during the 30s window;
100% were was_cached = true (the worker''s cache fast-path never fell
through to the snapshot lifecycle). Aggregate read throughput ~800
calls/sec against five concurrent sessions sharing the same cached tags.

A second variant with bulk-size 20 sustained the same per-client call
rate while delivering 3.3x more values per call (~37,000 cached reads/sec
aggregate across the five concurrent sessions), confirming the linear
per-tag cache lookup inside one call is not a bottleneck at this scale.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Joseph Doherty
2026-05-20 05:17:08 -04:00
parent eaa7093cd6
commit 93633ce99c
6 changed files with 1370 additions and 0 deletions
@@ -111,6 +111,8 @@ public static class MxGatewayClientCli
.ConfigureAwait(false),
"write-secured2-bulk" => await WriteSecured2BulkAsync(arguments, client, standardOutput, cancellation.Token)
.ConfigureAwait(false),
"bench-read-bulk" => await BenchReadBulkAsync(arguments, client, standardOutput, cancellation.Token)
.ConfigureAwait(false),
"stream-events" => await StreamEventsAsync(arguments, client, standardOutput, cancellation.Token)
.ConfigureAwait(false),
"write" => await WriteAsync(arguments, client, standardOutput, cancellation.Token)
@@ -581,6 +583,237 @@ public static class MxGatewayClientCli
cancellationToken);
}
/// <summary>
/// Cross-language stress benchmark for ReadBulk. Opens its own session,
/// subscribes to N tags so the worker's MxAccessValueCache populates from
/// real OnDataChange events, then hammers ReadBulk in a tight in-process
/// loop with per-call Stopwatch timing. Emits a single JSON object on
/// stdout that the scripts/bench-read-bulk.ps1 driver collates across
/// all five language clients.
/// </summary>
private static async Task<int> BenchReadBulkAsync(
CliArguments arguments,
IMxGatewayCliClient client,
TextWriter output,
CancellationToken cancellationToken)
{
int durationSeconds = arguments.GetInt32("duration-seconds", 30);
int warmupSeconds = arguments.GetInt32("warmup-seconds", 3);
int bulkSize = arguments.GetInt32("bulk-size", 6);
int tagStart = arguments.GetInt32("tag-start", 1);
string tagPrefix = arguments.GetOptional("tag-prefix") ?? "TestMachine_";
string tagAttribute = arguments.GetOptional("tag-attribute") ?? "TestChangingInt";
uint timeoutMs = (uint)arguments.GetInt32("timeout-ms", 1500);
string clientName = arguments.GetOptional("client-name") ?? "mxgw-dotnet-bench";
string[] tags = new string[bulkSize];
for (int i = 0; i < bulkSize; i++)
{
// TestMachine_NNN.<attribute>, three-digit machine numbers matching
// the existing e2e tag-discovery convention.
tags[i] = $"{tagPrefix}{(tagStart + i):D3}.{tagAttribute}";
}
// Open + register + subscribe-bulk so the cache populates before the
// measurement window opens.
OpenSessionReply openReply = await client.OpenSessionAsync(
new OpenSessionRequest { ClientSessionName = clientName, ClientCorrelationId = CreateCorrelationId() },
cancellationToken)
.ConfigureAwait(false);
string sessionId = openReply.SessionId;
try
{
MxCommandReply registerReply = await InvokeAndEnsureAsync(
client,
CreateCommandRequest(sessionId, new MxCommand
{
Kind = MxCommandKind.Register,
Register = new RegisterCommand { ClientName = clientName },
}),
cancellationToken)
.ConfigureAwait(false);
int serverHandle = registerReply.Register?.ServerHandle ?? registerReply.ReturnValue.Int32Value;
SubscribeBulkCommand subscribe = new() { ServerHandle = serverHandle };
subscribe.TagAddresses.Add(tags);
MxCommandReply subscribeReply = await InvokeAndEnsureAsync(
client,
CreateCommandRequest(sessionId, new MxCommand
{
Kind = MxCommandKind.SubscribeBulk,
SubscribeBulk = subscribe,
}),
cancellationToken)
.ConfigureAwait(false);
int[] itemHandles = subscribeReply.SubscribeBulk?.Results
.Where(r => r.WasSuccessful)
.Select(r => r.ItemHandle)
.ToArray() ?? [];
// Warm-up: drive the same call shape so the JIT / connection
// pipelines settle before the measurement window opens.
DateTime warmupDeadline = DateTime.UtcNow + TimeSpan.FromSeconds(warmupSeconds);
ReadBulkCommand readBulkCommand = new()
{
ServerHandle = serverHandle,
TimeoutMs = timeoutMs,
};
readBulkCommand.TagAddresses.Add(tags);
MxCommand readBulkMxCommand = new() { Kind = MxCommandKind.ReadBulk, ReadBulk = readBulkCommand };
while (DateTime.UtcNow < warmupDeadline)
{
_ = await client.InvokeAsync(
CreateCommandRequest(sessionId, readBulkMxCommand),
cancellationToken)
.ConfigureAwait(false);
}
// Steady state — capture per-call wall latency with a high-res
// Stopwatch so the resolution is sub-millisecond on modern Windows.
List<double> latencyMillis = new(capacity: 65536);
long totalReadResults = 0;
long cachedReadResults = 0;
int successfulCalls = 0;
int failedCalls = 0;
DateTime steadyDeadline = DateTime.UtcNow + TimeSpan.FromSeconds(durationSeconds);
DateTime steadyStart = DateTime.UtcNow;
while (DateTime.UtcNow < steadyDeadline)
{
System.Diagnostics.Stopwatch sw = System.Diagnostics.Stopwatch.StartNew();
MxCommandReply reply;
try
{
reply = await client.InvokeAsync(
CreateCommandRequest(sessionId, readBulkMxCommand),
cancellationToken)
.ConfigureAwait(false);
sw.Stop();
}
catch
{
sw.Stop();
failedCalls++;
latencyMillis.Add(sw.Elapsed.TotalMilliseconds);
continue;
}
latencyMillis.Add(sw.Elapsed.TotalMilliseconds);
if (reply.ProtocolStatus?.Code != ProtocolStatusCode.Ok)
{
failedCalls++;
continue;
}
successfulCalls++;
if (reply.ReadBulk is not null)
{
foreach (BulkReadResult r in reply.ReadBulk.Results)
{
totalReadResults++;
if (r.WasCached)
{
cachedReadResults++;
}
}
}
}
double steadyElapsedSeconds = (DateTime.UtcNow - steadyStart).TotalSeconds;
if (itemHandles.Length > 0)
{
UnsubscribeBulkCommand unsubscribe = new() { ServerHandle = serverHandle };
unsubscribe.ItemHandles.Add(itemHandles);
_ = await client.InvokeAsync(
CreateCommandRequest(sessionId, new MxCommand
{
Kind = MxCommandKind.UnsubscribeBulk,
UnsubscribeBulk = unsubscribe,
}),
cancellationToken)
.ConfigureAwait(false);
}
int totalCalls = successfulCalls + failedCalls;
double callsPerSecond = steadyElapsedSeconds > 0
? totalCalls / steadyElapsedSeconds
: 0;
object stats = new
{
language = "dotnet",
command = "bench-read-bulk",
endpoint = arguments.GetOptional("endpoint") ?? "(default)",
clientName,
bulkSize,
durationSeconds,
warmupSeconds,
durationMs = (long)(steadyElapsedSeconds * 1000),
tags,
totalCalls,
successfulCalls,
failedCalls,
totalReadResults,
cachedReadResults,
callsPerSecond = Math.Round(callsPerSecond, 2),
latencyMs = new
{
p50 = Percentile(latencyMillis, 0.50),
p95 = Percentile(latencyMillis, 0.95),
p99 = Percentile(latencyMillis, 0.99),
max = latencyMillis.Count > 0 ? Math.Round(latencyMillis.Max(), 3) : 0,
mean = latencyMillis.Count > 0 ? Math.Round(latencyMillis.Average(), 3) : 0,
},
};
output.WriteLine(JsonSerializer.Serialize(stats, JsonOptions));
return 0;
}
finally
{
try
{
await client.CloseSessionAsync(
new CloseSessionRequest { SessionId = sessionId, ClientCorrelationId = CreateCorrelationId() },
cancellationToken)
.ConfigureAwait(false);
}
catch
{
// Closing the session is best-effort — never let it mask a real bench error.
}
}
}
/// <summary>
/// Computes the requested percentile from an unsorted latency sample using
/// nearest-rank with linear interpolation. Rounds to 3 decimal places to
/// match the JSON schema the PS driver collates.
/// </summary>
private static double Percentile(IReadOnlyList<double> sample, double quantile)
{
if (sample.Count == 0)
{
return 0;
}
double[] sorted = sample.ToArray();
Array.Sort(sorted);
if (sorted.Length == 1)
{
return Math.Round(sorted[0], 3);
}
double rank = quantile * (sorted.Length - 1);
int lower = (int)Math.Floor(rank);
int upper = (int)Math.Ceiling(rank);
double fraction = rank - lower;
double value = sorted[lower] + (sorted[upper] - sorted[lower]) * fraction;
return Math.Round(value, 3);
}
/// <summary>
/// Parses the bulk-write CLI's <c>--values</c> list. All entries share
/// the single <c>--type</c> argument; the comma-separated values are
@@ -1239,6 +1472,7 @@ public static class MxGatewayClientCli
or "write2-bulk"
or "write-secured-bulk"
or "write-secured2-bulk"
or "bench-read-bulk"
or "stream-events"
or "write"
or "write2"