Cross-language ReadBulk stress benchmark

Adds a bench-read-bulk subcommand to every client CLI (.NET, Go, Rust,
Python, Java) and a PowerShell driver that runs all five concurrently
against the deployed gateway and prints a side-by-side comparison.

Each CLI's bench:

  - Opens its own session, registers, subscribes to bulk-size tags so the
    worker's MxAccessValueCache populates from real OnDataChange events.
  - Runs a warmup-seconds-long pre-loop with identical calls so JIT /
    connection-pool / first-call overhead is amortised before the
    measurement window.
  - Runs ReadBulk in a tight in-process loop for duration-seconds with
    per-call high-resolution latency capture (Stopwatch in .NET,
    time.Now in Go, std::time::Instant in Rust, time.perf_counter in
    Python, System.nanoTime in Java).
  - Unsubscribes + closes the session, then emits one JSON object with
    the shared schema: { language, durationMs, totalCalls, successfulCalls,
    failedCalls, totalReadResults, cachedReadResults, callsPerSecond,
    latencyMs: { p50, p95, p99, max, mean } }.

The PS driver (scripts/bench-read-bulk.ps1) launches one detached process
per client, waits for all to finish, parses the trailing JSON object from
each stdout, prints a comparison table, and persists the combined report
under artifacts/bench/. Quoting around Java's `gradle --args="..."` is
handled by writing a one-shot .bat that cmd.exe runs; the .NET CLI's
per-call gRPC timeout is auto-scaled to (Duration + Warmup + 30s) so the
channel-wide timeout doesn't cancel the bench mid-loop.

Live 30-second steady-state run against the deployed gateway, all five
clients hitting the same six TestMachine_001..006.TestChangingInt tags:

  client    calls/sec  cached/total    p50 ms  p95 ms  p99 ms  max ms
  dotnet      171.78   30924/30924      3.84   14.06   40.41  542.48
  go          175.46   31590/31590      3.93   13.52   41.26  243.00
  rust        123.26   22188/22188      5.52   15.78   48.11  544.41
  python      145.79   26244/26244      4.86   14.85   41.65  645.84
  java        181.12   32604/32604      3.80   10.59   33.37  344.27

143,550 ReadBulk results across all five clients during the 30s window;
100% were was_cached = true (the worker's cache fast-path never fell
through to the snapshot lifecycle). Aggregate read throughput ~800
calls/sec against five concurrent sessions sharing the same cached tags.

A second variant with bulk-size 20 sustained the same per-client call
rate while delivering 3.3x more values per call (~37,000 cached reads/sec
aggregate across the five concurrent sessions), confirming the linear
per-tag cache lookup inside one call is not a bottleneck at this scale.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Joseph Doherty
2026-05-20 05:17:08 -04:00
parent eaa7093cd6
commit 93633ce99c
6 changed files with 1370 additions and 0 deletions
+213
View File
@@ -231,6 +231,32 @@ enum Command {
#[arg(long)]
json: bool,
},
/// Cross-language ReadBulk stress benchmark. Opens its own session,
/// subscribes to bulk-size tags, then hammers ReadBulk in a tight loop
/// for duration-seconds and emits a JSON stats record the
/// scripts/bench-read-bulk.ps1 driver collates across all five clients.
// NOTE: the field notes below are plain `//` comments on purpose; `///` doc
// comments on clap-derive fields would become part of the generated help text.
BenchReadBulk {
// Gateway connection settings, flattened into this subcommand's own flags
// (the same `ConnectionArgs` group the other subcommands use).
#[command(flatten)]
connection: ConnectionArgs,
// Used both as the session name when opening the session and as the name
// passed to `register`.
#[arg(long, default_value = "mxgw-rust-bench")]
client_name: String,
// Length of the measured ReadBulk loop, in seconds.
#[arg(long, default_value_t = 30)]
duration_seconds: u64,
// Length of the unmeasured loop that runs before the measured one, in seconds.
#[arg(long, default_value_t = 3)]
warmup_seconds: u64,
// Number of tags subscribed and read per ReadBulk call; the dispatcher
// rejects 0.
#[arg(long, default_value_t = 6)]
bulk_size: usize,
// Numeric suffix of the first generated tag; later tags count up from it.
#[arg(long, default_value_t = 1)]
tag_start: usize,
// Tag names are built as `<tag_prefix><index, zero-padded to 3>.<tag_attribute>`.
#[arg(long, default_value = "TestMachine_")]
tag_prefix: String,
#[arg(long, default_value = "TestChangingInt")]
tag_attribute: String,
// Timeout value forwarded unchanged to every `read_bulk` call.
#[arg(long, default_value_t = 1500)]
timeout_ms: u32,
// When set, the full JSON stats record is printed; otherwise only the
// calls-per-second figure is printed.
#[arg(long)]
json: bool,
},
StreamEvents {
#[command(flatten)]
connection: ConnectionArgs,
@@ -663,6 +689,38 @@ async fn run(cli: Cli) -> Result<(), Error> {
.await?;
print_write_bulk_results("write-secured2-bulk", &results, json);
}
// ReadBulk stress benchmark: validate the one argument that can make the run
// meaningless, then hand everything to `run_bench_read_bulk`, which opens and
// closes its own session.
Command::BenchReadBulk {
connection,
client_name,
duration_seconds,
warmup_seconds,
bulk_size,
tag_start,
tag_prefix,
tag_attribute,
timeout_ms,
json,
} => {
// A bulk size of zero would generate an empty tag list, so fail fast
// before any connection is made.
if bulk_size == 0 {
return Err(Error::InvalidArgument {
name: "bulk-size".to_owned(),
detail: "must be positive".to_owned(),
});
}
run_bench_read_bulk(
connection,
client_name,
duration_seconds,
warmup_seconds,
bulk_size,
tag_start,
tag_prefix,
tag_attribute,
timeout_ms,
json,
)
.await?;
}
Command::StreamEvents {
connection,
session_id,
@@ -936,6 +994,161 @@ async fn session_for(
Ok(client.session(session_id))
}
/// Rust implementation of the cross-language ReadBulk stress benchmark.
///
/// The stats record uses the same JSON field names as the .NET / Go / Python /
/// Java benches so the PS driver can collate all five clients.
///
/// Sequence: connect, open a session named `client_name`, register, subscribe
/// to `bulk_size` generated tags, issue ReadBulk calls for `warmup_seconds`
/// without recording anything, then issue them for `duration_seconds` while
/// timing every call. Afterwards the successful subscriptions are removed, the
/// stats are printed (the full JSON record when `use_json` is set, otherwise
/// only the calls-per-second figure) and the session is closed.
///
/// # Errors
///
/// Propagates a failure to connect, open the session, register or subscribe.
/// Individual ReadBulk failures are only counted; failures while
/// unsubscribing or closing the session are ignored.
#[allow(clippy::too_many_arguments)]
async fn run_bench_read_bulk(
    connection: ConnectionArgs,
    client_name: String,
    duration_seconds: u64,
    warmup_seconds: u64,
    bulk_size: usize,
    tag_start: usize,
    tag_prefix: String,
    tag_attribute: String,
    timeout_ms: u32,
    use_json: bool,
) -> Result<(), Error> {
    use std::time::{Duration, Instant};

    // Copy the endpoint out first: `connect` consumes the connection args and
    // the endpoint is still needed for the stats record.
    let endpoint = connection.endpoint.clone();
    let client = connect(connection).await?;
    let open_request = OpenSessionRequest {
        client_session_name: client_name.clone(),
        ..OpenSessionRequest::default()
    };
    let session = client.open_session(open_request).await?;

    // One tag per slot: <prefix><index zero-padded to 3 digits>.<attribute>.
    let tag_names: Vec<String> = (0..bulk_size)
        .map(|offset| {
            let index = tag_start + offset;
            format!("{tag_prefix}{index:03}.{tag_attribute}")
        })
        .collect();

    // Everything that can bail out with `?` lives inside this async block, so
    // the session is closed below regardless of how the block finished.
    let result = async {
        let server_handle = session.register(&client_name).await?;
        let subscriptions = session
            .subscribe_bulk(server_handle, tag_names.clone())
            .await?;

        // Only subscriptions that actually succeeded are unsubscribed later.
        let mut subscribed_handles: Vec<i32> = Vec::new();
        for subscription in subscriptions.iter() {
            if subscription.was_successful {
                subscribed_handles.push(subscription.item_handle);
            }
        }

        // Warmup: same call as the measured loop, outcome deliberately ignored.
        let warmup_end = Instant::now() + Duration::from_secs(warmup_seconds);
        while Instant::now() < warmup_end {
            let _ = session
                .read_bulk(server_handle, tag_names.clone(), timeout_ms)
                .await;
        }

        let mut call_latencies_ms: Vec<f64> = Vec::with_capacity(65_536);
        let (mut reads_total, mut reads_cached) = (0u64, 0u64);
        let (mut calls_ok, mut calls_failed) = (0u64, 0u64);

        let window_start = Instant::now();
        let window_end = window_start + Duration::from_secs(duration_seconds);
        while Instant::now() < window_end {
            let started = Instant::now();
            let reply = session
                .read_bulk(server_handle, tag_names.clone(), timeout_ms)
                .await;
            // Latency is recorded for every call, successful or not.
            call_latencies_ms.push(started.elapsed().as_secs_f64() * 1000.0);

            if let Ok(values) = reply {
                calls_ok += 1;
                for value in &values {
                    reads_total += 1;
                    reads_cached += u64::from(value.was_cached);
                }
            } else {
                calls_failed += 1;
            }
        }
        let window_elapsed = window_start.elapsed();

        // Best-effort cleanup; an unsubscribe failure must not hide the stats.
        if !subscribed_handles.is_empty() {
            let _ = session
                .unsubscribe_bulk(server_handle, subscribed_handles)
                .await;
        }

        let total_calls = calls_ok + calls_failed;
        let elapsed_secs = window_elapsed.as_secs_f64();
        let calls_per_second = if elapsed_secs > 0.0 {
            total_calls as f64 / elapsed_secs
        } else {
            0.0
        };

        let stats = serde_json::json!({
            "language": "rust",
            "command": "bench-read-bulk",
            "endpoint": endpoint,
            "clientName": client_name,
            "bulkSize": bulk_size,
            "durationSeconds": duration_seconds,
            "warmupSeconds": warmup_seconds,
            "durationMs": window_elapsed.as_millis() as u64,
            "tags": tag_names,
            "totalCalls": total_calls,
            "successfulCalls": calls_ok,
            "failedCalls": calls_failed,
            "totalReadResults": reads_total,
            "cachedReadResults": reads_cached,
            "callsPerSecond": round_to(calls_per_second, 2),
            "latencyMs": percentile_summary(&call_latencies_ms),
        });

        if use_json {
            println!("{stats}");
        } else {
            println!("{calls_per_second}");
        }
        Ok::<(), Error>(())
    }
    .await;

    // Close errors are ignored so the bench outcome is what the caller sees.
    let _ = session.close().await;
    result
}
/// Summarises a latency sample as a JSON object with the keys `p50`, `p95`,
/// `p99`, `max` and `mean`.
///
/// The input does not need to be sorted; a sorted copy is made internally.
/// Every figure is rounded to three decimals via `round_to`. An empty sample
/// yields an object with all five values set to `0.0`.
fn percentile_summary(sample: &[f64]) -> serde_json::Value {
    if sample.is_empty() {
        return serde_json::json!({ "p50": 0.0, "p95": 0.0, "p99": 0.0, "max": 0.0, "mean": 0.0 });
    }
    let mut sorted = sample.to_vec();
    // `f64::total_cmp` is a real total order even when NaN is present. The
    // previous `partial_cmp(..).unwrap_or(Equal)` comparator was not, and the
    // slice sort functions may panic or leave the order unspecified when
    // handed a comparator that is not a total order. Values that compare
    // equal under `total_cmp` are bit-identical, so an unstable sort is safe.
    sorted.sort_unstable_by(f64::total_cmp);
    // Non-empty is guaranteed by the guard above, so the last index exists.
    let max = sorted[sorted.len() - 1];
    let mean = sample.iter().sum::<f64>() / sample.len() as f64;
    serde_json::json!({
        "p50": round_to(percentile(&sorted, 0.50), 3),
        "p95": round_to(percentile(&sorted, 0.95), 3),
        "p99": round_to(percentile(&sorted, 0.99), 3),
        "max": round_to(max, 3),
        "mean": round_to(mean, 3),
    })
}
/// Returns the value at `quantile` of an ascending-sorted slice, linearly
/// interpolating between the two nearest ranks.
///
/// `quantile` is a fraction where `0.0` selects the first element and `1.0`
/// the last. Values outside that range are clamped, so an out-of-range
/// request returns the minimum or maximum instead of indexing past the end
/// of the slice or extrapolating below the first element. A NaN `quantile`
/// yields NaN.
///
/// An empty slice yields `0.0`; a single-element slice yields that element.
/// The caller is responsible for passing data that is already sorted.
fn percentile(sorted: &[f64], quantile: f64) -> f64 {
    let last = match sorted.len() {
        0 => return 0.0,
        1 => return sorted[0],
        len => len - 1,
    };
    // After clamping, `rank` lies in [0, last], which keeps `lower` in bounds.
    let rank = quantile.clamp(0.0, 1.0) * last as f64;
    let lower = rank.floor() as usize;
    // When `rank` lands exactly on the last element there is no upper
    // neighbour; reuse the last index so the interpolation term is zero.
    let upper = (lower + 1).min(last);
    let fraction = rank - lower as f64;
    sorted[lower] + (sorted[upper] - sorted[lower]) * fraction
}
/// Rounds `value` to `digits` decimal places (halves round away from zero).
///
/// If scaling by `10^digits` leaves the finite range — because `value` is
/// very large, `digits` is very large, or `value` is already NaN/infinite —
/// the input is returned unchanged instead of degrading into infinity or NaN.
fn round_to(value: f64, digits: u32) -> f64 {
    // Saturate rather than wrap: a plain `as i32` turns huge digit counts
    // into a negative exponent, which would scale the value towards zero.
    let exponent = digits.min(i32::MAX as u32) as i32;
    let shift = 10f64.powi(exponent);
    let scaled = value * shift;
    if !scaled.is_finite() {
        // Rounding is not representable at this magnitude/precision; the
        // value as given is already the closest answer.
        return value;
    }
    scaled.round() / shift
}
fn print_version(use_json: bool) {
if use_json {
println!("{}", version_json());