Cross-language ReadBulk stress benchmark

Adds a bench-read-bulk subcommand to every client CLI (.NET, Go, Rust, Python, Java) and a PowerShell driver that runs all five concurrently against the deployed gateway and prints a side-by-side comparison.

Each CLI's bench:
- Opens its own session, registers, and subscribes to bulk-size tags so the worker's MxAccessValueCache populates from real OnDataChange events.
- Runs a warmup-seconds-long pre-loop with identical calls so JIT / connection-pool / first-call overhead is amortised before the measurement window.
- Runs ReadBulk in a tight in-process loop for duration-seconds with per-call high-resolution latency capture (Stopwatch in .NET, time.Now in Go, std::time::Instant in Rust, time.perf_counter in Python, System.nanoTime in Java).
- Unsubscribes and closes the session, then emits one JSON object with the shared schema: { language, durationMs, totalCalls, successfulCalls, failedCalls, totalReadResults, cachedReadResults, callsPerSecond, latencyMs: { p50, p95, p99, max, mean } }.

The PS driver (scripts/bench-read-bulk.ps1) launches one detached process per client, waits for all to finish, parses the trailing JSON object from each stdout, prints a comparison table, and persists the combined report under artifacts/bench/. Quoting around Java's `gradle --args="..."` is handled by writing a one-shot .bat that cmd.exe runs; the .NET CLI's per-call gRPC timeout is auto-scaled to (Duration + Warmup + 30s) so the channel-wide timeout doesn't cancel the bench mid-loop.
Live 30-second steady-state run against the deployed gateway, all five clients hitting the same six TestMachine_001..006.TestChangingInt tags:

client   calls/sec   cached/total   p50 ms   p95 ms   p99 ms   max ms
dotnet   171.78      30924/30924    3.84     14.06    40.41    542.48
go       175.46      31590/31590    3.93     13.52    41.26    243.00
rust     123.26      22188/22188    5.52     15.78    48.11    544.41
python   145.79      26244/26244    4.86     14.85    41.65    645.84
java     181.12      32604/32604    3.80     10.59    33.37    344.27

143,550 ReadBulk results across all five clients during the 30s window; 100% had was_cached = true (the worker's cache fast-path never fell through to the snapshot lifecycle). Aggregate read throughput was ~800 calls/sec against five concurrent sessions sharing the same cached tags.

A second variant with bulk-size 20 sustained the same per-client call rate while delivering 3.3x more values per call (~37,000 cached reads/sec aggregate across the five concurrent sessions), confirming the linear per-tag cache lookup inside one call is not a bottleneck at this scale.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-20 05:17:08 -04:00
parent eaa7093cd6
commit 93633ce99c
6 changed files with 1370 additions and 0 deletions
@@ -270,6 +270,29 @@ def write_secured2_bulk(**kwargs: Any) -> None:
    _run(_write_secured2_bulk(**kwargs), output_json=kwargs["output_json"], secrets=_secrets(kwargs))


+@main.command("bench-read-bulk")
+@gateway_options
+@click.option(
+    "--client-name",
+    default="mxgw-python-bench",
+    show_default=True,
+    help="Name used for the gateway session and for registration.",
+)
+@click.option(
+    "--duration-seconds",
+    default=30,
+    type=int,
+    show_default=True,
+    help="Length of the measured ReadBulk loop, in seconds.",
+)
+@click.option(
+    "--warmup-seconds",
+    default=3,
+    type=int,
+    show_default=True,
+    help="Length of the unmeasured warm-up loop that runs first, in seconds.",
+)
+@click.option(
+    "--bulk-size",
+    default=6,
+    type=int,
+    show_default=True,
+    help="Number of consecutively numbered tags read by each ReadBulk call.",
+)
+@click.option(
+    "--tag-start",
+    default=1,
+    type=int,
+    show_default=True,
+    help="Number of the first tag; numbers are zero-padded to three digits.",
+)
+@click.option(
+    "--tag-prefix",
+    default="TestMachine_",
+    show_default=True,
+    help="Text placed before the tag number.",
+)
+@click.option(
+    "--tag-attribute",
+    default="TestChangingInt",
+    show_default=True,
+    help="Attribute name appended after the tag number and a dot.",
+)
+@click.option(
+    "--timeout-ms",
+    default=1500,
+    type=int,
+    show_default=True,
+    help="Value passed as timeout_ms to every ReadBulk call.",
+)
+@click.option("--json", "output_json", is_flag=True, help="Emit JSON output.")
+def bench_read_bulk(**kwargs: Any) -> None:
+    """Cross-language ReadBulk stress benchmark.
+
+    Opens its own session, subscribes to bulk-size tags so the worker value
+    cache populates from real OnDataChange events, runs ReadBulk in a tight
+    loop for duration-seconds, and emits the shared JSON stats schema the
+    scripts/bench-read-bulk.ps1 driver collates across all five clients.
+    """
+
+    # The docstring above is what click prints for --help, so it is kept
+    # verbatim. The command only builds the benchmark coroutine and hands it,
+    # the --json flag, and the secrets derived from the options to _run.
+    _run(_bench_read_bulk(**kwargs), output_json=kwargs["output_json"], secrets=_secrets(kwargs))
+
+
@main.command("stream-events")
@gateway_options
@click.option("--session-id", required=True, help="Gateway session id.")
@@ -538,6 +561,119 @@ async def _write_secured2_bulk(**kwargs: Any) -> dict[str, Any]:
        return {"results": [_message_dict(result) for result in results]}


+async def _bench_read_bulk(**kwargs: Any) -> dict[str, Any]:
+    """Run the ReadBulk stress benchmark and return its statistics.
+
+    Connects, opens a session, registers, subscribes to the generated tags,
+    issues ReadBulk calls for an unmeasured warm-up window and then for the
+    measured window, and finally unsubscribes and closes the session. The
+    original author states the result schema matches the .NET / Go / Rust /
+    Java clients.
+
+    Args:
+        **kwargs: Parsed CLI options. Reads ``bulk_size``,
+            ``duration_seconds``, ``warmup_seconds``, ``tag_start``,
+            ``tag_prefix``, ``tag_attribute``, ``timeout_ms``,
+            ``client_name`` and (optionally) ``endpoint``; the whole mapping
+            is also passed to ``_connect``.
+
+    Returns:
+        A dict with the run parameters (``language``, ``command``,
+        ``endpoint``, ``clientName``, ``bulkSize``, ``durationSeconds``,
+        ``warmupSeconds``, ``tags``) and the measurements (``durationMs``,
+        ``totalCalls``, ``successfulCalls``, ``failedCalls``,
+        ``totalReadResults``, ``cachedReadResults``, ``callsPerSecond``,
+        ``latencyMs``).
+
+    Raises:
+        click.UsageError: If ``bulk_size`` or ``duration_seconds`` is less
+            than 1, or ``warmup_seconds`` is negative.
+        Exception: Anything raised while connecting, registering,
+            subscribing, or during the warm-up loop propagates after the
+            best-effort cleanup has run. Only calls inside the measured loop
+            are caught and counted as failures.
+    """
+    import time
+
+    bulk_size = int(kwargs["bulk_size"])
+    if bulk_size < 1:
+        raise click.UsageError("bulk-size must be positive")
+    duration_seconds = int(kwargs["duration_seconds"])
+    # A zero or negative window would skip the measured loop entirely and
+    # emit an all-zero report that looks like a real result.
+    if duration_seconds < 1:
+        raise click.UsageError("duration-seconds must be positive")
+    warmup_seconds = int(kwargs["warmup_seconds"])
+    # Zero is allowed (no warm-up); a negative value is a typo, not a request.
+    if warmup_seconds < 0:
+        raise click.UsageError("warmup-seconds must not be negative")
+    tag_start = int(kwargs["tag_start"])
+    tag_prefix = kwargs["tag_prefix"]
+    tag_attribute = kwargs["tag_attribute"]
+    timeout_ms = int(kwargs["timeout_ms"])
+    client_name = kwargs["client_name"]
+    # e.g. TestMachine_001.TestChangingInt .. TestMachine_006.TestChangingInt
+    tags = [f"{tag_prefix}{i:03d}.{tag_attribute}" for i in range(tag_start, tag_start + bulk_size)]
+
+    async with await _connect(kwargs) as client:
+        session = await client.open_session(client_session_name=client_name)
+        # Initialised before the try so the finally block can always refer to
+        # them, even when register / subscribe fails.
+        server_handle = 0
+        item_handles: list[int] = []
+        try:
+            server_handle = await session.register(client_name)
+            subscribe_results = await session.subscribe_bulk(server_handle, tags)
+            # Only successfully subscribed items need unsubscribing later.
+            item_handles = [r.item_handle for r in subscribe_results if r.was_successful]
+
+            # Warm-up window so JIT / connection pool / first-call costs are
+            # amortised before the measurement window opens. Errors here are
+            # not caught: a failing warm-up aborts the whole run.
+            warmup_deadline = time.perf_counter() + warmup_seconds
+            while time.perf_counter() < warmup_deadline:
+                await session.read_bulk(server_handle, tags, timeout_ms=timeout_ms)
+
+            latencies_ms: list[float] = []
+            total_results = 0
+            cached_results = 0
+            successful = 0
+            failed = 0
+            steady_start = time.perf_counter()
+            steady_deadline = steady_start + duration_seconds
+            while time.perf_counter() < steady_deadline:
+                call_start = time.perf_counter()
+                try:
+                    results = await session.read_bulk(server_handle, tags, timeout_ms=timeout_ms)
+                except Exception:
+                    # Failed calls are counted and their latency is still
+                    # recorded, so they contribute to the percentiles.
+                    failed += 1
+                    latencies_ms.append((time.perf_counter() - call_start) * 1000.0)
+                    continue
+                latencies_ms.append((time.perf_counter() - call_start) * 1000.0)
+                successful += 1
+                for r in results:
+                    total_results += 1
+                    if r.was_cached:
+                        cached_results += 1
+            # Use the real elapsed time, not the requested duration: the last
+            # call may finish after the deadline.
+            steady_elapsed = time.perf_counter() - steady_start
+            total_calls = successful + failed
+            calls_per_second = total_calls / steady_elapsed if steady_elapsed > 0 else 0.0
+        finally:
+            # Best-effort cleanup: a failure to unsubscribe or close must not
+            # mask the benchmark result or the original exception.
+            if item_handles:
+                try:
+                    await session.unsubscribe_bulk(server_handle, item_handles)
+                except Exception:
+                    pass
+            try:
+                await session.close()
+            except Exception:
+                pass
+
+        return {
+            "language": "python",
+            "command": "bench-read-bulk",
+            "endpoint": kwargs.get("endpoint"),
+            "clientName": client_name,
+            "bulkSize": bulk_size,
+            "durationSeconds": duration_seconds,
+            "warmupSeconds": warmup_seconds,
+            "durationMs": int(steady_elapsed * 1000),
+            "tags": tags,
+            "totalCalls": total_calls,
+            "successfulCalls": successful,
+            "failedCalls": failed,
+            "totalReadResults": total_results,
+            "cachedReadResults": cached_results,
+            "callsPerSecond": round(calls_per_second, 2),
+            "latencyMs": _percentile_summary(latencies_ms),
+        }
+
+
+def _percentile_summary(sample: list[float]) -> dict[str, float]:
+    """Summarise latency samples as p50 / p95 / p99 / max / mean.
+
+    Every value is rounded to three decimal places. An empty sample yields
+    0.0 for every key instead of raising.
+    """
+    if not sample:
+        return dict.fromkeys(("p50", "p95", "p99", "max", "mean"), 0.0)
+
+    ordered = sorted(sample)
+    quantiles = (("p50", 0.50), ("p95", 0.95), ("p99", 0.99))
+    # Insertion order fixes the key order of the emitted JSON object.
+    summary = {label: round(_percentile(ordered, q), 3) for label, q in quantiles}
+    summary["max"] = round(ordered[-1], 3)
+    summary["mean"] = round(sum(sample) / len(sample), 3)
+    return summary
+
+
+def _percentile(sorted_sample: list[float], quantile: float) -> float:
+    """Return the given quantile of an ascending-sorted sample.
+
+    Uses linear interpolation between the two closest ranks: the fractional
+    rank is ``quantile * (n - 1)`` and the result blends the elements on
+    either side of it. The original author states this matches the other
+    language clients.
+
+    Args:
+        sorted_sample: Values already sorted ascending; not re-sorted here.
+        quantile: Fraction in ``[0, 1]`` (0.5 is the median).
+
+    Returns:
+        The interpolated value; 0.0 for an empty sample and the sole element
+        for a single-element sample.
+
+    Raises:
+        ValueError: If ``quantile`` is outside ``[0, 1]`` (NaN included).
+    """
+    # Without this guard a quantile above 1 indexes past the end and a
+    # negative one silently reads from the tail via negative indexing.
+    if not 0.0 <= quantile <= 1.0:
+        raise ValueError(f"quantile must be within [0, 1], got {quantile!r}")
+    n = len(sorted_sample)
+    if n == 0:
+        return 0.0
+    if n == 1:
+        return sorted_sample[0]
+    rank = quantile * (n - 1)
+    lower = int(rank)
+    # Clamp so quantile == 1.0 does not step past the last element.
+    upper = min(lower + 1, n - 1)
+    fraction = rank - lower
+    return sorted_sample[lower] + (sorted_sample[upper] - sorted_sample[lower]) * fraction
+
+
 async def _stream_events(**kwargs: Any) -> dict[str, Any]:
    async with await _connect(kwargs) as client:
        session = _session(client, kwargs["session_id"])