Cross-language ReadBulk stress benchmark

Adds a bench-read-bulk subcommand to every client CLI (.NET, Go, Rust,
Python, Java) and a PowerShell driver that runs all five concurrently
against the deployed gateway and prints a side-by-side comparison.

Each CLI''s bench:

  - Opens its own session, registers, subscribes to bulk-size tags so the
    worker''s MxAccessValueCache populates from real OnDataChange events.
  - Runs a warmup-seconds-long pre-loop with identical calls so JIT /
    connection-pool / first-call overhead is amortised before the
    measurement window.
  - Runs ReadBulk in a tight in-process loop for duration-seconds with
    per-call high-resolution latency capture (Stopwatch in .NET,
    time.Now in Go, std::time::Instant in Rust, time.perf_counter in
    Python, System.nanoTime in Java).
  - Unsubscribes + closes the session, then emits one JSON object with
    the shared schema: { language, durationMs, totalCalls, successfulCalls,
    failedCalls, totalReadResults, cachedReadResults, callsPerSecond,
    latencyMs: { p50, p95, p99, max, mean } }.

The PS driver (scripts/bench-read-bulk.ps1) launches one detached process
per client, waits for all to finish, parses the trailing JSON object from
each stdout, prints a comparison table, and persists the combined report
under artifacts/bench/. Quoting around Java''s `gradle --args="..."` is
handled by writing a one-shot .bat that cmd.exe runs; the .NET CLI''s
per-call gRPC timeout is auto-scaled to (Duration + Warmup + 30s) so the
channel-wide timeout doesn''t cancel the bench mid-loop.

Live 30-second steady-state run against the deployed gateway, all five
clients hitting the same six TestMachine_001..006.TestChangingInt tags:

  client    calls/sec  cached/total    p50 ms  p95 ms  p99 ms  max ms
  dotnet      171.78   30924/30924      3.84   14.06   40.41  542.48
  go          175.46   31590/31590      3.93   13.52   41.26  243.00
  rust        123.26   22188/22188      5.52   15.78   48.11  544.41
  python      145.79   26244/26244      4.86   14.85   41.65  645.84
  java        181.12   32604/32604      3.80   10.59   33.37  344.27

143,550 ReadBulk results across all five clients during the 30s window;
100% were was_cached = true (the worker''s cache fast-path never fell
through to the snapshot lifecycle). Aggregate read throughput ~800
calls/sec against five concurrent sessions sharing the same cached tags.

A second variant with bulk-size 20 sustained the same per-client call
rate while delivering 3.3x more values per call (~37,000 cached reads/sec
aggregate across the five concurrent sessions), confirming the linear
per-tag cache lookup inside one call is not a bottleneck at this scale.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Joseph Doherty
2026-05-20 05:17:08 -04:00
parent eaa7093cd6
commit 93633ce99c
6 changed files with 1370 additions and 0 deletions
+189
View File
@@ -14,6 +14,7 @@ import (
"io"
"os"
"os/signal"
"sort"
"strconv"
"strings"
"syscall"
@@ -99,6 +100,8 @@ func runWithIO(ctx context.Context, args []string, stdout, stderr io.Writer) err
return runWriteSecuredBulk(ctx, args[1:], stdout, stderr)
case "write-secured2-bulk":
return runWriteSecured2Bulk(ctx, args[1:], stdout, stderr)
case "bench-read-bulk":
return runBenchReadBulk(ctx, args[1:], stdout, stderr)
case "write":
return runWrite(ctx, args[1:], stdout, stderr)
case "stream-events":
@@ -508,6 +511,192 @@ func runWriteBulkVariant(ctx context.Context, args []string, stdout, stderr io.W
return writeWriteBulkOutput(stdout, *jsonOutput, command, options, results, err)
}
// runBenchReadBulk drives the cross-language ReadBulk stress benchmark from Go:
// opens its own session, subscribes to bulk-size tags so the worker value cache
// populates from real OnDataChange events, runs ReadBulk in a tight loop for
// duration-seconds with per-call timing, and emits the shared JSON schema the
// scripts/bench-read-bulk.ps1 driver collates across all five clients.
func runBenchReadBulk(ctx context.Context, args []string, stdout, stderr io.Writer) error {
flags := flag.NewFlagSet("bench-read-bulk", flag.ContinueOnError)
flags.SetOutput(stderr)
common := bindCommonFlags(flags)
jsonOutput := flags.Bool("json", false, "write JSON output")
clientName := flags.String("client-name", "mxgw-go-bench", "session client name")
durationSeconds := flags.Int("duration-seconds", 30, "steady-state measurement window in seconds")
warmupSeconds := flags.Int("warmup-seconds", 3, "warm-up window before measurement, in seconds")
bulkSize := flags.Int("bulk-size", 6, "tags per ReadBulk call")
tagStart := flags.Int("tag-start", 1, "first machine number")
tagPrefix := flags.String("tag-prefix", "TestMachine_", "tag prefix (machine number appended as %03d)")
tagAttribute := flags.String("tag-attribute", "TestChangingInt", "attribute appended to each tag prefix")
timeoutMs := flags.Int("timeout-ms", 1500, "per-tag snapshot timeout in milliseconds")
if err := flags.Parse(args); err != nil {
return err
}
if *bulkSize < 1 {
return errors.New("bulk-size must be positive")
}
if *durationSeconds < 1 {
return errors.New("duration-seconds must be positive")
}
tags := make([]string, *bulkSize)
for i := 0; i < *bulkSize; i++ {
tags[i] = fmt.Sprintf("%s%03d.%s", *tagPrefix, *tagStart+i, *tagAttribute)
}
client, options, err := dialForCommand(ctx, common)
if err != nil {
return err
}
defer client.Close()
session, err := client.OpenSession(ctx, mxgateway.OpenSessionOptions{ClientSessionName: *clientName})
if err != nil {
return err
}
defer func() {
_, _ = session.Close(context.Background())
}()
serverHandle, err := session.Register(ctx, *clientName)
if err != nil {
return err
}
subscribeResults, err := session.SubscribeBulk(ctx, serverHandle, tags)
if err != nil {
return err
}
itemHandles := make([]int32, 0, len(subscribeResults))
for _, result := range subscribeResults {
if result.GetWasSuccessful() {
itemHandles = append(itemHandles, result.GetItemHandle())
}
}
defer func() {
if len(itemHandles) > 0 {
_, _ = session.UnsubscribeBulk(context.Background(), serverHandle, itemHandles)
}
}()
// Warm-up: drive identical calls so any first-call JIT / connection-pool
// setup is amortised before the measurement window opens.
warmupDeadline := time.Now().Add(time.Duration(*warmupSeconds) * time.Second)
timeout := time.Duration(*timeoutMs) * time.Millisecond
for time.Now().Before(warmupDeadline) {
_, _ = session.ReadBulk(ctx, serverHandle, tags, timeout)
}
// Steady state: per-call latency captured via time.Now() deltas.
latenciesMs := make([]float64, 0, 65536)
var totalReadResults int64
var cachedReadResults int64
var successfulCalls, failedCalls int
steadyStart := time.Now()
steadyDeadline := steadyStart.Add(time.Duration(*durationSeconds) * time.Second)
for time.Now().Before(steadyDeadline) {
callStart := time.Now()
results, err := session.ReadBulk(ctx, serverHandle, tags, timeout)
elapsed := time.Since(callStart)
latenciesMs = append(latenciesMs, float64(elapsed.Nanoseconds())/1e6)
if err != nil {
failedCalls++
continue
}
successfulCalls++
for _, r := range results {
totalReadResults++
if r.GetWasCached() {
cachedReadResults++
}
}
}
steadyElapsed := time.Since(steadyStart)
totalCalls := successfulCalls + failedCalls
callsPerSecond := 0.0
if steadyElapsed.Seconds() > 0 {
callsPerSecond = float64(totalCalls) / steadyElapsed.Seconds()
}
stats := map[string]any{
"language": "go",
"command": "bench-read-bulk",
"endpoint": options.Endpoint,
"clientName": *clientName,
"bulkSize": *bulkSize,
"durationSeconds": *durationSeconds,
"warmupSeconds": *warmupSeconds,
"durationMs": steadyElapsed.Milliseconds(),
"tags": tags,
"totalCalls": totalCalls,
"successfulCalls": successfulCalls,
"failedCalls": failedCalls,
"totalReadResults": totalReadResults,
"cachedReadResults": cachedReadResults,
"callsPerSecond": roundTo(callsPerSecond, 2),
"latencyMs": percentileSummary(latenciesMs),
}
if *jsonOutput {
return writeJSON(stdout, stats)
}
fmt.Fprintln(stdout, callsPerSecond)
return nil
}
// percentileSummary returns the same { p50, p95, p99, max, mean } shape every
// language bench emits, rounded to 3 decimal places so the PowerShell driver
// sees one schema across all five clients.
func percentileSummary(sample []float64) map[string]float64 {
if len(sample) == 0 {
return map[string]float64{"p50": 0, "p95": 0, "p99": 0, "max": 0, "mean": 0}
}
sorted := append([]float64(nil), sample...)
sort.Float64s(sorted)
mean := 0.0
max := sorted[len(sorted)-1]
for _, v := range sample {
mean += v
}
mean /= float64(len(sample))
return map[string]float64{
"p50": roundTo(percentile(sorted, 0.50), 3),
"p95": roundTo(percentile(sorted, 0.95), 3),
"p99": roundTo(percentile(sorted, 0.99), 3),
"max": roundTo(max, 3),
"mean": roundTo(mean, 3),
}
}
// percentile uses nearest-rank with linear interpolation; matches the .NET
// implementation so cross-language comparisons are apples-to-apples.
func percentile(sorted []float64, quantile float64) float64 {
if len(sorted) == 0 {
return 0
}
if len(sorted) == 1 {
return sorted[0]
}
rank := quantile * float64(len(sorted)-1)
lower := int(rank)
upper := lower + 1
if upper >= len(sorted) {
return sorted[lower]
}
fraction := rank - float64(lower)
return sorted[lower] + (sorted[upper]-sorted[lower])*fraction
}
func roundTo(value float64, digits int) float64 {
shift := 1.0
for i := 0; i < digits; i++ {
shift *= 10
}
return float64(int64(value*shift+0.5)) / shift
}
// parseRfc3339Timestamp parses an RFC 3339 timestamp and returns the
// MxValue protobuf representation used for the timestamped write families.
func parseRfc3339Timestamp(text string) (*mxgateway.MxValue, error) {