Cross-language ReadBulk stress benchmark

Adds a bench-read-bulk subcommand to every client CLI (.NET, Go, Rust, Python, Java) and a PowerShell driver that runs all five concurrently against the deployed gateway and prints a side-by-side comparison.

Each CLI's bench:
- Opens its own session, registers, and subscribes to bulk-size tags so the worker's MxAccessValueCache populates from real OnDataChange events.
- Runs a warmup-seconds-long pre-loop with identical calls so JIT / connection-pool / first-call overhead is amortised before the measurement window.
- Runs ReadBulk in a tight in-process loop for duration-seconds with per-call high-resolution latency capture (Stopwatch in .NET, time.Now in Go, std::time::Instant in Rust, time.perf_counter in Python, System.nanoTime in Java).
- Unsubscribes + closes the session, then emits one JSON object with the shared schema: { language, durationMs, totalCalls, successfulCalls, failedCalls, totalReadResults, cachedReadResults, callsPerSecond, latencyMs: { p50, p95, p99, max, mean } }.

The PS driver (scripts/bench-read-bulk.ps1) launches one detached process per client, waits for all to finish, parses the trailing JSON object from each stdout, prints a comparison table, and persists the combined report under artifacts/bench/. Quoting around Java's `gradle --args="..."` is handled by writing a one-shot .bat that cmd.exe runs; the .NET CLI's per-call gRPC timeout is auto-scaled to (Duration + Warmup + 30s) so the channel-wide timeout doesn't cancel the bench mid-loop.
Live 30-second steady-state run against the deployed gateway, all five clients hitting the same six TestMachine_001..006.TestChangingInt tags:

| client | calls/sec | cached/total | p50 ms | p95 ms | p99 ms | max ms |
|--------|-----------|--------------|--------|--------|--------|--------|
| dotnet | 171.78    | 30924/30924  | 3.84   | 14.06  | 40.41  | 542.48 |
| go     | 175.46    | 31590/31590  | 3.93   | 13.52  | 41.26  | 243.00 |
| rust   | 123.26    | 22188/22188  | 5.52   | 15.78  | 48.11  | 544.41 |
| python | 145.79    | 26244/26244  | 4.86   | 14.85  | 41.65  | 645.84 |
| java   | 181.12    | 32604/32604  | 3.80   | 10.59  | 33.37  | 344.27 |

143,550 ReadBulk results across all five clients during the 30s window; 100% had was_cached = true (the worker's cache fast-path never fell through to the snapshot lifecycle). Aggregate read throughput was ~800 calls/sec against five concurrent sessions sharing the same cached tags.

A second variant with bulk-size 20 sustained the same per-client call rate while delivering 3.3x more values per call (~37,000 cached reads/sec aggregate across the five concurrent sessions), confirming the linear per-tag cache lookup inside one call is not a bottleneck at this scale.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-20 05:17:08 -04:00
parent eaa7093cd6
commit 93633ce99c
6 changed files with 1370 additions and 0 deletions
@@ -14,6 +14,7 @@ import (
 	"io"
 	"os"
 	"os/signal"
+	"sort"
 	"strconv"
 	"strings"
 	"syscall"
@@ -99,6 +100,8 @@ func runWithIO(ctx context.Context, args []string, stdout, stderr io.Writer) err
 		return runWriteSecuredBulk(ctx, args[1:], stdout, stderr)
 	case "write-secured2-bulk":
 		return runWriteSecured2Bulk(ctx, args[1:], stdout, stderr)
+	case "bench-read-bulk":
+		return runBenchReadBulk(ctx, args[1:], stdout, stderr)
 	case "write":
 		return runWrite(ctx, args[1:], stdout, stderr)
 	case "stream-events":
@@ -508,6 +511,192 @@ func runWriteBulkVariant(ctx context.Context, args []string, stdout, stderr io.W
 	return writeWriteBulkOutput(stdout, *jsonOutput, command, options, results, err)
 }

+// runBenchReadBulk drives the cross-language ReadBulk stress benchmark from Go.
+//
+// It opens its own session, registers, and subscribes to bulk-size tags, runs
+// a warm-up loop of ReadBulk calls, then runs ReadBulk in a tight loop for
+// duration-seconds with per-call timing. With --json it writes the shared
+// stats object that the scripts/bench-read-bulk.ps1 driver collates across all
+// five clients; otherwise it prints only the calls-per-second figure.
+//
+// The run stops promptly and returns an error when ctx is cancelled or its
+// deadline expires: a truncated run is not comparable, and continuing would
+// only record a stream of immediate failures until the wall-clock deadline.
+// Flag parsing and validation errors, as well as dial / open-session /
+// register / subscribe errors, are returned unchanged.
+func runBenchReadBulk(ctx context.Context, args []string, stdout, stderr io.Writer) error {
+	flags := flag.NewFlagSet("bench-read-bulk", flag.ContinueOnError)
+	flags.SetOutput(stderr)
+	common := bindCommonFlags(flags)
+	jsonOutput := flags.Bool("json", false, "write JSON output")
+	clientName := flags.String("client-name", "mxgw-go-bench", "session client name")
+	durationSeconds := flags.Int("duration-seconds", 30, "steady-state measurement window in seconds")
+	warmupSeconds := flags.Int("warmup-seconds", 3, "warm-up window before measurement, in seconds")
+	bulkSize := flags.Int("bulk-size", 6, "tags per ReadBulk call")
+	tagStart := flags.Int("tag-start", 1, "first machine number")
+	tagPrefix := flags.String("tag-prefix", "TestMachine_", "tag prefix (machine number appended as %03d)")
+	tagAttribute := flags.String("tag-attribute", "TestChangingInt", "attribute appended to each tag prefix")
+	timeoutMs := flags.Int("timeout-ms", 1500, "per-tag snapshot timeout in milliseconds")
+
+	if err := flags.Parse(args); err != nil {
+		return err
+	}
+	if *bulkSize < 1 {
+		return errors.New("bulk-size must be positive")
+	}
+	if *durationSeconds < 1 {
+		return errors.New("duration-seconds must be positive")
+	}
+	if *warmupSeconds < 0 {
+		return errors.New("warmup-seconds must not be negative")
+	}
+
+	// Tag names are <prefix><3-digit machine number>.<attribute>, numbered
+	// consecutively from tag-start.
+	tags := make([]string, *bulkSize)
+	for i := range tags {
+		tags[i] = fmt.Sprintf("%s%03d.%s", *tagPrefix, *tagStart+i, *tagAttribute)
+	}
+
+	client, options, err := dialForCommand(ctx, common)
+	if err != nil {
+		return err
+	}
+	defer client.Close()
+
+	session, err := client.OpenSession(ctx, mxgateway.OpenSessionOptions{ClientSessionName: *clientName})
+	if err != nil {
+		return err
+	}
+	defer func() {
+		// Best-effort cleanup on a fresh context so it still runs after ctx
+		// has been cancelled; there is nothing useful to do with a failure.
+		_, _ = session.Close(context.Background())
+	}()
+
+	serverHandle, err := session.Register(ctx, *clientName)
+	if err != nil {
+		return err
+	}
+
+	subscribeResults, err := session.SubscribeBulk(ctx, serverHandle, tags)
+	if err != nil {
+		return err
+	}
+	itemHandles := make([]int32, 0, len(subscribeResults))
+	for _, result := range subscribeResults {
+		if result.GetWasSuccessful() {
+			itemHandles = append(itemHandles, result.GetItemHandle())
+		}
+	}
+	if failed := len(subscribeResults) - len(itemHandles); failed > 0 {
+		// Keep going, but surface it: a partially subscribed tag set can skew
+		// the cached/total ratio the benchmark reports.
+		fmt.Fprintf(stderr, "warning: %d of %d tags failed to subscribe; reads for them may not be served from cache\n",
+			failed, len(subscribeResults))
+	}
+	// Deferred after session.Close, so it runs first (LIFO) while the session
+	// is still open.
+	defer func() {
+		if len(itemHandles) > 0 {
+			// Best-effort cleanup; see the session.Close defer above.
+			_, _ = session.UnsubscribeBulk(context.Background(), serverHandle, itemHandles)
+		}
+	}()
+
+	// Warm-up: drive identical calls so any first-call / connection-pool
+	// setup is amortised before the measurement window opens.
+	warmupDeadline := time.Now().Add(time.Duration(*warmupSeconds) * time.Second)
+	timeout := time.Duration(*timeoutMs) * time.Millisecond
+	for ctx.Err() == nil && time.Now().Before(warmupDeadline) {
+		// Warm-up results and errors are intentionally discarded; only the
+		// steady-state window below is measured.
+		_, _ = session.ReadBulk(ctx, serverHandle, tags, timeout)
+	}
+	if err := ctx.Err(); err != nil {
+		return fmt.Errorf("bench-read-bulk interrupted: %w", err)
+	}
+
+	// Steady state: per-call latency captured via time.Now() deltas. Failed
+	// calls contribute to the latency sample as well as to failedCalls.
+	latenciesMs := make([]float64, 0, 65536)
+	var totalReadResults int64
+	var cachedReadResults int64
+	var successfulCalls, failedCalls int
+	steadyStart := time.Now()
+	steadyDeadline := steadyStart.Add(time.Duration(*durationSeconds) * time.Second)
+
+	for time.Now().Before(steadyDeadline) {
+		callStart := time.Now()
+		results, err := session.ReadBulk(ctx, serverHandle, tags, timeout)
+		elapsed := time.Since(callStart)
+		if ctx.Err() != nil {
+			// The call was cut short by cancellation; do not record it, and
+			// do not keep spinning on a dead context.
+			break
+		}
+		latenciesMs = append(latenciesMs, float64(elapsed.Nanoseconds())/1e6)
+		if err != nil {
+			failedCalls++
+			continue
+		}
+		successfulCalls++
+		for _, r := range results {
+			totalReadResults++
+			if r.GetWasCached() {
+				cachedReadResults++
+			}
+		}
+	}
+	if err := ctx.Err(); err != nil {
+		return fmt.Errorf("bench-read-bulk interrupted: %w", err)
+	}
+	steadyElapsed := time.Since(steadyStart)
+	totalCalls := successfulCalls + failedCalls
+
+	callsPerSecond := 0.0
+	if steadyElapsed.Seconds() > 0 {
+		callsPerSecond = float64(totalCalls) / steadyElapsed.Seconds()
+	}
+
+	stats := map[string]any{
+		"language":          "go",
+		"command":           "bench-read-bulk",
+		"endpoint":          options.Endpoint,
+		"clientName":        *clientName,
+		"bulkSize":          *bulkSize,
+		"durationSeconds":   *durationSeconds,
+		"warmupSeconds":     *warmupSeconds,
+		"durationMs":        steadyElapsed.Milliseconds(),
+		"tags":              tags,
+		"totalCalls":        totalCalls,
+		"successfulCalls":   successfulCalls,
+		"failedCalls":       failedCalls,
+		"totalReadResults":  totalReadResults,
+		"cachedReadResults": cachedReadResults,
+		"callsPerSecond":    roundTo(callsPerSecond, 2),
+		"latencyMs":         percentileSummary(latenciesMs),
+	}
+	if *jsonOutput {
+		return writeJSON(stdout, stats)
+	}
+	fmt.Fprintln(stdout, callsPerSecond)
+	return nil
+}
+
+// percentileSummary condenses a latency sample into the { p50, p95, p99, max,
+// mean } map that every language bench emits. Each figure is rounded to 3
+// decimal places so the PowerShell driver sees one schema across all five
+// clients. An empty sample yields all zeros. The input slice is not modified.
+func percentileSummary(sample []float64) map[string]float64 {
+	summary := map[string]float64{"p50": 0, "p95": 0, "p99": 0, "max": 0, "mean": 0}
+	count := len(sample)
+	if count == 0 {
+		return summary
+	}
+
+	// Sort a private copy so the caller's slice keeps its original order.
+	ordered := make([]float64, count)
+	copy(ordered, sample)
+	sort.Float64s(ordered)
+
+	// Accumulate in the caller's original order.
+	var total float64
+	for i := 0; i < count; i++ {
+		total += sample[i]
+	}
+	total /= float64(count)
+
+	summary["p50"] = roundTo(percentile(ordered, 0.50), 3)
+	summary["p95"] = roundTo(percentile(ordered, 0.95), 3)
+	summary["p99"] = roundTo(percentile(ordered, 0.99), 3)
+	summary["max"] = roundTo(ordered[count-1], 3)
+	summary["mean"] = roundTo(total, 3)
+	return summary
+}
+
+// percentile returns the value at the given quantile of an ascending-sorted
+// sample, using linear interpolation between the two closest ranks
+// (rank = quantile * (len-1)). The original comment states this matches the
+// .NET implementation so cross-language comparisons are apples-to-apples.
+//
+// Edge cases: an empty sample yields 0 and a single-element sample yields that
+// element. Quantiles at or below 0 yield the minimum and quantiles at or above
+// 1 yield the maximum, so an out-of-range quantile can never index outside the
+// slice. A NaN quantile is returned as NaN rather than being converted to an
+// index.
+func percentile(sorted []float64, quantile float64) float64 {
+	n := len(sorted)
+	if n == 0 {
+		return 0
+	}
+	if n == 1 {
+		return sorted[0]
+	}
+	switch {
+	case quantile != quantile: // NaN: only value that is not equal to itself
+		return quantile
+	case quantile <= 0:
+		return sorted[0]
+	case quantile >= 1:
+		return sorted[n-1]
+	}
+	rank := quantile * float64(n-1)
+	lower := int(rank)
+	if lower >= n-1 {
+		// Defensive: rank is strictly below n-1 here, but never read past the end.
+		return sorted[n-1]
+	}
+	fraction := rank - float64(lower)
+	return sorted[lower] + (sorted[lower+1]-sorted[lower])*fraction
+}
+
+// roundTo rounds value to the given number of decimal digits, rounding halves
+// away from zero for both positive and negative inputs. A digits value of zero
+// or less rounds to a whole number.
+//
+// NaN, infinities, and values whose scaled magnitude does not fit comfortably
+// in an int64 are returned unchanged, because converting them to an integer is
+// not well defined.
+func roundTo(value float64, digits int) float64 {
+	shift := 1.0
+	for i := 0; i < digits; i++ {
+		shift *= 10
+	}
+	scaled := value * shift
+
+	// Stay well inside the int64 range so adding or subtracting 0.5 below
+	// cannot overflow the conversion.
+	const limit = float64(1 << 62)
+	if scaled != scaled || scaled >= limit || scaled <= -limit {
+		return value
+	}
+	if scaled < 0 {
+		// int64() truncates toward zero, so bias negatives downward.
+		return float64(int64(scaled-0.5)) / shift
+	}
+	return float64(int64(scaled+0.5)) / shift
+}
+
 // parseRfc3339Timestamp parses an RFC 3339 timestamp and returns the
 // MxValue protobuf representation used for the timestamped write families.
 func parseRfc3339Timestamp(text string) (*mxgateway.MxValue, error) {