mxaccess/rust/crates/mxaccess-codec/benches/alloc_count.rs

//! F38 — counting-allocator bench for `mxaccess-codec`.
//!
//! Measures allocation count + bytes-allocated for the proven
//! encode/decode matrix per `design/70-risks-and-open-questions.md`
//! R12 (< 5 allocs per write at steady state). The harness wraps the
//! global allocator with a [`CountingAllocator`] that tracks
//! per-call counts; each scenario records pre-state, runs N
//! iterations, and reports `(alloc_count, bytes_allocated) / N`.
//!
//! Output is the source of truth for `design/M6-bench-baseline.md`.
//!
//! ## Why hand-rolled (not `dhat` / `criterion`)
//!
//! - `dhat` is heap-profiling oriented (snapshots, call-stack
//!   attribution); for "did this op allocate < 5 times?" the simpler
//!   approach is a thin `GlobalAlloc` wrapper that increments two
//!   atomics. No call-stack capture, no JSON output to post-process.
//! - `criterion` measures wall-clock latency; per `60-roadmap.md:104`,
//!   latency is reported but not gating in V1. Allocation count IS
//!   the gating metric for M6 DoD bullet 3.
//!
//! ## Run
//!
//! ```text
//! cargo bench -p mxaccess-codec
//! ```
//!
//! Each scenario runs in release mode by default (cargo bench
//! profile = `bench` which inherits release).

#![allow(
    clippy::unwrap_used,
    clippy::expect_used,
    clippy::cast_possible_truncation,
    clippy::cast_sign_loss
)]

use std::alloc::{GlobalAlloc, Layout, System};
use std::sync::atomic::{AtomicU64, Ordering};

use bytes::BytesMut;
use mxaccess_codec::{
    write_message, write_message::WriteValue, MxReferenceHandle, NmxSubscriptionMessage,
};

// ---- counting allocator -------------------------------------------------

/// Zero-sized stand-in for the process allocator: each request is tallied
/// in the statics below and then served by [`System`].
struct CountingAllocator;

/// Number of `alloc` calls seen so far.
static ALLOC_COUNT: AtomicU64 = AtomicU64::new(0);
/// Sum of `Layout::size()` over every `alloc` call seen so far.
static ALLOC_BYTES: AtomicU64 = AtomicU64::new(0);
/// Number of `dealloc` calls seen so far.
static DEALLOC_COUNT: AtomicU64 = AtomicU64::new(0);

impl CountingAllocator {
    /// Tally one allocation request of `size` bytes.
    fn record_alloc(size: usize) {
        ALLOC_COUNT.fetch_add(1, Ordering::Relaxed);
        ALLOC_BYTES.fetch_add(size as u64, Ordering::Relaxed);
    }

    /// Tally one deallocation.
    fn record_dealloc() {
        DEALLOC_COUNT.fetch_add(1, Ordering::Relaxed);
    }
}

// Only `alloc` and `dealloc` are overridden; `alloc_zeroed` and `realloc`
// use the implementations the trait provides.
unsafe impl GlobalAlloc for CountingAllocator {
    unsafe fn alloc(&self, layout: Layout) -> *mut u8 {
        // The request is counted before the outcome is known.
        Self::record_alloc(layout.size());
        // SAFETY: the caller's `layout` is handed to `System` untouched.
        unsafe { System.alloc(layout) }
    }

    unsafe fn dealloc(&self, ptr: *mut u8, layout: Layout) {
        Self::record_dealloc();
        // SAFETY: `ptr` and `layout` reach `System` exactly as received.
        unsafe { System.dealloc(ptr, layout) }
    }
}

#[global_allocator]
static GLOBAL: CountingAllocator = CountingAllocator;

// ---- scenario harness ---------------------------------------------------

/// Point-in-time copy of the three allocator counters, as filled in by
/// `snapshot()`.
#[derive(Debug, Clone, Copy)]
struct Snapshot {
    /// Value of `ALLOC_COUNT` when the snapshot was taken.
    allocs: u64,
    /// Value of `ALLOC_BYTES` when the snapshot was taken.
    bytes: u64,
    /// Value of `DEALLOC_COUNT` when the snapshot was taken.
    deallocs: u64,
}

fn snapshot() -> Snapshot {
    Snapshot {
        allocs: ALLOC_COUNT.load(Ordering::Relaxed),
        bytes: ALLOC_BYTES.load(Ordering::Relaxed),
        deallocs: DEALLOC_COUNT.load(Ordering::Relaxed),
    }
}

/// Turn two counter snapshots into per-iteration averages.
///
/// Returns `(allocs, bytes, deallocs)`, each computed as
/// `(end - start) / iterations` in `f64`. `end` is expected to have been
/// taken after `start`, so every counter in it is at least as large.
fn diff(start: Snapshot, end: Snapshot, iterations: u64) -> (f64, f64, f64) {
    let divisor = iterations as f64;
    let per_op = |before: u64, after: u64| (after - before) as f64 / divisor;

    let allocs = per_op(start.allocs, end.allocs);
    let bytes = per_op(start.bytes, end.bytes);
    let deallocs = per_op(start.deallocs, end.deallocs);
    (allocs, bytes, deallocs)
}

/// Warm `op` up, then run it `iterations` times between two counter
/// snapshots and report the per-iteration averages as a [`Row`] labelled
/// `name`.
///
/// `measure` does nothing itself to stop the optimiser from discarding the
/// work; the closures passed in route their results through
/// `std::hint::black_box` for that.
fn measure<F>(name: &str, iterations: u64, mut op: F) -> Row
where
    F: FnMut(),
{
    // 1024 unmeasured calls so one-time setup does not land in the counts.
    (0..1024).for_each(|_| op());

    let before = snapshot();
    (0..iterations).for_each(|_| op());
    let after = snapshot();

    // The label is copied only after the closing snapshot, so building the
    // row never shows up in the measured counts.
    let (allocs_per_op, bytes_per_op, deallocs_per_op) = diff(before, after, iterations);
    Row {
        name: name.to_owned(),
        iterations,
        allocs_per_op,
        bytes_per_op,
        deallocs_per_op,
    }
}

/// One line of the results table: a scenario label plus the per-iteration
/// averages produced by `measure`.
struct Row {
    /// Scenario label; `main` also matches on its prefix to pick the rows
    /// that are gated.
    name: String,
    /// Number of measured iterations (warm-up calls are not included).
    iterations: u64,
    /// Average number of `alloc` calls per measured iteration.
    allocs_per_op: f64,
    /// Average number of bytes requested per measured iteration.
    bytes_per_op: f64,
    /// Average number of `dealloc` calls per measured iteration.
    deallocs_per_op: f64,
}

/// Print `rows` to stdout as a pipe-delimited table with a header and a
/// dashed separator row, framed by one blank line above and below.
///
/// Format widths are minimums, not limits, so the layout is sized to its
/// content: the scenario column is 40 characters or the longest
/// `Row::name`, whichever is larger, and the last column is as wide as its
/// `deallocs/op` header. With fixed 40/10 widths, longer names and that
/// 11-character header pushed their own lines out of alignment with the
/// rest of the table.
fn print_table(rows: &[Row]) {
    const MIN_NAME_WIDTH: usize = 40;
    const NUM_WIDTH: usize = 10;
    const LAST_HEADER: &str = "deallocs/op";

    // Padding is counted in characters, so measure the names the same way.
    let name_width = rows
        .iter()
        .map(|row| row.name.chars().count())
        .max()
        .unwrap_or(0)
        .max(MIN_NAME_WIDTH);
    let last_width = LAST_HEADER.len().max(NUM_WIDTH);

    println!();
    println!(
        "| {:nw$} | {:>cw$} | {:>cw$} | {:>cw$} | {:>lw$} |",
        "scenario",
        "iters",
        "allocs/op",
        "bytes/op",
        LAST_HEADER,
        nw = name_width,
        cw = NUM_WIDTH,
        lw = last_width
    );
    println!(
        "| {:nw$} | {:>cw$} | {:>cw$} | {:>cw$} | {:>lw$} |",
        "-".repeat(name_width),
        "-".repeat(NUM_WIDTH),
        "-".repeat(NUM_WIDTH),
        "-".repeat(NUM_WIDTH),
        "-".repeat(last_width),
        nw = name_width,
        cw = NUM_WIDTH,
        lw = last_width
    );
    for row in rows {
        println!(
            "| {:nw$} | {:>cw$} | {:>cw$.2} | {:>cw$.0} | {:>lw$.2} |",
            row.name,
            row.iterations,
            row.allocs_per_op,
            row.bytes_per_op,
            row.deallocs_per_op,
            nw = name_width,
            cw = NUM_WIDTH,
            lw = last_width
        );
    }
    println!();
}

// ---- scenarios ----------------------------------------------------------

/// Build the fixed `TestObject` / `TestAttr` reference handle that the
/// write scenarios encode against.
///
/// # Panics
///
/// Panics if `MxReferenceHandle::from_names` does not produce a handle for
/// these inputs.
fn make_handle() -> MxReferenceHandle {
    let built =
        MxReferenceHandle::from_names(0, 1, 2, 3, "TestObject", 0, 1, 0, "TestAttr", false);
    built.expect("handle")
}

/// Scenario: `write_message::encode` of `WriteValue::Int32(42)` against the
/// shared test handle, 10 000 measured iterations.
fn bench_write_int32() -> Row {
    let target = make_handle();
    let payload = WriteValue::Int32(42);
    let encode_once = || {
        // `black_box` keeps the encoded result observable to the optimiser.
        std::hint::black_box(write_message::encode(&target, &payload, 0, 0).unwrap());
    };
    measure("write_message::encode (Int32)", 10_000, encode_once)
}

/// Scenario: `write_message::encode` of `WriteValue::Float32(1.5)` against
/// the shared test handle, 10 000 measured iterations.
fn bench_write_float() -> Row {
    let target = make_handle();
    let payload = WriteValue::Float32(1.5);
    let encode_once = || {
        // `black_box` keeps the encoded result observable to the optimiser.
        std::hint::black_box(write_message::encode(&target, &payload, 0, 0).unwrap());
    };
    measure("write_message::encode (Float32)", 10_000, encode_once)
}

/// Scenario: `write_message::encode` of `WriteValue::Float64(3.25)` against
/// the shared test handle, 10 000 measured iterations.
fn bench_write_double() -> Row {
    let target = make_handle();
    let payload = WriteValue::Float64(3.25);
    let encode_once = || {
        // `black_box` keeps the encoded result observable to the optimiser.
        std::hint::black_box(write_message::encode(&target, &payload, 0, 0).unwrap());
    };
    measure("write_message::encode (Float64)", 10_000, encode_once)
}

/// Scenario: `write_message::encode` of `WriteValue::Boolean(true)` against
/// the shared test handle, 10 000 measured iterations.
fn bench_write_bool() -> Row {
    let target = make_handle();
    let payload = WriteValue::Boolean(true);
    let encode_once = || {
        // `black_box` keeps the encoded result observable to the optimiser.
        std::hint::black_box(write_message::encode(&target, &payload, 0, 0).unwrap());
    };
    measure("write_message::encode (Boolean)", 10_000, encode_once)
}

/// Scenario: `write_message::encode` of a five-character
/// `WriteValue::String` against the shared test handle, 10 000 measured
/// iterations. The string is built once, outside the measured closure.
fn bench_write_string() -> Row {
    let target = make_handle();
    let payload = WriteValue::String(String::from("hello"));
    let encode_once = || {
        // `black_box` keeps the encoded result observable to the optimiser.
        std::hint::black_box(write_message::encode(&target, &payload, 0, 0).unwrap());
    };
    measure(
        "write_message::encode (String, 5 chars)",
        10_000,
        encode_once,
    )
}

// F52.1 — `BytesMut` output. Same alloc count as `encode`; the benefit is
// downstream zero-copy (consumers can `split_to` / `freeze` without copying
// the body bytes).
/// Scenario: `write_message::encode_to_bytes_mut` of `WriteValue::Int32(42)`
/// against the shared test handle, 10 000 measured iterations.
fn bench_write_int32_bytes_mut() -> Row {
    let target = make_handle();
    let payload = WriteValue::Int32(42);
    let encode_once = || {
        // `black_box` keeps the encoded result observable to the optimiser.
        std::hint::black_box(
            write_message::encode_to_bytes_mut(&target, &payload, 0, 0).unwrap(),
        );
    };
    measure(
        "write_message::encode_to_bytes_mut (Int32)",
        10_000,
        encode_once,
    )
}

// F52.3 — session-level scratch buffer. The caller supplies a `BytesMut`
// that is cleared and resized in place, so the body allocation is amortised
// across a session's writes. Drops the per-write count from 2 → 1 for
// fixed-width scalars (the remaining alloc is the per-value scratch buffer
// inside `encode_scalar_value`) and 1 → 0 for Boolean (no scalar scratch).
/// Scenario: `write_message::encode_into_bytes_mut` of
/// `WriteValue::Int32(42)` into one `BytesMut` that is created once and
/// reused by every call, 10 000 measured iterations.
fn bench_write_int32_into_pooled() -> Row {
    let target = make_handle();
    let payload = WriteValue::Int32(42);
    let mut scratch = BytesMut::new();
    let encode_once = || {
        write_message::encode_into_bytes_mut(&target, &payload, 0, 0, &mut scratch).unwrap();
        // Only a reference is black-boxed: the buffer must survive the call.
        std::hint::black_box(&scratch);
    };
    measure(
        "write_message::encode_into_bytes_mut (Int32, pooled)",
        10_000,
        encode_once,
    )
}

/// Scenario: `write_message::encode_into_bytes_mut` of
/// `WriteValue::Boolean(true)` into one `BytesMut` that is created once and
/// reused by every call, 10 000 measured iterations.
fn bench_write_bool_into_pooled() -> Row {
    let target = make_handle();
    let payload = WriteValue::Boolean(true);
    let mut scratch = BytesMut::new();
    let encode_once = || {
        write_message::encode_into_bytes_mut(&target, &payload, 0, 0, &mut scratch).unwrap();
        // Only a reference is black-boxed: the buffer must survive the call.
        std::hint::black_box(&scratch);
    };
    measure(
        "write_message::encode_into_bytes_mut (Boolean, pooled)",
        10_000,
        encode_once,
    )
}

/// Scenario: `NmxSubscriptionMessage::parse_inner` over a synthetic
/// single-record DataUpdate body, 10 000 measured iterations.
fn bench_subscription_decode() -> Row {
    // The body is assembled once up front; only the parse is measured.
    let wire_body = build_data_update_int32_body(42);
    let decode_once = || {
        // `black_box` keeps the parsed message observable to the optimiser.
        std::hint::black_box(NmxSubscriptionMessage::parse_inner(&wire_body).unwrap());
    };
    measure(
        "NmxSubscriptionMessage::parse_inner (DataUpdate, Int32)",
        10_000,
        decode_once,
    )
}

/// Scenario: `MxReferenceHandle::from_names` with fixed `TestChildObject` /
/// `TestInt` inputs, 10 000 measured iterations.
fn bench_handle_from_names() -> Row {
    let build_once = || {
        let built = MxReferenceHandle::from_names(
            0,
            1,
            2,
            3,
            "TestChildObject",
            0,
            1,
            0,
            "TestInt",
            false,
        );
        // `black_box` keeps the constructed handle observable to the optimiser.
        std::hint::black_box(built.unwrap());
    };
    measure("MxReferenceHandle::from_names", 10_000, build_once)
}

// ---- helpers (mirror the test fixtures in subscription_message.rs) -----

/// Assemble a synthetic DataUpdate body carrying one Int32 record whose
/// payload is `value`.
///
/// Layout, all integers little-endian:
/// command(1) + version(2) + record_count(4) + operation_id(16), followed
/// by one record of status(4) + quality(2) + filetime(8) + wire_kind(1) +
/// value(4).
fn build_data_update_int32_body(value: i32) -> Vec<u8> {
    const DATA_UPDATE_COMMAND: u8 = 0x33;
    const VERSION: u16 = 1;
    const RECORD_COUNT: i32 = 1;
    // Arbitrary 16-byte operation id for the synthetic body.
    const OPERATION_ID: [u8; 16] = [0x11; 16];
    const STATUS: i32 = 0;
    const QUALITY: u16 = 0x00C0;
    const FILETIME: i64 = 0;
    // wire_kind 0x02 marks an Int32 payload.
    const WIRE_KIND_INT32: u8 = 0x02;
    // 23 header bytes, then the 15-byte record prefix and 4-byte payload.
    const HEADER_LEN: usize = 23;
    const RECORD_LEN: usize = 15 + 4;

    let mut body = Vec::with_capacity(HEADER_LEN + RECORD_LEN);

    // Header.
    body.push(DATA_UPDATE_COMMAND);
    body.extend_from_slice(&VERSION.to_le_bytes());
    body.extend_from_slice(&RECORD_COUNT.to_le_bytes());
    body.extend_from_slice(&OPERATION_ID);

    // Single record.
    body.extend_from_slice(&STATUS.to_le_bytes());
    body.extend_from_slice(&QUALITY.to_le_bytes());
    body.extend_from_slice(&FILETIME.to_le_bytes());
    body.push(WIRE_KIND_INT32);
    body.extend_from_slice(&value.to_le_bytes());

    body
}

// ---- main --------------------------------------------------------------

fn main() {
    println!("F38 — mxaccess-codec allocation-count baseline");
    println!("Counting allocator: thin GlobalAlloc wrapper around System.");
    println!("R12 target: < 5 allocations per write at steady state.");

    let rows = vec![
        bench_write_int32(),
        bench_write_float(),
        bench_write_double(),
        bench_write_bool(),
        bench_write_string(),
        bench_write_int32_bytes_mut(),
        bench_write_int32_into_pooled(),
        bench_write_bool_into_pooled(),
        bench_handle_from_names(),
        bench_subscription_decode(),
    ];

    print_table(&rows);

    // R12 gate: emit a non-zero exit code if any encode-write scenario
    // exceeds the 5-allocs threshold. Decoders are reported but not
    // gated (the sweep below explicitly excludes them).
    let mut violations = 0;
    for row in &rows {
        if row.name.starts_with("write_message::encode") && row.allocs_per_op >= 5.0 {
            eprintln!(
                "R12 violation: {} allocates {:.2}/op (>= 5)",
                row.name, row.allocs_per_op
            );
            violations += 1;
        }
    }
    if violations > 0 {
        std::process::exit(1);
    }
}