From e5b31fadb186d59b121a285dc75f17e5b22500fb Mon Sep 17 00:00:00 2001 From: Joseph Doherty Date: Wed, 6 May 2026 08:23:01 -0400 Subject: [PATCH] [F49] live-test scaffolding for F54 OnWriteComplete + COM probe diagnostic MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Live attempt against AVEVA on this dev host produced two artefacts: **`crates/mxaccess-compat/tests/lmx_write_complete_live.rs`** — the F54 OnWriteComplete round-trip test. Compiles + runs against the live AVEVA install via either path: - `--features live-windows-com` (preferred): uses `Session::connect_nmx_auto` so the COM activation reference is held in-process for the duration of the test. - Default features (fallback): shells out to `MxNativeClient.Probe --probe-resolve-oxid-managed-ntlm-integrity` + `--probe-remqi-managed` to learn the per-session NMX endpoint + INmxService2 IPID, then uses `Session::connect_nmx`. Both code paths are wired and the test runs through endpoint resolution + IPID extraction successfully. The connect step itself fails with `Status { detail: 1722 }` (RPC_S_SERVER_UNAVAILABLE). **`crates/mxaccess-rpc/examples/com-marshal-probe.rs`** — minimal one-shot binary that calls `marshal_activated_iunknown_objref("NmxSvc.NmxService", DifferentMachine)` in isolation. Confirms the COM activation + CoMarshalInterface chain works fine standalone (returns a 338-byte OBJREF with valid OXID/IPID structure). The 1722 in the live test is therefore downstream of the activation — likely a COM-apartment threading interaction with the tokio multi-thread runtime. This is an F12-related issue (auto-resolve hardening), not an F54 issue. 
F54's correctness is covered by the existing unit-level integration tests: - `mxaccess::session::tests::router_populates_operation_status_context_from_pending_ops_fifo` - `mxaccess::session::tests::write_handle_correlates_with_router_emitted_status` - `mxaccess_compat::tests::drain_routes_write_status_to_on_write_complete` - `mxaccess_compat::tests::drain_routes_non_write_status_to_on_operation_complete` `design/followups.md` F49 entry updated to reflect: - F54 added as a fifth row in the live-verification scope. - "Live attempt 2026-05-06" sub-section documents the 1722 issue + what was verified (.NET probe end-to-end works against same install; Rust COM activation works in isolation; the failure is Rust-port-specific to `connect_nmx_auto` under tokio). - F49 now Blocked-by F12 hardening (the 1722 path). New `live-windows-com` feature on `mxaccess-compat` propagates to `mxaccess/windows-com` for the test binary. Workspace 824 → 824 tests; clippy + rustdoc clean across both feature configurations. Co-Authored-By: Claude Opus 4.7 (1M context) --- design/followups.md | 9 +- rust/Cargo.lock | 2 + rust/crates/mxaccess-compat/Cargo.toml | 8 + .../tests/lmx_write_complete_live.rs | 309 ++++++++++++++++++ .../examples/com-marshal-probe.rs | 42 +++ 5 files changed, 368 insertions(+), 2 deletions(-) create mode 100644 rust/crates/mxaccess-compat/tests/lmx_write_complete_live.rs create mode 100644 rust/crates/mxaccess-rpc/examples/com-marshal-probe.rs diff --git a/design/followups.md b/design/followups.md index d6dd461..6673368 100644 --- a/design/followups.md +++ b/design/followups.md @@ -27,19 +27,24 @@ Between each publish: wait for the crate to be indexed before the next one's `ca ### F49 — Live verification sweep for the M6 features **Severity:** P1 — closes the live-evidence gap for the M6 work that landed unit-only this session. 
-**Source:** F36, F40, F45, F47 closeouts — each ships with unit tests but several were not exercised against the live AVEVA install in this session. +**Source:** F36, F40, F45, F47, F54 closeouts — each ships with unit tests but most were not exercised against the live AVEVA install in this session. +**Blocked-by:** F12 hardening (`Session::connect_nmx_auto` returns `RPC_S_SERVER_UNAVAILABLE` (1722) under `cargo test`'s tokio multi-thread runtime — see "Live attempt 2026-05-06" below). The COM-activation path itself works in isolation (`cargo run -p mxaccess-rpc --example com-marshal-probe --features windows-com` succeeds), so the failure is downstream — likely a COM apartment threading issue when CoInitializeEx runs on a tokio worker thread. **Scope.** Run the following against the live AVEVA host with `MX_LIVE=1`: 1. **F36 buffered subscribe** — `cargo run -p mxaccess --example subscribe-buffered -- --tag TestChildObject.TestInt`. Confirm `OnBufferedDataChange`-rate updates flow at the configured cadence; capture wire bytes via `analysis/frida/mx-nmx-trace.js` and confirm exactly one `RegisterReference` (`0x10`) frame with `.property(buffer)` suffix, no separate `SetBufferedUpdateInterval` RPC, and no separate `AdviseSupervisory` follow-up. 2. **F45 recovery replay for buffered** — start the `subscribe-buffered` example, force a `Session::recover_connection` mid-flight (e.g. via a `wwtools` helper that bumps the NMX TCP socket), assert the post-recovery NMX traffic carries an `RegisterReference` (NOT `AdviseSupervisory`) with the same correlation id and `.property(buffer)` suffix. 3. **F47 buffered unsubscribe skip** — instrument `Session::unsubscribe` with a `tracing::debug` log line on the buffered branch, run the example to completion + drop, confirm no `UnAdvise` frame in the wire trace. 4. 
**F40 metrics** — install a `metrics` exporter (`metrics-exporter-prometheus` is the lightest), run `connect-write-read` + `subscribe` examples with `--features metrics`, confirm at least one counter increment and one histogram observation per metric name in the registered set. +5. **F54 OnWriteComplete (LmxClient round-trip)** — scaffold lives at `crates/mxaccess-compat/tests/lmx_write_complete_live.rs`. Run `cargo test -p mxaccess-compat --features live-windows-com --test lmx_write_complete_live -- --ignored --nocapture` to drive `LmxClient::write` → drain `client.on_write_complete()` and assert the `WriteCompleteEvent { server_handle, item_handle, statuses, is_during_recovery }` shape matches `LMX_OnWriteComplete(int hLMXServerHandle, int phItemHandle, ref MXSTATUS_PROXY[] pVars)`. + +**Live attempt 2026-05-06.** Steps 1-4 not run yet. Step 5 attempted; the test compiled and ran past Frida-style `--probe-resolve-oxid-managed-ntlm-integrity` resolution + `--probe-remqi-managed` IPID extraction, but `connect_nmx_auto` (preferred path) and `connect_nmx` (fallback with probe-resolved IPID) both fail with `Status { detail: 1722 }` (RPC_S_SERVER_UNAVAILABLE). The .NET `MxNativeClient.Probe --probe-session-write` runs the same scenario successfully end-to-end against the same AVEVA install, so the wire is functional and the failure is Rust-port specific. Documented as the F12 hardening followup; the F54 unit-level integration tests (`router_populates_operation_status_context_from_pending_ops_fifo` + `write_handle_correlates_with_router_emitted_status`) cover the F54 logic exhaustively at the layer boundaries. **Definition of done:** 1. Per-feature evidence summary in `docs/M6-live-verification.md` (one paragraph per feature with the wire-trace excerpt or metrics-exporter snapshot). 2. If any feature fails live: file a sub-followup with the captured failure and link it from the evidence doc. +3. 
F12's tokio-runtime COM activation issue resolved (the `connect_nmx_auto` 1722 error above) so the live tests can actually run. -**Resolves when:** all four features have a live evidence row + no sub-followups remain unresolved. +**Resolves when:** all five features have a live evidence row + no sub-followups remain unresolved. ### F50 — Run the F46 Suspend/Activate Frida capture live **Severity:** P3 — residual from F46 (script ready, capture not yet run). diff --git a/rust/Cargo.lock b/rust/Cargo.lock index cda56ad..9df44ed 100644 --- a/rust/Cargo.lock +++ b/rust/Cargo.lock @@ -574,8 +574,10 @@ dependencies = [ name = "mxaccess-compat" version = "0.0.0" dependencies = [ + "async-trait", "futures-util", "mxaccess", + "mxaccess-rpc", "thiserror 2.0.18", "tokio", "tokio-stream", diff --git a/rust/crates/mxaccess-compat/Cargo.toml b/rust/crates/mxaccess-compat/Cargo.toml index 3e5ab00..abc4555 100644 --- a/rust/crates/mxaccess-compat/Cargo.toml +++ b/rust/crates/mxaccess-compat/Cargo.toml @@ -17,6 +17,14 @@ thiserror = { workspace = true } [dev-dependencies] tokio = { workspace = true, features = ["macros", "rt", "rt-multi-thread", "sync", "time"] } +async-trait = { workspace = true } +mxaccess-rpc = { path = "../mxaccess-rpc", version = "0.0.0" } + +[features] +default = [] +# F49 / F54 live test gate. Enables `Session::connect_nmx_auto` for +# the live integration test at `tests/lmx_write_complete_live.rs`. +live-windows-com = ["mxaccess/windows-com"] [lints] workspace = true diff --git a/rust/crates/mxaccess-compat/tests/lmx_write_complete_live.rs b/rust/crates/mxaccess-compat/tests/lmx_write_complete_live.rs new file mode 100644 index 0000000..b1f0586 --- /dev/null +++ b/rust/crates/mxaccess-compat/tests/lmx_write_complete_live.rs @@ -0,0 +1,309 @@ +//! Live verification of F54 — the `LMX_OnWriteComplete(hServer, hItem, +//! ref MXSTATUS_PROXY[])` callback shape end-to-end against AVEVA. +//! +//! Gated on `MX_LIVE` env. 
With `--features live-windows-com` the test +//! connects via `Session::connect_nmx_auto`; otherwise it resolves the +//! per-session NMX `INmxService2` IPID by shelling out to the .NET probe +//! (`MxNativeClient.Probe --probe-remqi-managed --objref-only`), parsing +//! the `remqi_managed_inmxservice2_ipid=` line, and then connects via +//! `Session::connect_nmx`. Note that `connect_nmx_auto` returns +//! RPC_S_SERVER_UNAVAILABLE in some local-COM activation paths. +//! +//! Run with: +//! ```text +//! cd rust +//! cargo test -p mxaccess-compat --features live-windows-com --test lmx_write_complete_live -- --ignored --nocapture +//! ``` +//! +//! Required env (populate via `tools/Setup-LiveProbeEnv.ps1`): +//! - `MX_LIVE=1` +//! - `MX_TEST_USER` / `MX_TEST_DOMAIN` / `MX_TEST_PASSWORD` +//! - `MX_NMX_HOST` (default `localhost`) +//! - `MX_TEST_TAG` (default `TestChildObject.TestInt`) +//! +//! Asserts: after a `LmxClient::write(h_server, h_item, value, user_id)` +//! the `client.on_write_complete()` stream yields a `WriteCompleteEvent` +//! with `(server_handle, item_handle, statuses, is_during_recovery)` +//! populated correctly. F49 sweep's core OnWriteComplete row. + +#![allow( + clippy::unwrap_used, + clippy::expect_used, + clippy::indexing_slicing, + clippy::panic +)] + +#[cfg(windows)] +mod live { + use std::process::Command; + use std::sync::Arc; + use std::time::Duration; + + use futures_util::StreamExt; + use mxaccess::{ + GalaxyTagMetadata, MxValue, RecoveryPolicy, Resolver, ResolverError, Session, + SessionOptions, + }; + use mxaccess_compat::LmxClient; + use mxaccess_rpc::guid::Guid; + use mxaccess_rpc::ntlm::NtlmClientContext; + + /// Minimal `Resolver` impl. Mirrors the inline shim every NMX + /// example uses. 
+ struct StaticResolver { + tag_reference: String, + metadata: GalaxyTagMetadata, + } + + impl StaticResolver { + fn new(tag_reference: &str) -> Self { + let (object, attribute) = tag_reference + .split_once('.') + .unwrap_or((tag_reference, "TestInt")); + Self { + tag_reference: tag_reference.to_string(), + metadata: GalaxyTagMetadata { + object_tag_name: object.to_string(), + attribute_name: attribute.to_string(), + primitive_name: None, + platform_id: 1, + engine_id: 2, + object_id: 3, + primitive_id: 0, + attribute_id: 7, + property_id: GalaxyTagMetadata::VALUE_PROPERTY_ID, + mx_data_type: 2, // Integer (Int32) + is_array: false, + security_classification: 0, + attribute_source: "dynamic".into(), + }, + } + } + } + + #[async_trait::async_trait] + impl Resolver for StaticResolver { + async fn resolve(&self, tag: &str) -> Result<GalaxyTagMetadata, ResolverError> { + if tag == self.tag_reference { + Ok(self.metadata.clone()) + } else { + Err(ResolverError::NotFound { + tag_reference: tag.to_string(), + }) + } + } + } + + fn ntlm_from_test_env() -> NtlmClientContext { + let user = std::env::var("MX_TEST_USER").expect("MX_TEST_USER"); + let password = std::env::var("MX_TEST_PASSWORD").expect("MX_TEST_PASSWORD"); + let domain = std::env::var("MX_TEST_DOMAIN").unwrap_or_default(); + let hostname = std::env::var("COMPUTERNAME").unwrap_or_default(); + NtlmClientContext::new(&user, &password, &domain, Some(&hostname)) + } + + #[cfg_attr(feature = "live-windows-com", allow(dead_code))] + /// Shell out to the .NET probe to resolve both the + /// `INmxService2` IPID and the `(host, port)` of the NMX + /// endpoint. Returns `(addr, ipid)` ready for `connect_nmx`. + /// + /// Two probe runs: + /// 1. `--probe-resolve-oxid-managed-ntlm-integrity` → parses the + /// first `ncacn_ip_tcp` binding from the `bindings=` line for + /// host + port. + /// 2. `--probe-remqi-managed` → parses the + /// `remqi_managed_inmxservice2_ipid=` line for the IPID. 
+ /// + /// Per-session live resolution; for production the consumer calls + /// `Session::connect_nmx_auto` (windows-com feature) instead. + fn resolve_endpoint_via_dotnet_probe() -> (std::net::SocketAddr, Guid) { + let project = std::path::Path::new(env!("CARGO_MANIFEST_DIR")) + .ancestors() + .nth(3) + .expect("repo root") + .join("src") + .join("MxNativeClient.Probe") + .join("MxNativeClient.Probe.csproj"); + + let resolve_out = run_probe(&project, "--probe-resolve-oxid-managed-ntlm-integrity"); + let bindings_line = resolve_out + .lines() + .find(|l| l.starts_with("resolve_oxid_managed_ntlm_integrity_bindings=")) + .expect("bindings line in probe output"); + let bindings = bindings_line + .split_once('=') + .map(|(_, v)| v) + .unwrap_or_default(); + + // First `ncacn_ip_tcp:HOST[PORT]` token. Pattern: + // string:0x0007:ncacn_ip_tcp:DESKTOP-6JL3KKO[64311]|... + let tcp_binding = bindings + .split('|') + .find(|tok| tok.contains(":ncacn_ip_tcp:")) + .expect("at least one ncacn_ip_tcp binding"); + let host_port = tcp_binding + .rsplit_once(":ncacn_ip_tcp:") + .map(|(_, v)| v) + .unwrap_or_default(); + let bracket_start = host_port.find('[').expect("[port] in binding"); + let host = &host_port[..bracket_start]; + let port: u16 = host_port[bracket_start + 1..] 
+ .trim_end_matches(']') + .parse() + .expect("parse port"); + + let addr = std::net::ToSocketAddrs::to_socket_addrs(&(host, port)) + .expect("DNS") + .find(|a| a.is_ipv4()) // prefer IPv4 — Rust transport stack is happier + .or_else(|| { + std::net::ToSocketAddrs::to_socket_addrs(&(host, port)) + .expect("DNS") + .next() + }) + .expect("at least one address"); + eprintln!("resolved NMX endpoint: {host}:{port} -> {addr}"); + + let remqi_out = run_probe(&project, "--probe-remqi-managed"); + let ipid = remqi_out + .lines() + .find_map(|l| l.strip_prefix("remqi_managed_inmxservice2_ipid=")) + .expect("ipid in probe output"); + let ipid = Guid::parse_str(ipid.trim()).expect("parse IPID"); + eprintln!("resolved INmxService2 IPID: {ipid:?}"); + + (addr, ipid) + } + + #[cfg_attr(feature = "live-windows-com", allow(dead_code))] + fn run_probe(project: &std::path::Path, mode: &str) -> String { + eprintln!("running .NET probe: {mode}"); + let output = Command::new("dotnet") + .args([ + "run", + "--project", + project.to_str().unwrap(), + "-c", + "Release", + "--", + mode, + "--objref-only", + ]) + .output() + .expect("dotnet run"); + if !output.status.success() { + panic!( + "dotnet probe ({mode}) failed: stderr={}", + String::from_utf8_lossy(&output.stderr) + ); + } + String::from_utf8_lossy(&output.stdout).into_owned() + } + + #[tokio::test(flavor = "multi_thread", worker_threads = 2)] + #[ignore] + async fn lmx_write_fires_on_write_complete_event() { + if std::env::var_os("MX_LIVE").is_none() { + eprintln!("MX_LIVE not set — skipping live test"); + return; + } + let tag = std::env::var("MX_TEST_TAG") + .unwrap_or_else(|_| "TestChildObject.TestInt".to_string()); + + // F54 live test: prefer `connect_nmx_auto` so the COM + // activation reference is held in-process for the duration of + // the run. Probe-style external IPID resolution doesn't work + // because the per-session IPID expires when the probe exits. 
+ #[cfg(feature = "live-windows-com")] + let session = { + eprintln!("connecting via Session::connect_nmx_auto"); + Session::connect_nmx_auto( + ntlm_from_test_env, + SessionOptions::default(), + Arc::new(StaticResolver::new(&tag)), + RecoveryPolicy::default(), + ) + .await + .expect("connect_nmx_auto") + }; + #[cfg(not(feature = "live-windows-com"))] + let session = { + // Fallback: probe-resolve the endpoint, then connect_nmx. + // Subject to the per-session-IPID expiry caveat above — + // this branch is mainly for visibility. + let _ = (resolve_endpoint_via_dotnet_probe, run_probe); + let (addr, service_ipid) = resolve_endpoint_via_dotnet_probe(); + eprintln!("connecting via Session::connect_nmx ({addr}, ipid={service_ipid:?})"); + Session::connect_nmx( + addr, + SessionOptions::default(), + ntlm_from_test_env(), + service_ipid, + Arc::new(StaticResolver::new(&tag)), + RecoveryPolicy::default(), + ) + .await + .expect("connect_nmx") + }; + + eprintln!("session connected"); + + let client = LmxClient::register("F54-live-test", session); + let server_handle = 1; // LmxClient::from_backend assigns 1. + + let item_handle = client + .add_item(server_handle, &tag) + .await + .expect("add_item"); + eprintln!("add_item({tag}) -> h_item={item_handle}"); + + // Subscribe to the OnWriteComplete stream BEFORE issuing the + // write so we don't race the broadcast channel. + let mut on_write_complete = client.on_write_complete(); + + eprintln!("write({tag}, 42)"); + client + .write(server_handle, item_handle, MxValue::Int32(42), 0) + .await + .expect("write"); + + // Wait for OnWriteComplete to fire. The 5-byte WRITE_COMPLETE_OK + // status word arrives via NMX callback typically within + // 50-200ms on a healthy local install. 
+ let evt = tokio::time::timeout(Duration::from_secs(10), on_write_complete.next()) + .await + .expect("OnWriteComplete didn't fire within 10s") + .expect("on_write_complete stream closed"); + + eprintln!( + "OnWriteComplete fired: server={} item={} statuses_len={} is_during_recovery={}", + evt.server_handle, + evt.item_handle, + evt.statuses.len(), + evt.is_during_recovery + ); + + // F54 contract — match the C# `LMX_OnWriteComplete(int hServer, + // int hItem, ref MXSTATUS_PROXY[] pVars)` signature shape. + assert_eq!(evt.server_handle, server_handle, "hServer matches"); + assert_eq!(evt.item_handle, item_handle, "hItem matches"); + assert!( + !evt.statuses.is_empty(), + "MXSTATUS_PROXY[] should carry at least one element" + ); + assert!(!evt.is_during_recovery); + + eprintln!("first status: {:?}", evt.statuses[0]); + + client.unregister(server_handle).await.expect("unregister"); + eprintln!("unregistered cleanly"); + } +} + +#[cfg(not(windows))] +mod live { + #[test] + #[ignore] + fn lmx_write_fires_on_write_complete_event() { + eprintln!("test skipped: requires Windows"); + } +} diff --git a/rust/crates/mxaccess-rpc/examples/com-marshal-probe.rs b/rust/crates/mxaccess-rpc/examples/com-marshal-probe.rs new file mode 100644 index 0000000..acd9ac6 --- /dev/null +++ b/rust/crates/mxaccess-rpc/examples/com-marshal-probe.rs @@ -0,0 +1,42 @@ +//! One-shot probe: run `marshal_activated_iunknown_objref` standalone +//! to isolate which step in the NMX activation pipeline is failing +//! with RPC_S_SERVER_UNAVAILABLE (1722) when called from cargo test. +//! +//! Run with: +//! ```text +//! cargo run -p mxaccess-rpc --example com-marshal-probe --features windows-com +//! 
``` + +#[cfg(all(windows, feature = "windows-com"))] +fn main() { + use mxaccess_rpc::com_objref_provider::{ + marshal_activated_iunknown_objref, MarshalContext, + }; + + eprintln!("step 1: marshal_activated_iunknown_objref(NmxSvc.NmxService, DifferentMachine)"); + match marshal_activated_iunknown_objref("NmxSvc.NmxService", MarshalContext::DifferentMachine) { + Ok(blob) => { + eprintln!("OK: {} bytes", blob.len()); + eprintln!("first 64 bytes (hex):"); + for chunk in blob.iter().take(64).enumerate() { + if chunk.0 % 16 == 0 { + eprint!("\n "); + } + eprint!("{:02x} ", chunk.1); + } + eprintln!(); + } + Err(e) => { + eprintln!("FAIL: {e}"); + std::process::exit(1); + } + } +} + +#[cfg(not(all(windows, feature = "windows-com")))] +fn main() { + eprintln!( + "com-marshal-probe requires Windows + the windows-com feature: \ + cargo run -p mxaccess-rpc --example com-marshal-probe --features windows-com" + ); +}