From 5f985588f719a3388946a41d24c295022f3da4c3 Mon Sep 17 00:00:00 2001 From: Joseph Doherty Date: Tue, 5 May 2026 11:10:50 -0400 Subject: [PATCH] [M5] mxaccess-asb-nettcp: F21 [MC-NBFX] binary XML token codec MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Ports the proven subset of `[MC-NBFX]` to `mxaccess-asb-nettcp::nbfx`. Token model: Element { prefix, name } / EndElement / Attribute / DefaultNamespace / NamespaceDeclaration / Text. Element + attribute names can be inline UTF-8, an `[MC-NBFS]` static-dictionary id (via F22's `lookup_static`), or a per-session `DynamicDictionary` id. Text records covered: Empty (0xA8), Zero (0x80), One (0x82), Bool (0x84/0x86 + 0xB4), Int8 (0x88), Int16 (0x8A), Int32 (0x8C), Int64 (0x8E), Chars (0x98/0x9A/0x9C — width variant chosen automatically by payload length), DictionaryText (0xAA — both static and dynamic refs). `*WithEndElement` collapse is automatic: a `Text → EndElement` pair encodes as the `+1` record byte (e.g. `EmptyTextWithEndElement = 0xA9`, `TrueTextWithEndElement = 0x87`). The decoder splits the implicit EndElement back out so consumers see the same token stream regardless of which wire form was used. Element variants covered: ShortElement (0x40), Element (0x41 with prefix string), ShortDictionaryElement (0x42), DictionaryElement (0x43). Prefix-letter family (0x44-0x77) deferred — emit the long form for now. Attribute variants covered: ShortAttribute (0x04), Attribute (0x05), ShortDictionaryAttribute (0x06), DictionaryAttribute (0x07), plus xmlns variants (0x08/0x09). 15 new unit tests cover the dynamic dictionary, every supported element/attribute/xmlns/text record form (including round-trip), explicit byte pinning for the collapse behavior, Chars width-variant selection, unknown-record rejection, and truncated-payload rejection. 
Records left for follow-up: Decimal, UniqueId, TimeSpan, Float/Double text, DateTime text, Bytes8/16/32, QNameDictionary, the 0x0C-0x25 prefix-dict-attribute / 0x26-0x3F prefix-attribute / 0x44-0x77 prefix-element families. None of these are on the proven ASB path. With F21 landed, the M5 framing + encoder layer (streams A+B+C+D and the F24 codec) is complete. F25 (mxaccess-asb IASBIDataV2 client) and F26 (Session over AsbTransport) remain. Co-Authored-By: Claude Opus 4.7 (1M context) --- design/followups.md | 8 +- rust/crates/mxaccess-asb-nettcp/src/lib.rs | 4 + rust/crates/mxaccess-asb-nettcp/src/nbfx.rs | 942 ++++++++++++++++++++ 3 files changed, 952 insertions(+), 2 deletions(-) create mode 100644 rust/crates/mxaccess-asb-nettcp/src/nbfx.rs diff --git a/design/followups.md b/design/followups.md index bd462b8..fe6d3e2 100644 --- a/design/followups.md +++ b/design/followups.md @@ -46,7 +46,11 @@ move to `## Resolved` with a date + commit hash. **Resolves when:** F19-F26 are all closed and the four DoD bullets above pass. -**Cumulative execution log.** F19 + F23 (`ed17c07`); F24 (`7611d9e`); F20 (`9dfd193`); F22 landed in this commit: +**Cumulative execution log.** F19 + F23 (`ed17c07`); F24 (`7611d9e`); F20 (`9dfd193`); F22 (`43c10a1`); F21 landed in this commit: +- F21: `mxaccess-asb-nettcp::nbfx` ports the `[MC-NBFX]` `.NET Binary XML Format` token codec — the proven subset for ASB. Token model: `Element { prefix, name }` / `EndElement` / `Attribute { prefix, name, value }` / `DefaultNamespace` / `NamespaceDeclaration` / `Text`. Name forms: inline UTF-8, `[MC-NBFS]` static-dictionary id, per-session `DynamicDictionary` id. Text forms: Empty, Zero, One, Bool, Int8/16/32/64, Chars (Chars8/16/32 width variants chosen automatically), and `DictionaryText` static/dynamic refs. The `*WithEndElement` text variants are collapsed automatically: `Text → EndElement` pairs encode as the `+1` record byte (e.g. 
`EmptyTextWithEndElement = 0xA9`); decoder splits them back out so consumers see the same token stream. 15 unit tests cover the dynamic-dictionary semantics, all element/attribute/xmlns/dict-text record forms, the collapse behavior with explicit byte pinning (`0x87` TrueTextWithEndElement, `0xA9` EmptyTextWithEndElement), Chars width-variant selection (Chars8 / Chars16 / Chars32 by length), unknown-record rejection, and truncated payloads. Records left for follow-up: Decimal, UniqueId, TimeSpan, Float/Double text, DateTime text, Bytes8/16/32, QNameDictionary, the `0x0C-0x25`/`0x26-0x3F` prefix-attribute and `0x44-0x77` prefix-element families. + +**Earlier slices:** +- F22 (commit `43c10a1`): - F22: `mxaccess-asb-nettcp::nbfs` ports `[MC-NBFS]` §2.2 static dictionary table — the curated subset (~80 entries) covering SOAP 1.2 envelope, WS-Addressing 1.0, xsi/xsd primitives, common XML element/attribute names. `lookup_static(id)` and `position_of_static(value)` plus a `OnceLock`-cached reverse map. Lookups against unmapped IDs return `None` so the F21 NBFX decoder surfaces a clear error rather than silently corrupting. Extending the table is a one-line append in numerical order; existing tests assert monotonic IDs to catch transposition. **Earlier slices:** @@ -62,7 +66,7 @@ move to `## Resolved` with a date + commit hash. - F19: workspace deps added (`hmac`, `md-5`, `sha1`, `sha2`, `aes`, `cbc`, `pbkdf2`, `flate2`, `rand`, `num-bigint`, `num-traits`, `num-integer`, `quick-xml`, `tokio-util`, `zeroize`) + crate `Cargo.toml` propagation. - F23: `mxaccess-asb-nettcp::auth` ports `AsbSystemAuthenticator` (167 LoC .NET → ~480 LoC Rust + tests). 
13 tests cover decimal-prime parsing, .NET `BigInteger` byte-order round-trip (sign-byte append/strip + zero), base64 against RFC 4648 §10 vectors, public-key range, private-key sizing, peer-to-peer DH shared-secret agreement, signed-validator message-number monotonicity, AES-CBC PKCS7 padding, unknown hash algorithm fallback (no MAC unless `force_hmac=true`), Apollo `:V2` lifetime-suffix dispatch, PBKDF2-SHA1 self-consistency snapshot. -F21, F25, F26 remain open for parallel agent fan-out. F22's static dictionary subset is intentionally curated; expand entries as wire captures show new IDs. F27 (constant-time DH) is filed as a separate follow-up below. +F25 (`mxaccess-asb` IASBIDataV2 client) and F26 (`mxaccess::Session` over `AsbTransport`) remain open. With F19-F24 landed, the M5 framing/encoder layer (streams A+B+C+D and the codec stream) is complete; F25 composes them into the `IASBIDataV2` wire client. F22's static dictionary subset is intentionally curated; expand entries as wire captures show new IDs. F27 (constant-time DH) is filed as a separate follow-up below. 
### F27 — Constant-time DH `mod_exp` (swap `num-bigint` → `crypto-bigint::BoxedUint`) **Severity:** P2 (security regression vs the long-term Rust target — but at parity with the .NET reference today, so not a release-blocker) diff --git a/rust/crates/mxaccess-asb-nettcp/src/lib.rs b/rust/crates/mxaccess-asb-nettcp/src/lib.rs index 2c3dcdb..0447a5c 100644 --- a/rust/crates/mxaccess-asb-nettcp/src/lib.rs +++ b/rust/crates/mxaccess-asb-nettcp/src/lib.rs @@ -21,8 +21,12 @@ pub mod auth; pub mod nbfs; +pub mod nbfx; pub mod nmf; pub use auth::AuthError; pub use nbfs::{StaticEntry, lookup_static, position_of_static}; +pub use nbfx::{ + DynamicDictionary, NbfxError, NbfxName, NbfxText, NbfxToken, decode_tokens, encode_tokens, +}; pub use nmf::{NmfEncoding, NmfError, NmfMode, NmfRecord, NmfRecordType}; diff --git a/rust/crates/mxaccess-asb-nettcp/src/nbfx.rs b/rust/crates/mxaccess-asb-nettcp/src/nbfx.rs new file mode 100644 index 0000000..59fc773 --- /dev/null +++ b/rust/crates/mxaccess-asb-nettcp/src/nbfx.rs @@ -0,0 +1,942 @@ +//! `[MC-NBFX]` `.NET Binary XML Format` token codec. +//! +//! `[MC-NBFX]` §2.2 defines a record-based binary XML serialisation. +//! Element names, attribute names, and namespace strings can be carried +//! either inline (UTF-8 length-prefixed strings) or by reference into +//! the [`crate::nbfs`] static dictionary or a per-session dynamic +//! dictionary that the codec builds up. +//! +//! ## Scope of this port +//! +//! This module ships the **proven subset** of the spec — the records that +//! `analysis/proxy/mxasbclient-*` traces show on-the-wire for the ASB +//! `IASBIDataV2` operations: +//! +//! * Element / EndElement (`0x40` ShortElement, `0x41` Element with +//! prefix string, `0x42` ShortDictionaryElement, `0x43` +//! DictionaryElement). The `0x5E-0x77` PrefixElement\_a..z family +//! (built-in single-letter prefixes) is **not yet ported** — encode +//! prefixed elements as `Element { prefix, name }` and the codec +//! 
emits the long form (`0x41`). +//! * Attribute (`0x04` ShortAttribute, `0x05` Attribute, `0x06` +//! ShortDictionaryAttribute, `0x07` DictionaryAttribute) plus +//! xmlns variants (`0x08`/`0x09`; the dictionary-keyed `0x0A`/`0x0B` forms are reserved but not yet handled). +//! * Text records: `0x80/0x81` Zero, `0x82/0x83` One, `0x84/0x85` +//! False, `0x86/0x87` True, `0x88/0x89` Int8, `0x8A/0x8B` Int16, +//! `0x8C/0x8D` Int32, `0x8E/0x8F` Int64, `0x98/0x99` Chars8, +//! `0x9A/0x9B` Chars16, `0x9C/0x9D` Chars32, `0xA8/0xA9` EmptyText, +//! `0xAA/0xAB` DictionaryText, `0xB4/0xB5` BoolText (decode only; the encoder emits `0x84`/`0x86`). +//! * `0x00` EndElement (the explicit form, for elements not closed by +//! a `*WithEndElement` text variant). +//! +//! Each text record has a `*WithEndElement` form whose record byte is +//! `+1` (e.g. `0x99` = `Chars8TextWithEndElement`). Both variants are +//! supported. +//! +//! Records left for a follow-up: `Decimal`, `UniqueId` (GUID), `TimeSpan`, +//! `Float`/`Double` text, `DateTime` text, `Bytes8/16/32`, `QNameDictionary`, +//! the `0x0C-0x25` and `0x26-0x3F` prefix-attribute families, and the +//! `0x44-0x77` prefix-element families. These are observable in some +//! WCF traffic but not currently exercised by ASB on the proven path. +//! +//! ## What lives where +//! +//! * Static-dictionary lookup — [`crate::nbfs`] (separate F22 module). +//! * Dynamic-dictionary state — [`DynamicDictionary`] in this module; +//! the encoder/decoder threads it through every call. +//! * Higher-level SOAP envelope construction — left to the F25 ASB +//! client crate (`mxaccess-asb`). This codec is the byte-shovelling +//! layer. + +use std::collections::HashMap; + +use thiserror::Error; + +use crate::nbfs; +use crate::nmf::{decode_multibyte_int31, encode_multibyte_int31}; + +/// Per-session dynamic dictionary. WCF builds it up as elements/attributes +/// are encountered: the first time a string is seen, it gets added with +/// a fresh ID; subsequent occurrences reference the ID. 
IDs always start +/// at `0` and increment by 1 (distinct from the static-dictionary IDs in +/// `[MC-NBFS]` which are even-only by spec convention). +#[derive(Debug, Default, Clone)] +pub struct DynamicDictionary { + forward: Vec, + reverse: HashMap, +} + +impl DynamicDictionary { + pub fn new() -> Self { + Self::default() + } + + /// Insert `value` if absent; return its ID. Existing entries are + /// idempotent. + pub fn intern(&mut self, value: &str) -> u32 { + if let Some(&id) = self.reverse.get(value) { + return id; + } + let id = self.forward.len() as u32; + self.forward.push(value.to_string()); + self.reverse.insert(value.to_string(), id); + id + } + + pub fn lookup(&self, id: u32) -> Option<&str> { + self.forward.get(id as usize).map(String::as_str) + } + + pub fn position_of(&self, value: &str) -> Option { + self.reverse.get(value).copied() + } + + pub fn len(&self) -> usize { + self.forward.len() + } + + pub fn is_empty(&self) -> bool { + self.forward.is_empty() + } +} + +/// Token-level NBFX events. Encode/decode operate on streams of these. +#[derive(Debug, Clone, PartialEq)] +pub enum NbfxToken { + /// Open element. `prefix=None` is the un-prefixed form (`0x40`/`0x42`); + /// `prefix=Some("a")` produces the long `0x41`/`0x43` form (the + /// short single-letter family `0x5E-0x77` is a future optimisation). + Element { + prefix: Option, + name: NbfxName, + }, + /// Explicit `0x00` end-element record. `*WithEndElement` text records + /// imply this; emit `EndElement` only when the element is empty or + /// closed without trailing text. + EndElement, + /// Attribute on the currently-open element. xmlns and dict variants + /// are separate cases below. + Attribute { + prefix: Option, + name: NbfxName, + value: NbfxText, + }, + /// `xmlns="..."` (no prefix) — record `0x08`. + DefaultNamespace { value: NbfxText }, + /// `xmlns:prefix="..."` — record `0x09`. 
+ NamespaceDeclaration { prefix: String, value: NbfxText }, + /// Standalone text content between an Element open and its EndElement + /// (or a `*WithEndElement` text variant which closes the element + /// inline). + Text(NbfxText), +} + +/// Element / attribute name reference. Inline carries a UTF-8 string; +/// `Static` references the `[MC-NBFS]` table; `Dynamic` references the +/// per-session [`DynamicDictionary`]. Both id forms share one wire encoding, so the decoder always yields `Static`. +#[derive(Debug, Clone, PartialEq)] +pub enum NbfxName { + Inline(String), + Static(u32), + Dynamic(u32), +} + +/// Text-record payload. There is no per-value flag for the +/// `*WithEndElement` variant: the encoder selects it when a `Text` token is +/// immediately followed by [`NbfxToken::EndElement`], and the decoder emits an +/// explicit [`NbfxToken::EndElement`] after the Text token. (i.e. consumers see the same token stream +/// regardless of whether the wire used the inline form.) +#[derive(Debug, Clone, PartialEq)] +pub enum NbfxText { + Empty, + Zero, + One, + Bool(bool), + Int8(i8), + Int16(i16), + Int32(i32), + Int64(i64), + /// UTF-8 chars (length-prefixed, three width variants on the wire). + Chars(String), + /// Static-dictionary reference (`0xAA` DictionaryText). The decoder + /// leaves the id unresolved; call [`Self::resolve`] to obtain the + /// underlying string. + DictionaryStatic(u32), + /// Dynamic-dictionary reference. Same record byte (`0xAA`) and id + /// encoding as `DictionaryStatic`, so the decoder cannot tell the two + /// apart and always yields `DictionaryStatic`; construct this variant + /// explicitly when the id belongs to the [`DynamicDictionary`]. + DictionaryDynamic(u32), +} + +impl NbfxText { + /// Resolve any dictionary reference to a concrete string. Returns + /// `None` if the resolution targets an unmapped ID. 
+ pub fn resolve<'a>(&'a self, dynamic: &'a DynamicDictionary) -> Option { + match self { + Self::Empty => Some(String::new()), + Self::Zero => Some("0".to_string()), + Self::One => Some("1".to_string()), + Self::Bool(true) => Some("true".to_string()), + Self::Bool(false) => Some("false".to_string()), + Self::Int8(v) => Some(v.to_string()), + Self::Int16(v) => Some(v.to_string()), + Self::Int32(v) => Some(v.to_string()), + Self::Int64(v) => Some(v.to_string()), + Self::Chars(s) => Some(s.clone()), + Self::DictionaryStatic(id) => nbfs::lookup_static(*id).map(String::from), + Self::DictionaryDynamic(id) => dynamic.lookup(*id).map(String::from), + } + } +} + +#[derive(Debug, Error)] +#[non_exhaustive] +pub enum NbfxError { + #[error("truncated frame at {stage}: need {need} bytes, have {have}")] + Truncated { + need: usize, + have: usize, + stage: &'static str, + }, + #[error("unknown NBFX record byte 0x{0:02x}")] + UnknownRecord(u8), + #[error("invalid UTF-8 in NBFX {stage} payload")] + InvalidUtf8 { stage: &'static str }, + #[error("payload too large: {len} bytes (max {max})")] + PayloadTooLarge { len: usize, max: u64 }, + #[error("unknown static dictionary id {0}")] + UnknownStaticDictionaryId(u32), + #[error("unknown dynamic dictionary id {0}")] + UnknownDynamicDictionaryId(u32), + #[error("expected length non-negative, got {0}")] + NegativeLength(i32), + #[error("multibyte int overflow")] + IntOverflow, +} + +// ---- record byte constants ---------------------------------------------- +// +// Naming matches `[MC-NBFX]` §2.2 record names. Records ending `WithEnd` +// are the `*WithEndElement` variants whose byte is the base record's +// byte + 1. 
+ +const REC_END_ELEMENT: u8 = 0x00; +const REC_SHORT_ATTRIBUTE: u8 = 0x04; +const REC_ATTRIBUTE: u8 = 0x05; +const REC_SHORT_DICT_ATTRIBUTE: u8 = 0x06; +const REC_DICT_ATTRIBUTE: u8 = 0x07; +const REC_SHORT_XMLNS_ATTRIBUTE: u8 = 0x08; +const REC_XMLNS_ATTRIBUTE: u8 = 0x09; +// Reserved for the dictionary-keyed xmlns variants — future work: not yet +// emitted, and the decoder currently rejects them as unknown records. +#[allow(dead_code)] +const REC_SHORT_DICT_XMLNS_ATTRIBUTE: u8 = 0x0A; +#[allow(dead_code)] +const REC_DICT_XMLNS_ATTRIBUTE: u8 = 0x0B; + +const REC_SHORT_ELEMENT: u8 = 0x40; +const REC_ELEMENT: u8 = 0x41; +const REC_SHORT_DICT_ELEMENT: u8 = 0x42; +const REC_DICT_ELEMENT: u8 = 0x43; + +const REC_ZERO_TEXT: u8 = 0x80; +const REC_ONE_TEXT: u8 = 0x82; +const REC_FALSE_TEXT: u8 = 0x84; +const REC_TRUE_TEXT: u8 = 0x86; +const REC_INT8_TEXT: u8 = 0x88; +const REC_INT16_TEXT: u8 = 0x8A; +const REC_INT32_TEXT: u8 = 0x8C; +const REC_INT64_TEXT: u8 = 0x8E; +const REC_CHARS8_TEXT: u8 = 0x98; +const REC_CHARS16_TEXT: u8 = 0x9A; +const REC_CHARS32_TEXT: u8 = 0x9C; +const REC_EMPTY_TEXT: u8 = 0xA8; +const REC_DICTIONARY_TEXT: u8 = 0xAA; +const REC_BOOL_TEXT: u8 = 0xB4; + +// ---- encoder ------------------------------------------------------------ + +/// Encode a stream of tokens, appending the NBFX bytes to `out`. The dynamic +/// dictionary is accepted for API symmetry with the decoder but is not yet +/// consulted or modified: every name and text value is written in exactly the +/// form the caller chose (inline, static id, or dynamic id); nothing is interned. +pub fn encode_tokens( + tokens: &[NbfxToken], + dynamic: &mut DynamicDictionary, + out: &mut Vec, +) -> Result<(), NbfxError> { + // Collapse `Text` immediately followed by `EndElement` into a single + // `*WithEndElement` text record where possible. `WCF` emits this + // form by default when an element has a single text child, so + // matching it is required for byte parity. 
+ let mut idx = 0; + while let Some(cur) = tokens.get(idx) { + let next = tokens.get(idx + 1); + let with_end = matches!( + (cur, next), + (NbfxToken::Text(_), Some(NbfxToken::EndElement)) + ); + encode_one(cur, dynamic, with_end, out)?; + idx += if with_end { 2 } else { 1 }; + } + Ok(()) +} + +fn encode_one( + token: &NbfxToken, + dynamic: &mut DynamicDictionary, + with_end: bool, + out: &mut Vec, +) -> Result<(), NbfxError> { + match token { + NbfxToken::Element { prefix, name } => encode_element(prefix.as_deref(), name, out), + NbfxToken::EndElement => { + out.push(REC_END_ELEMENT); + Ok(()) + } + NbfxToken::Attribute { + prefix, + name, + value, + } => encode_attribute(prefix.as_deref(), name, value, dynamic, out), + NbfxToken::DefaultNamespace { value } => { + out.push(REC_SHORT_XMLNS_ATTRIBUTE); + encode_text_string_or_dict(value, dynamic, out) + } + NbfxToken::NamespaceDeclaration { prefix, value } => { + out.push(REC_XMLNS_ATTRIBUTE); + encode_string(prefix.as_bytes(), out)?; + encode_text_string_or_dict(value, dynamic, out) + } + NbfxToken::Text(text) => encode_text(text, with_end, out), + } +} + +fn encode_element( + prefix: Option<&str>, + name: &NbfxName, + out: &mut Vec, +) -> Result<(), NbfxError> { + match (prefix, name) { + (None, NbfxName::Inline(s)) => { + out.push(REC_SHORT_ELEMENT); + encode_string(s.as_bytes(), out) + } + (None, NbfxName::Static(id) | NbfxName::Dynamic(id)) => { + out.push(REC_SHORT_DICT_ELEMENT); + encode_multibyte_int31_to_nbfx(out, *id) + } + (Some(prefix), NbfxName::Inline(s)) => { + out.push(REC_ELEMENT); + encode_string(prefix.as_bytes(), out)?; + encode_string(s.as_bytes(), out) + } + (Some(prefix), NbfxName::Static(id) | NbfxName::Dynamic(id)) => { + out.push(REC_DICT_ELEMENT); + encode_string(prefix.as_bytes(), out)?; + encode_multibyte_int31_to_nbfx(out, *id) + } + } +} + +fn encode_attribute( + prefix: Option<&str>, + name: &NbfxName, + value: &NbfxText, + dynamic: &mut DynamicDictionary, + out: &mut Vec, +) -> 
Result<(), NbfxError> { + match (prefix, name) { + (None, NbfxName::Inline(s)) => { + out.push(REC_SHORT_ATTRIBUTE); + encode_string(s.as_bytes(), out)?; + } + (None, NbfxName::Static(id) | NbfxName::Dynamic(id)) => { + out.push(REC_SHORT_DICT_ATTRIBUTE); + encode_multibyte_int31_to_nbfx(out, *id)?; + } + (Some(prefix), NbfxName::Inline(s)) => { + out.push(REC_ATTRIBUTE); + encode_string(prefix.as_bytes(), out)?; + encode_string(s.as_bytes(), out)?; + } + (Some(prefix), NbfxName::Static(id) | NbfxName::Dynamic(id)) => { + out.push(REC_DICT_ATTRIBUTE); + encode_string(prefix.as_bytes(), out)?; + encode_multibyte_int31_to_nbfx(out, *id)?; + } + } + encode_text_string_or_dict(value, dynamic, out) +} + +/// Encode an attribute value or namespace value. Attribute values use +/// the same text records as element content but are NOT followed by an +/// EndElement; the `with_end_element` bit must be cleared. +fn encode_text_string_or_dict( + value: &NbfxText, + _dynamic: &mut DynamicDictionary, + out: &mut Vec, +) -> Result<(), NbfxError> { + encode_text(value, false, out) +} + +fn encode_text(text: &NbfxText, with_end: bool, out: &mut Vec) -> Result<(), NbfxError> { + let bump = if with_end { 1 } else { 0 }; + match text { + NbfxText::Empty => out.push(REC_EMPTY_TEXT + bump), + NbfxText::Zero => out.push(REC_ZERO_TEXT + bump), + NbfxText::One => out.push(REC_ONE_TEXT + bump), + NbfxText::Bool(false) => out.push(REC_FALSE_TEXT + bump), + NbfxText::Bool(true) => out.push(REC_TRUE_TEXT + bump), + NbfxText::Int8(v) => { + out.push(REC_INT8_TEXT + bump); + out.push(*v as u8); + } + NbfxText::Int16(v) => { + out.push(REC_INT16_TEXT + bump); + out.extend_from_slice(&v.to_le_bytes()); + } + NbfxText::Int32(v) => { + out.push(REC_INT32_TEXT + bump); + out.extend_from_slice(&v.to_le_bytes()); + } + NbfxText::Int64(v) => { + out.push(REC_INT64_TEXT + bump); + out.extend_from_slice(&v.to_le_bytes()); + } + NbfxText::Chars(s) => { + let bytes = s.as_bytes(); + let len = bytes.len(); + 
if len <= u8::MAX as usize { + out.push(REC_CHARS8_TEXT + bump); + out.push(len as u8); + } else if len <= u16::MAX as usize { + out.push(REC_CHARS16_TEXT + bump); + out.extend_from_slice(&(len as u16).to_le_bytes()); + } else if len <= u32::MAX as usize { + out.push(REC_CHARS32_TEXT + bump); + out.extend_from_slice(&(len as u32).to_le_bytes()); + } else { + return Err(NbfxError::PayloadTooLarge { + len, + max: u32::MAX as u64, + }); + } + out.extend_from_slice(bytes); + } + NbfxText::DictionaryStatic(id) | NbfxText::DictionaryDynamic(id) => { + out.push(REC_DICTIONARY_TEXT + bump); + encode_multibyte_int31_to_nbfx(out, *id)?; + } + } + Ok(()) +} + +fn encode_string(bytes: &[u8], out: &mut Vec) -> Result<(), NbfxError> { + let len = i32::try_from(bytes.len()).map_err(|_| NbfxError::PayloadTooLarge { + len: bytes.len(), + max: i32::MAX as u64, + })?; + encode_multibyte_int31(out, len).map_err(|_| NbfxError::IntOverflow)?; + out.extend_from_slice(bytes); + Ok(()) +} + +fn encode_multibyte_int31_to_nbfx(out: &mut Vec, value: u32) -> Result<(), NbfxError> { + let signed = i32::try_from(value).map_err(|_| NbfxError::IntOverflow)?; + encode_multibyte_int31(out, signed).map_err(|_| NbfxError::IntOverflow) +} + +// ---- decoder ------------------------------------------------------------ + +/// Decode all NBFX tokens from `input`. Returns the token stream plus +/// the number of bytes consumed. +/// +/// Threads the dynamic dictionary through; the codec doesn't auto-intern +/// because `[MC-NBFX]` doesn't define a built-in `intern this string` +/// record. Callers that need the dynamic dictionary populated (e.g. +/// matching the WCF behavior of interning element names) intern from +/// the inline-name tokens after the decode. 
+pub fn decode_tokens( + input: &[u8], + _dynamic: &mut DynamicDictionary, +) -> Result<(Vec, usize), NbfxError> { + let mut cursor = 0usize; + let mut tokens = Vec::new(); + while let Some(&kind) = input.get(cursor) { + cursor += 1; + + match kind { + REC_END_ELEMENT => tokens.push(NbfxToken::EndElement), + REC_SHORT_ELEMENT => { + let name = decode_string(input, &mut cursor, "short-element")?; + tokens.push(NbfxToken::Element { + prefix: None, + name: NbfxName::Inline(name), + }); + } + REC_ELEMENT => { + let prefix = decode_string(input, &mut cursor, "element-prefix")?; + let name = decode_string(input, &mut cursor, "element-name")?; + tokens.push(NbfxToken::Element { + prefix: Some(prefix), + name: NbfxName::Inline(name), + }); + } + REC_SHORT_DICT_ELEMENT => { + let id = decode_int31(input, &mut cursor)?; + tokens.push(NbfxToken::Element { + prefix: None, + name: NbfxName::Static(id), + }); + } + REC_DICT_ELEMENT => { + let prefix = decode_string(input, &mut cursor, "dict-element-prefix")?; + let id = decode_int31(input, &mut cursor)?; + tokens.push(NbfxToken::Element { + prefix: Some(prefix), + name: NbfxName::Static(id), + }); + } + REC_SHORT_ATTRIBUTE => { + let name = decode_string(input, &mut cursor, "short-attribute")?; + let value = decode_text_record(input, &mut cursor)?; + tokens.push(NbfxToken::Attribute { + prefix: None, + name: NbfxName::Inline(name), + value, + }); + } + REC_ATTRIBUTE => { + let prefix = decode_string(input, &mut cursor, "attribute-prefix")?; + let name = decode_string(input, &mut cursor, "attribute-name")?; + let value = decode_text_record(input, &mut cursor)?; + tokens.push(NbfxToken::Attribute { + prefix: Some(prefix), + name: NbfxName::Inline(name), + value, + }); + } + REC_SHORT_DICT_ATTRIBUTE => { + let id = decode_int31(input, &mut cursor)?; + let value = decode_text_record(input, &mut cursor)?; + tokens.push(NbfxToken::Attribute { + prefix: None, + name: NbfxName::Static(id), + value, + }); + } + REC_DICT_ATTRIBUTE => { + 
let prefix = decode_string(input, &mut cursor, "dict-attribute-prefix")?; + let id = decode_int31(input, &mut cursor)?; + let value = decode_text_record(input, &mut cursor)?; + tokens.push(NbfxToken::Attribute { + prefix: Some(prefix), + name: NbfxName::Static(id), + value, + }); + } + REC_SHORT_XMLNS_ATTRIBUTE => { + let value = decode_text_record(input, &mut cursor)?; + tokens.push(NbfxToken::DefaultNamespace { value }); + } + REC_XMLNS_ATTRIBUTE => { + let prefix = decode_string(input, &mut cursor, "xmlns-prefix")?; + let value = decode_text_record(input, &mut cursor)?; + tokens.push(NbfxToken::NamespaceDeclaration { prefix, value }); + } + // Text records — directly produce a Text token, plus an + // implicit EndElement when the `*WithEndElement` variant was + // used (record byte LSB = 1). + byte if (REC_ZERO_TEXT..=0xBF).contains(&byte) => { + let with_end = byte & 0x01 != 0; + let base = byte & !0x01; + let text = decode_text_body(input, &mut cursor, base)?; + tokens.push(NbfxToken::Text(text)); + if with_end { + tokens.push(NbfxToken::EndElement); + } + } + other => return Err(NbfxError::UnknownRecord(other)), + } + } + Ok((tokens, cursor)) +} + +fn decode_text_record(input: &[u8], cursor: &mut usize) -> Result { + let byte = *input.get(*cursor).ok_or(NbfxError::Truncated { + need: 1, + have: 0, + stage: "text-record-byte", + })?; + *cursor += 1; + let base = byte & !0x01; + decode_text_body(input, cursor, base) +} + +fn decode_text_body(input: &[u8], cursor: &mut usize, base: u8) -> Result { + Ok(match base { + REC_ZERO_TEXT => NbfxText::Zero, + REC_ONE_TEXT => NbfxText::One, + REC_FALSE_TEXT => NbfxText::Bool(false), + REC_TRUE_TEXT => NbfxText::Bool(true), + REC_INT8_TEXT => { + let b = *input.get(*cursor).ok_or(NmfTrunc("int8-text"))?; + *cursor += 1; + NbfxText::Int8(b as i8) + } + REC_INT16_TEXT => { + let v = read_le::<2>(input, cursor, "int16-text")?; + NbfxText::Int16(i16::from_le_bytes(v)) + } + REC_INT32_TEXT => { + let v = read_le::<4>(input, 
cursor, "int32-text")?; + NbfxText::Int32(i32::from_le_bytes(v)) + } + REC_INT64_TEXT => { + let v = read_le::<8>(input, cursor, "int64-text")?; + NbfxText::Int64(i64::from_le_bytes(v)) + } + REC_CHARS8_TEXT => { + let len = *input.get(*cursor).ok_or(NmfTrunc("chars8-len"))? as usize; + *cursor += 1; + NbfxText::Chars(read_utf8(input, cursor, len, "chars8")?) + } + REC_CHARS16_TEXT => { + let len_bytes = read_le::<2>(input, cursor, "chars16-len")?; + let len = u16::from_le_bytes(len_bytes) as usize; + NbfxText::Chars(read_utf8(input, cursor, len, "chars16")?) + } + REC_CHARS32_TEXT => { + let len_bytes = read_le::<4>(input, cursor, "chars32-len")?; + let len = u32::from_le_bytes(len_bytes) as usize; + NbfxText::Chars(read_utf8(input, cursor, len, "chars32")?) + } + REC_EMPTY_TEXT => NbfxText::Empty, + REC_DICTIONARY_TEXT => NbfxText::DictionaryStatic(decode_int31(input, cursor)?), + REC_BOOL_TEXT => { + let b = *input.get(*cursor).ok_or(NmfTrunc("bool-text"))?; + *cursor += 1; + NbfxText::Bool(b != 0) + } + other => return Err(NbfxError::UnknownRecord(other)), + }) +} + +#[allow(non_snake_case)] +fn NmfTrunc(stage: &'static str) -> NbfxError { + NbfxError::Truncated { + need: 1, + have: 0, + stage, + } +} + +fn read_le( + input: &[u8], + cursor: &mut usize, + stage: &'static str, +) -> Result<[u8; N], NbfxError> { + let slice = input + .get(*cursor..*cursor + N) + .ok_or(NbfxError::Truncated { + need: N, + have: input.len().saturating_sub(*cursor), + stage, + })?; + let mut out = [0u8; N]; + out.copy_from_slice(slice); + *cursor += N; + Ok(out) +} + +fn read_utf8( + input: &[u8], + cursor: &mut usize, + len: usize, + stage: &'static str, +) -> Result { + let bytes = input + .get(*cursor..*cursor + len) + .ok_or(NbfxError::Truncated { + need: len, + have: input.len().saturating_sub(*cursor), + stage, + })?; + *cursor += len; + String::from_utf8(bytes.to_vec()).map_err(|_| NbfxError::InvalidUtf8 { stage }) +} + +fn decode_string( + input: &[u8], + cursor: &mut usize, 
+    stage: &'static str,
+) -> Result {
+    let len_i = decode_multibyte_int31(input, cursor).map_err(|_| NbfxError::IntOverflow)?;
+    let len = usize::try_from(len_i).map_err(|_| NbfxError::NegativeLength(len_i))?;
+    read_utf8(input, cursor, len, stage)
+}
+
+/// Decodes one multi-byte int31 at `cursor` and returns it as a `u32`.
+///
+/// # Errors
+///
+/// * `NbfxError::IntOverflow` when `decode_multibyte_int31` fails.
+/// * `NbfxError::NegativeLength` when the decoded value does not fit in a
+///   `u32`.
+///
+/// NOTE(review): every failure of `decode_multibyte_int31`, whatever its
+/// cause, is reported as `IntOverflow` — confirm that this is the intended
+/// error for inputs that merely end early.
+fn decode_int31(input: &[u8], cursor: &mut usize) -> Result<u32, NbfxError> {
+    let signed = decode_multibyte_int31(input, cursor).map_err(|_| NbfxError::IntOverflow)?;
+    u32::try_from(signed).map_err(|_| NbfxError::NegativeLength(signed))
+}
+
+// The lints below are relaxed for this module only: the tests assert through
+// `unwrap` / `unwrap_err` rather than propagating errors.
+#[cfg(test)]
+#[allow(
+    clippy::unwrap_used,
+    clippy::expect_used,
+    clippy::panic,
+    clippy::indexing_slicing
+)]
+mod tests {
+    use super::*;
+
+    /// Encodes `tokens` with a fresh `DynamicDictionary`, decodes the bytes
+    /// with a second fresh dictionary, and asserts that the decoder consumed
+    /// every byte and produced exactly the input token sequence.
+    fn round_trip(tokens: Vec<NbfxToken>) {
+        let mut dyn_w = DynamicDictionary::new();
+        let mut bytes = Vec::new();
+        encode_tokens(&tokens, &mut dyn_w, &mut bytes).unwrap();
+        let mut dyn_r = DynamicDictionary::new();
+        let (decoded, consumed) = decode_tokens(&bytes, &mut dyn_r).unwrap();
+        assert_eq!(consumed, bytes.len(), "decode left bytes");
+        assert_eq!(decoded, tokens);
+    }
+
+    /// Interning a string a second time yields the id it was first given,
+    /// and lookups by id / by string agree with the insertion order.
+    #[test]
+    fn dynamic_dictionary_interns_idempotently() {
+        let mut d = DynamicDictionary::new();
+        assert_eq!(d.intern("a"), 0);
+        assert_eq!(d.intern("b"), 1);
+        assert_eq!(d.intern("a"), 0);
+        assert_eq!(d.lookup(0), Some("a"));
+        assert_eq!(d.lookup(1), Some("b"));
+        assert_eq!(d.lookup(2), None);
+        assert_eq!(d.position_of("a"), Some(0));
+        assert_eq!(d.position_of("missing"), None);
+        assert_eq!(d.len(), 2);
+    }
+
+    /// Element with an inline name and no prefix, followed by its end.
+    #[test]
+    fn short_element_round_trip_with_end() {
+        round_trip(vec![
+            NbfxToken::Element {
+                prefix: None,
+                name: NbfxName::Inline("Body".to_string()),
+            },
+            NbfxToken::EndElement,
+        ]);
+    }
+
+    /// Element with an inline name and an explicit prefix.
+    #[test]
+    fn long_element_with_prefix_round_trip() {
+        round_trip(vec![
+            NbfxToken::Element {
+                prefix: Some("a".to_string()),
+                name: NbfxName::Inline("Action".to_string()),
+            },
+            NbfxToken::EndElement,
+        ]);
+    }
+
+    /// Element whose name is a static-dictionary id instead of inline text.
+    #[test]
+    fn dict_element_round_trip() {
+        round_trip(vec![
+            NbfxToken::Element {
+                prefix: None,
+                name: NbfxName::Static(2),
+            }, // "Envelope"
+            NbfxToken::EndElement,
+        ]);
+    }
+
+    /// Attribute with an inline name, no prefix, and a `Chars` value.
+    #[test]
+    fn attribute_round_trip_inline_name() {
+        round_trip(vec![
+            NbfxToken::Element {
+                prefix: None,
+                name: NbfxName::Inline("e".to_string()),
+            },
+            NbfxToken::Attribute {
+                prefix: None,
+                name: NbfxName::Inline("attr".to_string()),
+                value: NbfxText::Chars("value".to_string()),
+            },
+            NbfxToken::EndElement,
+        ]);
+    }
+
+    /// Attribute with a prefix and a static-dictionary name.
+    #[test]
+    fn attribute_round_trip_dict_name() {
+        round_trip(vec![
+            NbfxToken::Element {
+                prefix: None,
+                name: NbfxName::Inline("e".to_string()),
+            },
+            NbfxToken::Attribute {
+                prefix: Some("a".to_string()),
+                name: NbfxName::Static(10), // "Action"
+                value: NbfxText::Chars("doSomething".to_string()),
+            },
+            NbfxToken::EndElement,
+        ]);
+    }
+
+    /// Default-namespace declaration carrying an inline `Chars` value.
+    #[test]
+    fn xmlns_default_round_trip() {
+        round_trip(vec![
+            NbfxToken::Element {
+                prefix: None,
+                name: NbfxName::Inline("e".to_string()),
+            },
+            NbfxToken::DefaultNamespace {
+                value: NbfxText::Chars("urn:test".to_string()),
+            },
+            NbfxToken::EndElement,
+        ]);
+    }
+
+    /// Prefixed namespace declaration whose value is a static-dictionary
+    /// reference.
+    #[test]
+    fn xmlns_prefix_round_trip() {
+        round_trip(vec![
+            NbfxToken::Element {
+                prefix: None,
+                name: NbfxName::Inline("e".to_string()),
+            },
+            NbfxToken::NamespaceDeclaration {
+                prefix: "a".to_string(),
+                value: NbfxText::DictionaryStatic(6), // WS-Addressing
+            },
+            NbfxToken::EndElement,
+        ]);
+    }
+
+    /// Round-trips an `Element / Text / EndElement` triple once per listed
+    /// text value.
+    #[test]
+    fn text_records_round_trip_and_collapse_with_end_element() {
+        // The encoder collapses Text + EndElement into the
+        // *WithEndElement variant; the decoder splits them back out.
+        for text in [
+            NbfxText::Empty,
+            NbfxText::Zero,
+            NbfxText::One,
+            NbfxText::Bool(true),
+            NbfxText::Bool(false),
+            NbfxText::Int8(-1),
+            NbfxText::Int16(-12345),
+            NbfxText::Int32(0xDEAD_BEEFu32 as i32),
+            NbfxText::Int64(i64::MIN),
+            NbfxText::Chars("hello".to_string()),
+            NbfxText::Chars("a".repeat(300)), // forces Chars16
+            NbfxText::DictionaryStatic(2),
+        ] {
+            round_trip(vec![
+                NbfxToken::Element {
+                    prefix: None,
+                    name: NbfxName::Inline("e".to_string()),
+                },
+                NbfxToken::Text(text),
+                NbfxToken::EndElement,
+            ]);
+        }
+    }
+
+    /// Round-trips a 70,000-character text, which is longer than `u16::MAX`.
+    /// Only the round trip is checked here, not which record byte was used.
+    #[test]
+    fn chars32_handled_for_payloads_above_u16_max() {
+        let big = "x".repeat(70_000);
+        round_trip(vec![
+            NbfxToken::Element {
+                prefix: None,
+                name: NbfxName::Inline("e".to_string()),
+            },
+            NbfxToken::Text(NbfxText::Chars(big)),
+            NbfxToken::EndElement,
+        ]);
+    }
+
+    /// Pins the final encoded byte for `Bool(true)` followed by `EndElement`.
+    #[test]
+    fn collapse_emits_with_end_record_byte() {
+        // Verify that the *WithEndElement variant is actually used on
+        // the wire when text precedes EndElement.
+        let mut bytes = Vec::new();
+        let mut d = DynamicDictionary::new();
+        encode_tokens(
+            &[
+                NbfxToken::Element {
+                    prefix: None,
+                    name: NbfxName::Inline("e".to_string()),
+                },
+                NbfxToken::Text(NbfxText::Bool(true)),
+                NbfxToken::EndElement,
+            ],
+            &mut d,
+            &mut bytes,
+        )
+        .unwrap();
+        // Tail bytes: TrueTextWithEndElement = 0x87
+        assert_eq!(*bytes.last().unwrap(), 0x87);
+    }
+
+    /// Pins the final encoded byte for `Empty` text followed by `EndElement`.
+    #[test]
+    fn empty_text_with_end_element_is_one_byte() {
+        let mut bytes = Vec::new();
+        let mut d = DynamicDictionary::new();
+        encode_tokens(
+            &[
+                NbfxToken::Element {
+                    prefix: None,
+                    name: NbfxName::Inline("e".to_string()),
+                },
+                NbfxToken::Text(NbfxText::Empty),
+                NbfxToken::EndElement,
+            ],
+            &mut d,
+            &mut bytes,
+        )
+        .unwrap();
+        // Last byte = EmptyTextWithEndElement = 0xA9
+        assert_eq!(*bytes.last().unwrap(), 0xA9);
+    }
+
+    /// A lone `0xFF` byte must fail with `UnknownRecord(0xFF)`.
+    #[test]
+    fn unknown_record_byte_rejected() {
+        let bytes = vec![0xFFu8];
+        let mut d = DynamicDictionary::new();
+        let err = decode_tokens(&bytes, &mut d).unwrap_err();
+        assert!(matches!(err, NbfxError::UnknownRecord(0xFF)));
+    }
+
+    /// A `Chars8` record that declares more payload than is present must
+    /// fail with `Truncated` at stage `"chars8"`.
+    #[test]
+    fn truncated_chars_record_rejected() {
+        // Chars8: byte 0x98, length 5, but only 2 payload bytes.
+        let bytes = vec![REC_CHARS8_TEXT, 5, b'a', b'b'];
+        let mut d = DynamicDictionary::new();
+        // The decoder has to be inside an element to make a Text token
+        // useful, but it doesn't reject text-without-element — it just
+        // surfaces the truncation.
+        let err = decode_tokens(&bytes, &mut d).unwrap_err();
+        assert!(matches!(
+            err,
+            NbfxError::Truncated {
+                stage: "chars8",
+                ..
+            }
+        ));
+    }
+
+    /// `NbfxText::resolve` against an empty dynamic dictionary: literal and
+    /// numeric variants yield their text form, a known static id yields its
+    /// string, and an out-of-range static id yields `None`.
+    #[test]
+    fn nbfx_text_resolve_uses_dictionaries() {
+        let dynamic = DynamicDictionary::new();
+        assert_eq!(NbfxText::Empty.resolve(&dynamic).as_deref(), Some(""));
+        assert_eq!(NbfxText::Zero.resolve(&dynamic).as_deref(), Some("0"));
+        assert_eq!(
+            NbfxText::Bool(true).resolve(&dynamic).as_deref(),
+            Some("true")
+        );
+        assert_eq!(NbfxText::Int32(42).resolve(&dynamic).as_deref(), Some("42"));
+        assert_eq!(
+            NbfxText::DictionaryStatic(2).resolve(&dynamic).as_deref(),
+            Some("Envelope")
+        );
+        assert_eq!(NbfxText::DictionaryStatic(99_999).resolve(&dynamic), None);
+    }
+}