merge: integrate WaitAsync/M5-audit (parallel session) with galaxy array-write + inbound-timeout fixes

2026-06-17 09:28:15 -04:00
parent bf2f481bb4 11534089b9
commit af54c8ad11
88 changed files with 7714 additions and 169 deletions
@@ -39,10 +39,12 @@ namespace ZB.MOM.WW.ScadaBridge.AuditLog.Central;
 public sealed class AuditCentralHealthSnapshot
    : IAuditCentralHealthSnapshot,
      ICentralAuditWriteFailureCounter,
-      IAuditRedactionFailureCounter
+      IAuditRedactionFailureCounter,
+      IAuditInboundCeilingHitsCounter
 {
    private int _centralAuditWriteFailures;
    private int _auditRedactionFailure;
+    private int _auditInboundCeilingHits;
    private readonly ConcurrentDictionary<string, bool> _stalled = new();

    /// <inheritdoc/>
@@ -53,6 +55,10 @@ public sealed class AuditCentralHealthSnapshot
    public int AuditRedactionFailure =>
        Interlocked.CompareExchange(ref _auditRedactionFailure, 0, 0);

+    /// <inheritdoc/>
+    public int AuditInboundCeilingHits =>
+        Interlocked.CompareExchange(ref _auditInboundCeilingHits, 0, 0);
+
    /// <inheritdoc/>
    public IReadOnlyDictionary<string, bool> SiteAuditTelemetryStalled =>
        new Dictionary<string, bool>(_stalled);
@@ -78,4 +84,8 @@ public sealed class AuditCentralHealthSnapshot
    /// <inheritdoc/>
    void IAuditRedactionFailureCounter.Increment() =>
        Interlocked.Increment(ref _auditRedactionFailure);
+
+    /// <inheritdoc/>
+    void IAuditInboundCeilingHitsCounter.Increment() =>
+        Interlocked.Increment(ref _auditInboundCeilingHits);
 }
@@ -167,6 +167,9 @@ public class AuditLogPurgeActor : ReceiveActor

        if (boundaries.Count == 0)
        {
+            // No whole-month partitions are eligible, but per-channel overrides may
+            // still expire rows earlier than the global window — run them below.
+            await RunPerChannelOverridesAsync(repository).ConfigureAwait(false);
            return;
        }

@@ -202,6 +205,80 @@ public class AuditLogPurgeActor : ReceiveActor
                    sw.ElapsedMilliseconds);
            }
        }
+
+        // M5.5 (T3): after the channel-blind global partition switch-out, apply any
+        // per-channel retention overrides that are SHORTER than the global window via
+        // a bounded, batched row DELETE on the same maintenance path. The global
+        // switch-out has already dropped whole months older than RetentionDays; these
+        // deletes only ever expire rows EARLIER than that, so they run last and are a
+        // strict tightening.
+        await RunPerChannelOverridesAsync(repository).ConfigureAwait(false);
+    }
+
+    /// <summary>
+    /// M5.5 (T3): runs each per-channel retention override whose window is strictly
+    /// shorter than the global <see cref="AuditLogOptions.RetentionDays"/>, deleting
+    /// rows of that channel older than the channel-specific threshold via a bounded,
+    /// batched maintenance-path DELETE. Each channel runs inside its own try/catch so
+    /// one bad channel does not abandon the others on the same tick, mirroring the
+    /// per-boundary error isolation of the partition switch-out loop.
+    /// </summary>
+    /// <param name="repository">The repository resolved for this tick's DI scope.</param>
+    private async Task RunPerChannelOverridesAsync(IAuditLogRepository repository)
+    {
+        var overrides = _auditOptions.PerChannelRetentionDays;
+        if (overrides is null || overrides.Count == 0)
+        {
+            return;
+        }
+
+        var globalDays = _auditOptions.RetentionDays;
+
+        foreach (var (channel, days) in overrides)
+        {
+            // Only act when the per-channel window is strictly shorter than the global
+            // one. Equal/longer windows are already covered by the global partition
+            // switch-out, so a row DELETE would be redundant work (and a longer window
+            // is meaningless — the partition is dropped on the global schedule).
+            if (days >= globalDays)
+            {
+                continue;
+            }
+
+            var channelThreshold = DateTime.UtcNow - TimeSpan.FromDays(days);
+            var sw = Stopwatch.StartNew();
+            try
+            {
+                var rowsDeleted = await repository
+                    .PurgeChannelOlderThanAsync(channel, channelThreshold, _purgeOptions.ChannelPurgeBatchSize)
+                    .ConfigureAwait(false);
+                sw.Stop();
+
+                if (rowsDeleted > 0)
+                {
+                    _logger.LogInformation(
+                        "Purged {RowsDeleted} AuditLog rows for channel {Channel} older than {Threshold:o} " +
+                        "(per-channel override {Days}d < global {GlobalDays}d) in {DurationMs} ms.",
+                        rowsDeleted,
+                        channel,
+                        channelThreshold,
+                        days,
+                        globalDays,
+                        sw.ElapsedMilliseconds);
+                }
+            }
+            catch (Exception ex)
+            {
+                sw.Stop();
+                _logger.LogError(
+                    ex,
+                    "Failed to apply per-channel retention override for channel {Channel} " +
+                    "({Days}d); other channels continue. Elapsed {DurationMs} ms.",
+                    channel,
+                    days,
+                    sw.ElapsedMilliseconds);
+            }
+        }
    }

    /// <summary>Self-tick triggering a purge pass across all eligible partitions.</summary>
@@ -28,6 +28,24 @@ public sealed class AuditLogPurgeOptions
    /// <summary>Period of the purge tick in hours (default 24).</summary>
    public int IntervalHours { get; set; } = 24;

+    /// <summary>
+    /// M5.5 (T3): batch size for the per-channel retention-override row DELETE
+    /// (<see cref="ZB.MOM.WW.ScadaBridge.Commons.Interfaces.Repositories.IAuditLogRepository.PurgeChannelOlderThanAsync"/>).
+    /// Each <c>DELETE TOP (@batch)</c> caps the transaction-log and lock footprint
+    /// per statement; the repository loops batches until no rows remain. Default
+    /// 5000 keeps individual deletes short on a busy central DB while still draining
+    /// a large backlog within a tick. Clamped to a sane minimum in
+    /// <see cref="ChannelPurgeBatchSize"/>.
+    /// </summary>
+    public int ChannelPurgeBatchSizeConfigured { get; set; } = 5000;
+
+    /// <summary>
+    /// Resolves the effective per-channel purge batch size, clamped to at least 1 so
+    /// a misconfigured <c>0</c>/negative value cannot make the repository's DELETE
+    /// loop spin or throw.
+    /// </summary>
+    public int ChannelPurgeBatchSize => ChannelPurgeBatchSizeConfigured < 1 ? 1 : ChannelPurgeBatchSizeConfigured;
+
    /// <summary>
    /// Test-only override for finer control over the tick cadence than
    /// whole-hour resolution allows. When non-null, takes precedence over
@@ -50,6 +50,17 @@ public interface IAuditCentralHealthSnapshot
    /// </summary>
    int AuditRedactionFailure { get; }

+    /// <summary>
+    /// Count of inbound request/response body truncations at the
+    /// <see cref="ZB.MOM.WW.ScadaBridge.AuditLog.Configuration.AuditLogOptions.InboundMaxBytes"/>
+    /// ceiling since process start. Incremented by
+    /// <see cref="ZB.MOM.WW.ScadaBridge.InboundAPI.Middleware.AuditWriteMiddleware"/>
+    /// whenever either the request or response body exceeds the cap and is
+    /// truncated in the audit copy. A sustained non-zero count can indicate
+    /// callers sending unexpectedly large bodies.
+    /// </summary>
+    int AuditInboundCeilingHits { get; }
+
    /// <summary>
    /// Per-site latched stalled state: <c>true</c> when the
    /// <see cref="SiteAuditReconciliationActor"/> has observed two
@@ -0,0 +1,24 @@
+namespace ZB.MOM.WW.ScadaBridge.AuditLog.Central;
+
+/// <summary>
+/// Audit Log (#23) M5.3 (T7) counter sink incremented by
+/// <see cref="ZB.MOM.WW.ScadaBridge.InboundAPI.Middleware.AuditWriteMiddleware"/>
+/// whenever an inbound request or response body is truncated at the
+/// <see cref="ZB.MOM.WW.ScadaBridge.AuditLog.Configuration.AuditLogOptions.InboundMaxBytes"/>
+/// ceiling. Mirrors the <see cref="ICentralAuditWriteFailureCounter"/> shape:
+/// one-method, NoOp default, must-never-abort-the-user-facing-action invariant.
+/// </summary>
+/// <remarks>
+/// A ceiling hit is a normal operational event (the caller sent a large
+/// body) rather than a failure, but surfacing a cumulative count lets
+/// operators detect over-size callers early. The
+/// <see cref="AuditCentralHealthSnapshot"/> production implementation
+/// accumulates the count via an <c>Interlocked</c> field alongside
+/// <see cref="ICentralAuditWriteFailureCounter"/> and
+/// <see cref="ZB.MOM.WW.ScadaBridge.AuditLog.Payload.IAuditRedactionFailureCounter"/>.
+/// </remarks>
+public interface IAuditInboundCeilingHitsCounter
+{
+    /// <summary>Increment the inbound body-ceiling hit counter by one.</summary>
+    void Increment();
+}
@@ -0,0 +1,13 @@
+namespace ZB.MOM.WW.ScadaBridge.AuditLog.Central;
+
+/// <summary>
+/// Default <see cref="IAuditInboundCeilingHitsCounter"/> binding used when
+/// the central health snapshot is not wired (e.g. site composition roots,
+/// test harnesses that have no health dashboard). All increments are silently
+/// dropped — correct for environments that have no audit KPI surface.
+/// </summary>
+public sealed class NoOpAuditInboundCeilingHitsCounter : IAuditInboundCeilingHitsCounter
+{
+    /// <inheritdoc/>
+    public void Increment() { }
+}
@@ -37,6 +37,33 @@ public sealed class AuditLogOptions
    /// <summary>Central retention window in days (default 365, range [30, 3650]).</summary>
    public int RetentionDays { get; set; } = 365;

+    /// <summary>
+    /// M5.5 (T3) per-channel retention overrides, keyed by the canonical channel name
+    /// (the <see cref="AuditChannel"/> enum name — e.g. <c>ApiOutbound</c>,
+    /// <c>DbOutbound</c>, <c>Notification</c>, <c>ApiInbound</c>). The value is a
+    /// retention window in days that MUST be SHORTER than or equal to the global
+    /// <see cref="RetentionDays"/>.
+    /// </summary>
+    /// <remarks>
+    /// <para>
+    /// The global <see cref="RetentionDays"/> window is enforced by month-partition
+    /// switch-out, which is channel-blind: it can only drop a whole month once every
+    /// row in it is older than the global window. A per-channel override therefore
+    /// can only ever expire rows EARLIER than the global purge would — never later
+    /// (a longer per-channel window is meaningless because the partition switch-out
+    /// would already have dropped the month). Overrides shorter than the global window
+    /// are honoured by the purge actor as a bounded, batched row DELETE on the
+    /// maintenance path (see <c>AuditLogPurgeActor</c>); the append-only writer/ingest
+    /// role is unaffected.
+    /// </para>
+    /// <para>
+    /// Each value is validated to be in <c>[30, RetentionDays]</c> by
+    /// <c>AuditLogOptionsValidator</c>; keys that are not recognized
+    /// <see cref="AuditChannel"/> names are rejected.
+    /// </para>
+    /// </remarks>
+    public Dictionary<string, int> PerChannelRetentionDays { get; set; } = new();
+
    /// <summary>
    /// Per-body byte ceiling applied to <see cref="AuditEvent.RequestSummary"/> and
    /// <see cref="AuditEvent.ResponseSummary"/> for <see cref="AuditChannel.ApiInbound"/> rows
@@ -1,4 +1,5 @@
 using ZB.MOM.WW.Configuration;
+using ZB.MOM.WW.ScadaBridge.Commons.Types.Enums;

 namespace ZB.MOM.WW.ScadaBridge.AuditLog.Configuration;

@@ -52,5 +53,27 @@ public sealed class AuditLogOptionsValidator : OptionsValidatorBase<AuditLogOpti
            !(options.InboundMaxBytes < MinInboundMaxBytes || options.InboundMaxBytes > MaxInboundMaxBytes),
            $"AuditLog:{nameof(AuditLogOptions.InboundMaxBytes)} ({options.InboundMaxBytes}) " +
            $"must be in [{MinInboundMaxBytes}, {MaxInboundMaxBytes}] bytes.");
+
+        // M5.5 (T3): per-channel retention overrides. Each entry must be keyed by a
+        // recognized AuditChannel name and carry a window in [MinRetentionDays,
+        // RetentionDays] — i.e. SHORTER than or equal to the global window. A longer
+        // per-channel window is meaningless under month-partition switch-out (governed
+        // by the global window), so it is rejected rather than silently ignored.
+        foreach (var (channelKey, days) in options.PerChannelRetentionDays)
+        {
+            builder.RequireThat(
+                Enum.TryParse<AuditChannel>(channelKey, ignoreCase: false, out _),
+                $"AuditLog:{nameof(AuditLogOptions.PerChannelRetentionDays)} key '{channelKey}' " +
+                $"is not a recognized channel name. Valid keys: {string.Join(", ", Enum.GetNames<AuditChannel>())}.");
+
+            // Valid when days is within [MinRetentionDays, RetentionDays] inclusive.
+            // The lower bound matches the global RetentionDays floor; the upper bound
+            // is the configured global window (longer is meaningless — see remarks).
+            builder.RequireThat(
+                !(days < MinRetentionDays || days > options.RetentionDays),
+                $"AuditLog:{nameof(AuditLogOptions.PerChannelRetentionDays)}['{channelKey}'] ({days}) " +
+                $"must be in [{MinRetentionDays}, {nameof(AuditLogOptions.RetentionDays)}={options.RetentionDays}] days " +
+                "— a per-channel window must be shorter than or equal to the global retention window.");
+        }
    }
 }
@@ -25,4 +25,15 @@ public sealed class PerTargetRedactionOverride
    /// rows.
    /// </summary>
    public string? RedactSqlParamsMatching { get; set; }
+
+    /// <summary>
+    /// When <c>true</c>, the inbound API audit row for this target records
+    /// request/response headers and metadata (status, duration, actor, etc.)
+    /// but the request and response body strings are omitted
+    /// (<c>RequestSummary</c> / <c>ResponseSummary</c> are left null). The
+    /// audit row itself is always emitted — only the body content is suppressed.
+    /// Null (the default, equivalent to <c>false</c>) means body capture
+    /// proceeds normally up to <see cref="AuditLogOptions.InboundMaxBytes"/>.
+    /// </summary>
+    public bool SkipBodyCapture { get; set; }
 }
@@ -200,6 +200,13 @@ public static class ServiceCollectionExtensions
        // surface on the central dashboard.
        services.TryAddSingleton<ICentralAuditWriteFailureCounter, NoOpCentralAuditWriteFailureCounter>();

+        // M5.3 (T7): inbound body-ceiling hit counter — NoOp default for
+        // site/test roots. AddAuditLogCentralMaintenance replaces this binding
+        // with the AuditCentralHealthSnapshot implementation so ceiling-hit
+        // counts surface on the central dashboard alongside write-failure and
+        // redaction-failure counters.
+        services.TryAddSingleton<IAuditInboundCeilingHitsCounter, NoOpAuditInboundCeilingHitsCounter>();
+
        // M4 Bundle B: central direct-write audit writer used by
        // NotificationOutboxActor (Bundle B) and Inbound API (Bundle C/D) to
        // emit AuditLog rows that originate ON central, not via site telemetry.
@@ -383,6 +390,12 @@ public static class ServiceCollectionExtensions
        // HealthMetricsAuditRedactionFailureCounter shape one-for-one.
        services.Replace(ServiceDescriptor.Singleton<IAuditRedactionFailureCounter,
            CentralAuditRedactionFailureCounter>());
+        // M5.3 (T7): replace the NoOp IAuditInboundCeilingHitsCounter with the
+        // AuditCentralHealthSnapshot so ceiling-hit counts surface on the
+        // central dashboard. Same singleton-forward pattern as
+        // ICentralAuditWriteFailureCounter above.
+        services.Replace(ServiceDescriptor.Singleton<IAuditInboundCeilingHitsCounter>(
+            sp => sp.GetRequiredService<AuditCentralHealthSnapshot>()));

        return services;
    }
@@ -0,0 +1,113 @@
+using System.Text;
+using System.Text.Json;
+
+namespace ZB.MOM.WW.ScadaBridge.CLI.Commands;
+
+/// <summary>
+/// Arguments for an <c>audit backfill-source-node</c> invocation.
+/// </summary>
+public sealed class AuditBackfillSourceNodeArgs
+{
+    /// <summary>
+    /// Value written into <c>SourceNode</c> for NULL rows (default <c>"unknown"</c>).
+    /// </summary>
+    public string Sentinel { get; set; } = "unknown";
+
+    /// <summary>
+    /// Only rows with <c>OccurredAtUtc</c> strictly before this UTC datetime are
+    /// eligible. Required — must be an ISO-8601 UTC datetime.
+    /// </summary>
+    public string Before { get; set; } = string.Empty;
+
+    /// <summary>
+    /// Maximum rows updated per batch (default 5000). Caps the per-transaction
+    /// log footprint; the loop repeats until no rows remain.
+    /// </summary>
+    public int BatchSize { get; set; } = 5000;
+}
+
+/// <summary>
+/// Pure helpers for the <c>audit backfill-source-node</c> subcommand (Audit Log
+/// #23 M5.6 T5). Builds the request body, POSTs to
+/// <c>/api/audit/backfill-source-node</c>, and renders the result. Kept separate
+/// from the command wiring so each piece is unit-testable without standing up the
+/// command tree.
+/// </summary>
+public static class AuditBackfillHelpers
+{
+    private static readonly JsonSerializerOptions JsonWriteOptions = new()
+    {
+        WriteIndented = true,
+    };
+
+    /// <summary>
+    /// Builds the JSON request body for <c>POST /api/audit/backfill-source-node</c>.
+    /// </summary>
+    /// <param name="args">The backfill arguments.</param>
+    /// <returns>A JSON string suitable for the request body.</returns>
+    public static string BuildRequestBody(AuditBackfillSourceNodeArgs args)
+    {
+        var obj = new
+        {
+            sentinel = args.Sentinel,
+            before = args.Before,
+            batchSize = args.BatchSize,
+        };
+        return JsonSerializer.Serialize(obj);
+    }
+
+    /// <summary>
+    /// Executes the backfill: POSTs <c>/api/audit/backfill-source-node</c> and
+    /// prints the result. Returns the process exit code (0 = success,
+    /// 1 = error, 2 = authorization failure).
+    /// </summary>
+    /// <param name="client">The management HTTP client.</param>
+    /// <param name="args">The backfill arguments.</param>
+    /// <param name="output">The output writer for results.</param>
+    /// <returns>A task that resolves to the process exit code.</returns>
+    public static async Task<int> RunBackfillAsync(
+        ManagementHttpClient client,
+        AuditBackfillSourceNodeArgs args,
+        TextWriter output)
+    {
+        var body = BuildRequestBody(args);
+        var response = await client.SendPostAsync(
+            "api/audit/backfill-source-node", body, TimeSpan.FromMinutes(10));
+
+        if (response.JsonData == null)
+        {
+            OutputFormatter.WriteError(
+                response.Error ?? "Backfill request failed.", response.ErrorCode ?? "ERROR");
+            return CommandHelpers.IsAuthorizationFailure(response) ? 2 : 1;
+        }
+
+        // Parse and display the result.
+        try
+        {
+            using var doc = JsonDocument.Parse(response.JsonData);
+            var root = doc.RootElement;
+            var rowsUpdated = root.TryGetProperty("rowsUpdated", out var r)
+                ? r.GetInt64()
+                : 0L;
+            var sentinel = root.TryGetProperty("sentinel", out var s)
+                ? s.GetString() ?? args.Sentinel
+                : args.Sentinel;
+            var before = root.TryGetProperty("before", out var b)
+                ? b.GetString() ?? args.Before
+                : args.Before;
+
+            output.WriteLine($"SourceNode backfill complete.");
+            output.WriteLine($"  rows updated : {rowsUpdated}");
+            output.WriteLine($"  sentinel     : {sentinel}");
+            output.WriteLine($"  before       : {before}");
+        }
+        catch (JsonException)
+        {
+            // Server returned success but non-JSON body — not expected; print raw.
+            output.WriteLine(response.JsonData);
+        }
+
+        output.Flush();
+        return 0;
+    }
+}
@@ -6,13 +6,15 @@ namespace ZB.MOM.WW.ScadaBridge.CLI.Commands;
 /// <summary>
 /// The <c>scadabridge audit</c> command group (Audit Log #23 M8). Provides read access to
 /// the centralized append-only Audit Log via the Bundle B REST endpoints
-/// (<c>GET /api/audit/query</c>, <c>GET /api/audit/export</c>), plus a v1 no-op
-/// <c>verify-chain</c> placeholder for the deferred hash-chain tamper-evidence feature.
+/// (<c>GET /api/audit/query</c>, <c>GET /api/audit/export</c>,
+/// <c>GET /api/audit/tree</c>), plus a v1 no-op <c>verify-chain</c> placeholder
+/// for the deferred hash-chain tamper-evidence feature.
 /// </summary>
 public static class AuditCommands
 {
    /// <summary>
-    /// Builds the <c>audit</c> command group with query, export, and verify-chain sub-commands.
+    /// Builds the <c>audit</c> command group with query, export, tree, and verify-chain
+    /// sub-commands.
    /// </summary>
    /// <param name="urlOption">Global <c>--url</c> option for the management API endpoint.</param>
    /// <param name="formatOption">Global <c>--format</c> option for output format.</param>
@@ -25,7 +27,9 @@ public static class AuditCommands

        command.Add(BuildQuery(urlOption, formatOption, usernameOption, passwordOption));
        command.Add(BuildExport(urlOption, formatOption, usernameOption, passwordOption));
+        command.Add(BuildTree(urlOption, formatOption, usernameOption, passwordOption));
        command.Add(BuildVerifyChain(urlOption, formatOption, usernameOption, passwordOption));
+        command.Add(BuildBackfillSourceNode(urlOption, formatOption, usernameOption, passwordOption));

        return command;
    }
@@ -224,6 +228,44 @@ public static class AuditCommands
        return cmd;
    }

+    private static Command BuildTree(Option<string> urlOption, Option<string> formatOption, Option<string> usernameOption, Option<string> passwordOption)
+    {
+        var executionIdOption = new Option<string>("--execution-id")
+        {
+            Description = "Execution ID (GUID) to look up — may be any node in the chain",
+            Required = true,
+        };
+
+        var cmd = new Command("tree") { Description = "Display the full execution-chain tree for an audit execution" };
+        cmd.Add(executionIdOption);
+
+        cmd.SetAction(async (ParseResult result) =>
+        {
+            var connection = AuditCommandHelpers.ResolveConnection(result, urlOption, usernameOption, passwordOption);
+            if (connection.Error != null)
+            {
+                OutputFormatter.WriteError(connection.Error, connection.ErrorCode!);
+                return 1;
+            }
+
+            var rawId = result.GetValue(executionIdOption);
+            if (!Guid.TryParse(rawId, out var executionId))
+            {
+                OutputFormatter.WriteError(
+                    $"Invalid execution ID '{rawId}'. Expected a GUID (e.g. 11111111-1111-1111-1111-111111111111).",
+                    "INVALID_ARGUMENT");
+                return 1;
+            }
+
+            var format = AuditCommandHelpers.ResolveFormat(result, formatOption);
+
+            using var client = new ManagementHttpClient(connection.Url!, connection.Username!, connection.Password!);
+            return await AuditTreeHelpers.RunTreeAsync(client, executionId, format, Console.Out);
+        });
+
+        return cmd;
+    }
+
    private static Command BuildVerifyChain(Option<string> urlOption, Option<string> formatOption, Option<string> usernameOption, Option<string> passwordOption)
    {
        var monthOption = new Option<string>("--month") { Description = "Month to verify (YYYY-MM)", Required = true };
@@ -247,4 +289,76 @@ public static class AuditCommands
        });
        return cmd;
    }
+
+    /// <summary>
+    /// Builds the <c>audit backfill-source-node</c> sub-command (Audit Log #23 M5.6 T5).
+    /// Sets <c>SourceNode</c> on historical pre-feature rows whose <c>SourceNode IS NULL</c>
+    /// and <c>OccurredAtUtc</c> is older than <c>--before</c>, in batches. Admin-only.
+    /// </summary>
+    private static Command BuildBackfillSourceNode(Option<string> urlOption, Option<string> formatOption, Option<string> usernameOption, Option<string> passwordOption)
+    {
+        var sentinelOption = new Option<string>("--sentinel")
+        {
+            Description = "Value to write for pre-feature rows whose node-of-origin is unknown (default: unknown)",
+        };
+        sentinelOption.DefaultValueFactory = _ => "unknown";
+
+        var beforeOption = new Option<string>("--before")
+        {
+            Description = "ISO-8601 UTC datetime; only rows older than this date are eligible (required)",
+            Required = true,
+        };
+
+        var batchOption = new Option<int>("--batch")
+        {
+            Description = "Max rows updated per batch (default: 5000)",
+        };
+        batchOption.DefaultValueFactory = _ => 5000;
+
+        var cmd = new Command("backfill-source-node")
+        {
+            Description = "Set SourceNode to a sentinel value on pre-feature rows where it is NULL (admin-only, maintenance path)",
+        };
+        cmd.Add(sentinelOption);
+        cmd.Add(beforeOption);
+        cmd.Add(batchOption);
+
+        cmd.SetAction(async (ParseResult result) =>
+        {
+            var connection = AuditCommandHelpers.ResolveConnection(result, urlOption, usernameOption, passwordOption);
+            if (connection.Error != null)
+            {
+                OutputFormatter.WriteError(connection.Error, connection.ErrorCode!);
+                return 1;
+            }
+
+            var sentinel = result.GetValue(sentinelOption) ?? "unknown";
+            var before = result.GetValue(beforeOption)!;
+            var batch = result.GetValue(batchOption);
+
+            if (string.IsNullOrWhiteSpace(sentinel))
+            {
+                OutputFormatter.WriteError("--sentinel must be a non-empty string.", "INVALID_ARGUMENT");
+                return 1;
+            }
+
+            if (batch <= 0)
+            {
+                OutputFormatter.WriteError("--batch must be > 0.", "INVALID_ARGUMENT");
+                return 1;
+            }
+
+            var args = new AuditBackfillSourceNodeArgs
+            {
+                Sentinel = sentinel,
+                Before = before,
+                BatchSize = batch,
+            };
+
+            using var client = new ManagementHttpClient(connection.Url!, connection.Username!, connection.Password!);
+            return await AuditBackfillHelpers.RunBackfillAsync(client, args, Console.Out);
+        });
+
+        return cmd;
+    }
 }
@@ -0,0 +1,208 @@
+using System.Text;
+using System.Text.Json;
+
+namespace ZB.MOM.WW.ScadaBridge.CLI.Commands;
+
+/// <summary>
+/// Arguments for an <c>audit tree</c> invocation.
+/// </summary>
+public sealed class AuditTreeArgs
+{
+    /// <summary>
+    /// The execution ID (GUID) to look up. May be any node in the chain — the
+    /// server walks to the root and returns the full tree.
+    /// </summary>
+    public string ExecutionId { get; set; } = string.Empty;
+}
+
+/// <summary>
+/// Represents one execution node as returned by <c>GET /api/audit/tree</c>.
+/// Property names match the server's camelCase JSON serialisation of
+/// <c>ExecutionTreeNode</c>.
+/// </summary>
+internal sealed class AuditTreeNodeDto
+{
+    public Guid ExecutionId { get; init; }
+    public Guid? ParentExecutionId { get; init; }
+    public int RowCount { get; init; }
+    public string[] Channels { get; init; } = Array.Empty<string>();
+    public string[] Statuses { get; init; } = Array.Empty<string>();
+    public string? SourceSiteId { get; init; }
+    public string? SourceInstanceId { get; init; }
+    public DateTime? FirstOccurredAtUtc { get; init; }
+    public DateTime? LastOccurredAtUtc { get; init; }
+}
+
+/// <summary>
+/// Pure helpers for the <c>audit tree</c> subcommand: builds the query string,
+/// calls <c>GET /api/audit/tree</c>, and renders the result as either an
+/// indented ASCII tree (table format) or raw JSON. Kept separate from the
+/// command wiring so each piece is unit-testable without standing up the
+/// command tree.
+/// </summary>
+public static class AuditTreeHelpers
+{
+    private static readonly JsonSerializerOptions JsonReadOptions = new()
+    {
+        PropertyNameCaseInsensitive = true,
+    };
+
+    private static readonly JsonSerializerOptions JsonWriteOptions = new()
+    {
+        PropertyNamingPolicy = JsonNamingPolicy.CamelCase,
+        WriteIndented = true,
+    };
+
+    /// <summary>
+    /// Builds the query string for <c>GET /api/audit/tree</c>.
+    /// </summary>
+    /// <param name="executionId">The execution ID GUID.</param>
+    /// <returns>A relative path + query string ready to append to the base URL.</returns>
+    public static string BuildUrl(Guid executionId)
+        => $"api/audit/tree?executionId={executionId:D}";
+
+    /// <summary>
+    /// Executes the tree lookup: GETs <c>/api/audit/tree</c> and renders the result
+    /// in the requested format. Returns the process exit code (0 = success,
+    /// 1 = error, 2 = authorization failure).
+    /// </summary>
+    /// <param name="client">The management HTTP client.</param>
+    /// <param name="executionId">The execution ID to look up.</param>
+    /// <param name="format">"table" (default) or "json".</param>
+    /// <param name="output">The output writer for results.</param>
+    /// <returns>A task that resolves to the process exit code.</returns>
+    public static async Task<int> RunTreeAsync(
+        ManagementHttpClient client,
+        Guid executionId,
+        string format,
+        TextWriter output)
+    {
+        var url = BuildUrl(executionId);
+        var response = await client.SendGetAsync(url, TimeSpan.FromSeconds(30));
+
+        if (response.JsonData == null)
+        {
+            OutputFormatter.WriteError(
+                response.Error ?? "Audit tree request failed.", response.ErrorCode ?? "ERROR");
+            return CommandHelpers.IsAuthorizationFailure(response) ? 2 : 1;
+        }
+
+        var nodes = ParseNodes(response.JsonData);
+
+        if (format == "json")
+        {
+            WriteJson(nodes, output);
+        }
+        else
+        {
+            WriteTable(nodes, executionId, output);
+        }
+
+        output.Flush();
+        return 0;
+    }
+
+    /// <summary>
+    /// Parses the JSON array from the server into an array of
+    /// <see cref="AuditTreeNodeDto"/>.
+    /// </summary>
+    /// <param name="json">The raw JSON response body.</param>
+    /// <returns>An array of deserialized tree nodes (empty on parse failure).</returns>
+    internal static AuditTreeNodeDto[] ParseNodes(string json)
+    {
+        try
+        {
+            return JsonSerializer.Deserialize<AuditTreeNodeDto[]>(json, JsonReadOptions)
+                   ?? Array.Empty<AuditTreeNodeDto>();
+        }
+        catch (JsonException)
+        {
+            return Array.Empty<AuditTreeNodeDto>();
+        }
+    }
+
+    /// <summary>
+    /// Renders the nodes as pretty-printed JSON to <paramref name="output"/>.
+    /// </summary>
+    internal static void WriteJson(AuditTreeNodeDto[] nodes, TextWriter output)
+    {
+        output.WriteLine(JsonSerializer.Serialize(nodes, JsonWriteOptions));
+    }
+
+    /// <summary>
+    /// Renders the nodes as an indented ASCII tree. The root node (null
+    /// <c>ParentExecutionId</c>) is printed first; each child is indented
+    /// two spaces per depth level. The queried/entry-point node is marked
+    /// with <c> [*]</c>.
+    /// </summary>
+    internal static void WriteTable(
+        AuditTreeNodeDto[] nodes,
+        Guid queriedExecutionId,
+        TextWriter output)
+    {
+        if (nodes.Length == 0)
+        {
+            output.WriteLine("(no execution tree found)");
+            return;
+        }
+
+        // Build a parent → children lookup (keyed by non-null parent Guid).
+        // Nodes whose ParentExecutionId is null are roots and are not placed in
+        // the lookup; they are identified separately below.
+        var childrenOf = new Dictionary<Guid, List<AuditTreeNodeDto>>();
+        foreach (var node in nodes)
+        {
+            if (node.ParentExecutionId is { } parentId)
+            {
+                if (!childrenOf.ContainsKey(parentId))
+                    childrenOf[parentId] = new List<AuditTreeNodeDto>();
+                childrenOf[parentId].Add(node);
+            }
+        }
+
+        // Identify roots: nodes whose ParentExecutionId is null, or whose parent
+        // is not present in the node set (stub-root case).
+        var nodeIds = new HashSet<Guid>(nodes.Select(n => n.ExecutionId));
+        var roots = nodes
+            .Where(n => n.ParentExecutionId == null || !nodeIds.Contains(n.ParentExecutionId.Value))
+            .ToList();
+
+        // Render depth-first.
+        var sb = new StringBuilder();
+        foreach (var root in roots)
+        {
+            RenderNode(root, depth: 0, childrenOf, queriedExecutionId, sb);
+        }
+
+        output.Write(sb.ToString());
+    }
+
+    private static void RenderNode(
+        AuditTreeNodeDto node,
+        int depth,
+        Dictionary<Guid, List<AuditTreeNodeDto>> childrenOf,
+        Guid queriedExecutionId,
+        StringBuilder sb)
+    {
+        var indent = new string(' ', depth * 2);
+        var marker = node.ExecutionId == queriedExecutionId ? " [*]" : string.Empty;
+        var channels = node.Channels.Length > 0 ? string.Join(",", node.Channels) : "-";
+        var statuses = node.Statuses.Length > 0 ? string.Join(",", node.Statuses) : "-";
+        var site = node.SourceSiteId ?? "-";
+        var instance = node.SourceInstanceId ?? "-";
+        var first = node.FirstOccurredAtUtc.HasValue
+            ? node.FirstOccurredAtUtc.Value.ToString("yyyy-MM-ddTHH:mm:ssZ")
+            : "-";
+
+        sb.AppendLine(
+            $"{indent}{node.ExecutionId:D}{marker}  rows={node.RowCount}  channels=[{channels}]  statuses=[{statuses}]  site={site}  instance={instance}  first={first}");
+
+        if (childrenOf.TryGetValue(node.ExecutionId, out var children))
+        {
+            foreach (var child in children)
+            {
+                RenderNode(child, depth + 1, childrenOf, queriedExecutionId, sb);
+            }
+        }
+    }
+}
@@ -142,6 +142,60 @@ public class ManagementHttpClient : IDisposable
        return new ManagementResponse((int)httpResponse.StatusCode, null, error, code);
    }

+    /// <summary>
+    /// Issues a plain HTTP <c>POST</c> against a REST endpoint (e.g. the audit
+    /// maintenance endpoints) with a JSON body and returns the response. Unlike
+    /// <see cref="SendCommandAsync"/>, this does not wrap the call in the
+    /// <c>POST /management</c> command envelope — these are plain REST resources.
+    /// Authentication (HTTP Basic) and the base address are shared.
+    /// </summary>
+    /// <param name="relativePath">Path relative to the base URL.</param>
+    /// <param name="body">The JSON body to send, or <c>null</c> for an empty body.</param>
+    /// <param name="timeout">The request timeout.</param>
+    /// <returns>A management response containing status and data.</returns>
+    public async Task<ManagementResponse> SendPostAsync(string relativePath, string? body, TimeSpan timeout)
+    {
+        using var cts = new CancellationTokenSource(timeout);
+
+        var content = new StringContent(body ?? "{}", Encoding.UTF8, "application/json");
+
+        HttpResponseMessage httpResponse;
+        try
+        {
+            httpResponse = await _httpClient.PostAsync(relativePath, content, cts.Token);
+        }
+        catch (TaskCanceledException)
+        {
+            return new ManagementResponse(504, null, "Request timed out.", "TIMEOUT");
+        }
+        catch (HttpRequestException ex)
+        {
+            return new ManagementResponse(0, null, $"Connection failed: {ex.Message}", "CONNECTION_FAILED");
+        }
+
+        var responseBody = await httpResponse.Content.ReadAsStringAsync(cts.Token);
+
+        if (httpResponse.IsSuccessStatusCode)
+        {
+            return new ManagementResponse((int)httpResponse.StatusCode, responseBody, null, null);
+        }
+
+        string? error = null;
+        string? code = null;
+        try
+        {
+            using var doc = JsonDocument.Parse(responseBody);
+            error = doc.RootElement.TryGetProperty("error", out var e) ? e.GetString() : responseBody;
+            code = doc.RootElement.TryGetProperty("code", out var c) ? c.GetString() : null;
+        }
+        catch
+        {
+            error = responseBody;
+        }
+
+        return new ManagementResponse((int)httpResponse.StatusCode, null, error, code);
+    }
+
    /// <summary>
    /// Issues a plain HTTP <c>GET</c> and returns the raw <see cref="HttpResponseMessage"/>
    /// so the caller can stream the response body without buffering it in memory — used
@@ -1269,15 +1269,18 @@ script-trust-boundary action: outbound API calls (sync + cached), outbound DB
 operations (sync + cached), notifications, and inbound API calls. This is distinct
 from the configuration-change audit trail exposed by [`audit-config`](#audit-config--configuration-change-audit-log).

-The subcommands map directly onto the `GET /api/audit/query` and
-`GET /api/audit/export` management endpoints. Filters and the result columns mirror
-the Central UI **Audit** page, so a CLI query and a UI query with the same filters
-return the same rows — CLI ↔ UI filter parity is intentional.
+The subcommands map directly onto the `GET /api/audit/query`,
+`GET /api/audit/export`, `GET /api/audit/tree`, and
+`POST /api/audit/backfill-source-node` management endpoints. Filters and the
+result columns mirror the Central UI **Audit** page, so a CLI query and a UI
+query with the same filters return the same rows — CLI ↔ UI filter parity is
+intentional.

-**Permissions.** Querying requires the `OperationalAudit` permission (roles `Admin`,
-`Audit`, or `AuditReadOnly`). Exporting requires the stricter `AuditExport` permission
-(roles `Admin` or `Audit`) — read access does *not* imply export access. A request
-without the required role returns exit code `2`.
+**Permissions.** Querying and tree traversal require the `OperationalAudit`
+permission (roles `Admin`, `Audit`, or `AuditReadOnly`). Exporting requires the
+stricter `AuditExport` permission (roles `Admin` or `Audit`) — read access does
+*not* imply export access. The `backfill-source-node` maintenance command requires
+the `Admin` role. A request without the required role returns exit code `2`.

 #### `audit query`

@@ -1342,6 +1345,46 @@ scadabridge --url <url> audit export --since <time> --until <time> --format <fmt
 > Implemented` — Parquet archival is deferred to v1.x (see `Component-AuditLog.md`).
 > Use `csv` or `jsonl`.

+#### `audit tree` (M5.3 T8)
+
+Display the full execution-chain tree for a given execution ID. The server walks
+`ParentExecutionId` to find the root, then traverses downward to collect all
+reachable executions in the chain.
+
+```sh
+scadabridge --url <url> audit tree --execution-id <guid> [--format table|json]
+```
+
+| Option | Required | Default | Description |
+|--------|----------|---------|-------------|
+| `--execution-id` | yes | — | Any `ExecutionId` in the chain (root or child) |
+| `--format` | no | `json` | Output format: `json` (structured tree) or `table` (indented tree) |
+
+The `--execution-id` can be any node in the chain — the server resolves the root
+automatically. With `--format table` the tree is printed as an indented text
+representation. With `--format json` (the default) a structured JSON tree is
+returned, suitable for scripting. Backed by `GET /api/audit/tree?executionId=<guid>`.
+Requires `OperationalAudit` permission.
+
+#### `audit backfill-source-node` (M5.6 T5)
+
+Set `SourceNode` to a sentinel value on pre-feature rows where `SourceNode IS NULL`
+and `OccurredAtUtc` is older than `--before`. Admin-only maintenance command.
+
+```sh
+scadabridge --url <url> audit backfill-source-node --before <ISO-8601-UTC> [--sentinel <value>] [--batch <n>]
+```
+
+| Option | Required | Default | Description |
+|--------|----------|---------|-------------|
+| `--before` | yes | — | ISO-8601 UTC datetime; only rows older than this date are eligible |
+| `--sentinel` | no | `unknown` | Value to write (must be non-empty) |
+| `--batch` | no | `5000` | Max rows updated per batch; controls transaction size |
+
+The command is idempotent — running it multiple times converges (only rows where
+`SourceNode IS NULL` are eligible; already-set rows are untouched). Backed by
+`POST /api/audit/backfill-source-node`. Requires `Admin` role.
+
 #### `audit verify-chain`

 Verify the audit log hash chain for a given month.
@@ -1354,11 +1397,11 @@ scadabridge --url <url> audit verify-chain --month <YYYY-MM>
 |--------|----------|---------|-------------|
 | `--month` | yes | — | Month to verify, `YYYY-MM` (e.g. `2026-05`) |

-> **v1 no-op.** Hash-chain tamper-evidence is not enabled in this release. The
-> subcommand validates the `--month` argument and prints a notice pointing at the
-> v1.x roadmap in `Component-AuditLog.md`; it exits `0` without contacting the server.
-> The command exists now so scripts and operator habits do not need to change when
-> tamper-evidence ships.
+> **v1 no-op.** Hash-chain tamper-evidence is not enabled in this release (T1
+> deferred to v1.x). The subcommand validates the `--month` argument and prints a
+> notice pointing at the v1.x roadmap in `Component-AuditLog.md`; it exits `0`
+> without contacting the server. The command exists now so scripts and operator
+> habits do not need to change when tamper-evidence ships.

 ---

@@ -58,3 +58,31 @@
 {
    <div class="text-muted small mb-3">Site Call KPIs unavailable: @ErrorMessage</div>
 }
+@* ── Per-node stuck/parked sub-table (T6: M5.2 per-node stuck-count KPIs) ── *@
+@if (HasNodeBreakdown)
+{
+    <div class="mb-3">
+        <div class="d-flex justify-content-between align-items-center mb-1">
+            <small class="text-muted">By node</small>
+        </div>
+        <table class="table table-sm table-borderless mb-0 site-call-kpi-node-table">
+            <thead class="table-light">
+                <tr>
+                    <th class="small py-1">Node</th>
+                    <th class="text-end small py-1">Stuck</th>
+                    <th class="text-end small py-1">Parked</th>
+                </tr>
+            </thead>
+            <tbody>
+                @foreach (var n in PerNodeSnapshots!)
+                {
+                    <tr @key="n.SourceNode">
+                        <td class="small py-1"><code>@n.SourceNode</code></td>
+                        <td class="text-end font-monospace small py-1 @(n.StuckCount > 0 ? "text-warning" : "")">@n.StuckCount</td>
+                        <td class="text-end font-monospace small py-1 @(n.ParkedCount > 0 ? "text-danger" : "")">@n.ParkedCount</td>
+                    </tr>
+                }
+            </tbody>
+        </table>
+    </div>
+}
@@ -1,5 +1,6 @@
 using Microsoft.AspNetCore.Components;
 using ZB.MOM.WW.ScadaBridge.Commons.Messages.Audit;
+using ZB.MOM.WW.ScadaBridge.Commons.Types.Audit;

 namespace ZB.MOM.WW.ScadaBridge.CentralUI.Components.Health;

@@ -59,6 +60,24 @@ public partial class SiteCallKpiTiles
    /// </summary>
    [Parameter] public string? ErrorMessage { get; set; }

+    /// <summary>
+    /// Optional per-node KPI breakdown (T6: M5.2 per-node stuck-count KPIs).
+    /// When non-null and non-empty, a compact node-level stuck/parked sub-table
+    /// is rendered below the main tiles. <c>null</c> means the parent has not
+    /// loaded it yet or has opted out — the sub-table is suppressed entirely.
+    /// </summary>
+    [Parameter] public IReadOnlyList<SiteCallNodeKpiSnapshot>? PerNodeSnapshots { get; set; }
+
+    /// <summary>
+    /// True when <see cref="PerNodeSnapshots"/> is a successful query result.
+    /// Used to suppress the sub-table on a load failure.
+    /// </summary>
+    [Parameter] public bool PerNodeAvailable { get; set; }
+
+    /// <summary>Whether the per-node sub-table has data to render.</summary>
+    internal bool HasNodeBreakdown =>
+        PerNodeAvailable && PerNodeSnapshots is { Count: > 0 };
+
    // ── Buffered tile ───────────────────────────────────────────────────────

    private string BufferedDisplay =>
@@ -9,6 +9,7 @@
@using ZB.MOM.WW.ScadaBridge.HealthMonitoring
@using ZB.MOM.WW.ScadaBridge.Commons.Messages.Notification
@using ZB.MOM.WW.ScadaBridge.Commons.Messages.Audit
+@using ZB.MOM.WW.ScadaBridge.Commons.Types.Audit
@using ZB.MOM.WW.ScadaBridge.Communication
@implements IDisposable
@inject ICentralHealthAggregator HealthAggregator
@@ -65,7 +66,9 @@
       (buffered / stuck / parked). Refreshed alongside the site states. *@
    <SiteCallKpiTiles Snapshot="@_siteCallKpi"
                      IsAvailable="@_siteCallKpiAvailable"
-                      ErrorMessage="@_siteCallKpiError" />
+                      ErrorMessage="@_siteCallKpiError"
+                      PerNodeSnapshots="@_siteCallNodeKpis"
+                      PerNodeAvailable="@_siteCallNodeKpiAvailable" />

    @* Audit Log (#23) M7 Bundle E — three KPI tiles for the Audit channel
       (volume / error rate / backlog). Refreshed alongside the site states. *@
@@ -378,6 +381,12 @@
    private bool _siteCallKpiAvailable;
    private string? _siteCallKpiError;

+    // Per-node Site Call KPI breakdown (T6: M5.2 per-node stuck-count KPIs).
+    // Passed to SiteCallKpiTiles as an optional sub-table.
+    private IReadOnlyList<SiteCallNodeKpiSnapshot> _siteCallNodeKpis =
+        Array.Empty<SiteCallNodeKpiSnapshot>();
+    private bool _siteCallNodeKpiAvailable;
+
    private static bool SiteHasActiveErrors(SiteHealthState state)
    {
        var report = state.LatestReport;
@@ -415,7 +424,7 @@
    {
        _siteStates = HealthAggregator.GetAllSiteStates();
        await LoadOutboxKpis();
-        await LoadSiteCallKpis();
+        await Task.WhenAll(LoadSiteCallKpis(), LoadSiteCallNodeKpis());
        await LoadAuditKpis();
    }

@@ -474,6 +483,30 @@
        }
    }

+    // Per-node site-call KPI loader (T6: M5.2). Best-effort; a fault silently
+    // suppresses the per-node sub-table rather than degrading the dashboard.
+    private async Task LoadSiteCallNodeKpis()
+    {
+        try
+        {
+            var response = await CommunicationService.GetPerNodeSiteCallKpisAsync(
+                new PerNodeSiteCallKpiRequest(Guid.NewGuid().ToString("N")));
+            if (response.Success)
+            {
+                _siteCallNodeKpis = response.Nodes;
+                _siteCallNodeKpiAvailable = true;
+            }
+            else
+            {
+                _siteCallNodeKpiAvailable = false;
+            }
+        }
+        catch
+        {
+            _siteCallNodeKpiAvailable = false;
+        }
+    }
+
    // Tiles show the numeric KPI when available, or an em dash when the outbox
    // KPI query failed — matching how the page renders other unavailable data.
    private string OutboxTileValue(int value) =>
@@ -69,6 +69,51 @@
        </div>
    }

+    @* ── Per-node breakdown (T6: additive) ── *@
+    <h5 class="mb-2">Per-node breakdown</h5>
+    @if (_perNodeError != null)
+    {
+        <div class="alert alert-warning py-2">Per-node KPIs unavailable: @_perNodeError</div>
+    }
+    else if (_perNode.Count == 0)
+    {
+        <div class="card mb-3">
+            <div class="card-body text-center text-muted py-3">
+                <div class="small">No per-node activity (rows may have a null SourceNode).</div>
+            </div>
+        </div>
+    }
+    else
+    {
+        <div class="table-responsive mb-3">
+            <table class="table table-sm table-hover align-middle">
+                <thead class="table-light">
+                    <tr>
+                        <th>Node</th>
+                        <th class="text-end">Queue Depth</th>
+                        <th class="text-end">Stuck</th>
+                        <th class="text-end">Parked</th>
+                        <th class="text-end">Delivered (last interval)</th>
+                        <th class="text-end">Oldest Pending Age</th>
+                    </tr>
+                </thead>
+                <tbody>
+                    @foreach (var n in _perNode)
+                    {
+                        <tr @key="n.SourceNode" class="@(n.StuckCount > 0 ? "table-warning" : "")">
+                            <td><code>@n.SourceNode</code></td>
+                            <td class="text-end font-monospace">@n.QueueDepth</td>
+                            <td class="text-end font-monospace @(n.StuckCount > 0 ? "text-warning" : "")">@n.StuckCount</td>
+                            <td class="text-end font-monospace @(n.ParkedCount > 0 ? "text-danger" : "")">@n.ParkedCount</td>
+                            <td class="text-end font-monospace text-success">@n.DeliveredLastInterval</td>
+                            <td class="text-end font-monospace">@FormatAge(n.OldestPendingAge)</td>
+                        </tr>
+                    }
+                </tbody>
+            </table>
+        </div>
+    }
+
    @* ── Per-site breakdown ── *@
    <h5 class="mb-2">Per-site breakdown</h5>
    @if (_perSiteError != null)
@@ -124,6 +169,10 @@
    private IReadOnlyList<SiteNotificationKpiSnapshot> _perSite = Array.Empty<SiteNotificationKpiSnapshot>();
    private string? _perSiteError;

+    // ── Per-node (T6: M5.2 per-node stuck-count KPIs) ──
+    private IReadOnlyList<NodeNotificationKpiSnapshot> _perNode = Array.Empty<NodeNotificationKpiSnapshot>();
+    private string? _perNodeError;
+
    private bool _loading;

    protected override async Task OnInitializedAsync()
@@ -144,9 +193,9 @@
    private async Task RefreshAll()
    {
        _loading = true;
-        // Race-free despite both tasks mutating component fields: Blazor Server runs
+        // Race-free despite all tasks mutating component fields: Blazor Server runs
        // every continuation on the circuit's single-threaded synchronization context.
-        await Task.WhenAll(LoadGlobalKpis(), LoadPerSiteKpis());
+        await Task.WhenAll(LoadGlobalKpis(), LoadPerSiteKpis(), LoadPerNodeKpis());
        _loading = false;
    }

@@ -194,6 +243,28 @@
        }
    }

+    private async Task LoadPerNodeKpis()
+    {
+        try
+        {
+            var response = await CommunicationService.GetPerNodeNotificationKpisAsync(
+                new PerNodeNotificationKpiRequest(Guid.NewGuid().ToString("N")));
+            if (response.Success)
+            {
+                _perNode = response.Nodes;
+                _perNodeError = null;
+            }
+            else
+            {
+                _perNodeError = response.ErrorMessage ?? "Per-node KPI query failed.";
+            }
+        }
+        catch (Exception ex)
+        {
+            _perNodeError = $"Per-node KPI query failed: {ex.Message}";
+        }
+    }
+
    private string SiteName(string siteId) =>
        _sites.FirstOrDefault(s => s.SiteIdentifier == siteId)?.Name ?? siteId;

@@ -87,6 +87,42 @@ public interface IAuditLogRepository
    /// <returns>A task that resolves to the approximate number of rows discarded by the partition switch.</returns>
    Task<long> SwitchOutPartitionAsync(DateTime monthBoundary, CancellationToken ct = default);

+    /// <summary>
+    /// M5.5 (T3) per-channel retention override purge. Deletes <c>AuditLog</c> rows for a
+    /// single <paramref name="channel"/> (matched against the canonical
+    /// <c>Category</c> column — the bare channel name, e.g. <c>ApiOutbound</c>) whose
+    /// <c>OccurredAtUtc</c> is strictly older than <paramref name="threshold"/>, in
+    /// bounded batches of <paramref name="batchSize"/> rows, looping until no further
+    /// rows match. Returns the total number of rows deleted across all batches.
+    /// </summary>
+    /// <remarks>
+    /// <para>
+    /// <b>Maintenance path — NOT the writer role.</b> The append-only invariant binds
+    /// the <c>scadabridge_audit_writer</c> ingest role (INSERT + SELECT only). This row
+    /// DELETE runs on the purge/maintenance connection, the same path that performs the
+    /// global partition switch-out (also a destructive operation forbidden to the writer
+    /// role). Per-channel overrides can only ever expire rows EARLIER than the global
+    /// month-partition switch-out would — never later — so this is a strict tightening
+    /// of the retention window, applied AFTER the global purge on the same tick.
+    /// </para>
+    /// <para>
+    /// <b>Bounded + idempotent.</b> Each batch is a <c>DELETE TOP (@batch)</c> so the
+    /// transaction log and lock footprint stay bounded regardless of backlog. Re-running
+    /// the purge is a no-op once every eligible row is gone (the loop exits when a batch
+    /// deletes zero rows), so a crash mid-loop is recoverable by simply running again.
+    /// </para>
+    /// </remarks>
+    /// <param name="channel">Canonical channel name (the <c>Category</c> column value, e.g. <c>ApiOutbound</c>).</param>
+    /// <param name="threshold">Rows with <c>OccurredAtUtc</c> strictly older than this UTC datetime are deleted.</param>
+    /// <param name="batchSize">Maximum rows deleted per batch; must be &gt; 0.</param>
+    /// <param name="ct">Cancellation token.</param>
+    /// <returns>A task that resolves to the total number of rows deleted across all batches.</returns>
+    Task<long> PurgeChannelOlderThanAsync(
+        string channel,
+        DateTime threshold,
+        int batchSize,
+        CancellationToken ct = default);
+
    /// <summary>
    /// Returns the set of <c>pf_AuditLog_Month</c> partition lower-bound
    /// boundaries whose partitions contain only rows with
@@ -201,4 +237,59 @@ public interface IAuditLogRepository
    /// <param name="ct">Cancellation token.</param>
    /// <returns>A task that resolves to the distinct, non-null source node names in ascending order.</returns>
    Task<IReadOnlyList<string>> GetDistinctSourceNodesAsync(CancellationToken ct = default);
+
+    /// <summary>
+    /// M5.6 (T5) one-time operational backfill: sets <c>SourceNode</c> to
+    /// <paramref name="sentinel"/> on every row where <c>SourceNode IS NULL</c>
+    /// and <c>OccurredAtUtc &lt; <paramref name="before"/></c>, in bounded
+    /// batches of <paramref name="batchSize"/> rows, looping until no further
+    /// rows match. Returns the total number of rows updated across all batches.
+    /// </summary>
+    /// <remarks>
+    /// <para>
+    /// <b>Why a sentinel, not the real value.</b> <c>SourceNode</c> captures the
+    /// physical cluster node on which an event was emitted. For pre-feature rows
+    /// that were ingested before the column was stamped, the true node-of-origin
+    /// is UNKNOWABLE — the original emitter is long gone and there is no
+    /// retroactive way to determine it. Backfilling a configurable sentinel
+    /// (default <c>"unknown"</c>) makes it explicit that these rows pre-date the
+    /// feature rather than silently leaving them NULL (which the filter UI already
+    /// treats as "unresolved" but which an operator might mistake for a bug).
+    /// </para>
+    /// <para>
+    /// <b><c>ExecutionId</c> / <c>ParentExecutionId</c> cannot be backfilled.</b>
+    /// These are PERSISTED COMPUTED columns derived from <c>DetailsJson</c>. The
+    /// AuditLog append-only invariant forbids mutating <c>DetailsJson</c>, so
+    /// the computed values for pre-feature rows remain NULL permanently. This is
+    /// documented rather than coded — see the Ops Note in
+    /// <c>Component-AuditLog.md § Ops Notes — Historical Null Columns</c>.
+    /// </para>
+    /// <para>
+    /// <b>Maintenance path — NOT the writer role.</b> This UPDATE runs on the
+    /// purge/maintenance connection (the same path as
+    /// <see cref="SwitchOutPartitionAsync"/> and any per-channel purge), NOT the
+    /// append-only <c>scadabridge_audit_writer</c> role. The CI guard
+    /// (<c>AuditLogAppendOnlyGuardTests</c>) recognises the
+    /// <c>// AUDIT-PURGE-ALLOWED</c> marker on the UPDATE line and forgives
+    /// exactly this one sanctioned maintenance-path UPDATE; any other UPDATE
+    /// against <c>AuditLog</c> still trips the guard.
+    /// </para>
+    /// <para>
+    /// <b>Bounded + idempotent.</b> <c>UPDATE TOP (@batch)</c> caps the
+    /// transaction-log and lock footprint per statement. The loop exits when a
+    /// batch updates zero rows, so a crash mid-loop is recoverable by simply
+    /// running again; re-running after completion is a no-op (no NULL rows
+    /// remain for the given <paramref name="before"/> window).
+    /// </para>
+    /// </remarks>
+    /// <param name="sentinel">Value to write into <c>SourceNode</c> for pre-feature rows (e.g. <c>"unknown"</c>).</param>
+    /// <param name="before">Rows with <c>OccurredAtUtc</c> strictly older than this UTC datetime are eligible.</param>
+    /// <param name="batchSize">Maximum rows updated per batch; must be &gt; 0.</param>
+    /// <param name="ct">Cancellation token.</param>
+    /// <returns>A task that resolves to the total number of rows updated across all batches.</returns>
+    Task<long> BackfillSourceNodeAsync(
+        string sentinel,
+        DateTime before,
+        int batchSize,
+        CancellationToken ct = default);
 }
@@ -100,6 +100,19 @@ public interface INotificationOutboxRepository
    Task<IReadOnlyList<SiteNotificationKpiSnapshot>> ComputePerSiteKpisAsync(
        DateTimeOffset stuckCutoff, DateTimeOffset deliveredSince, CancellationToken cancellationToken = default);

+    /// <summary>
+    /// Computes a point-in-time <see cref="NodeNotificationKpiSnapshot"/> per originating node.
+    /// Nodes with no notification rows at all are omitted; rows with a <c>NULL</c>
+    /// <c>SourceNode</c> are excluded. The stuck and delivered cutoffs are supplied by the
+    /// caller; the current time used for <c>OldestPendingAge</c> is captured inside the method.
+    /// </summary>
+    /// <param name="stuckCutoff">The time threshold for marking notifications as stuck.</param>
+    /// <param name="deliveredSince">The time threshold for counting delivered notifications.</param>
+    /// <param name="cancellationToken">Cancellation token.</param>
+    /// <returns>A list of per-node KPI snapshots, ordered by node name.</returns>
+    Task<IReadOnlyList<NodeNotificationKpiSnapshot>> ComputePerNodeKpisAsync(
+        DateTimeOffset stuckCutoff, DateTimeOffset deliveredSince, CancellationToken cancellationToken = default);
+
    /// <summary>
    /// Persists pending changes tracked on the underlying context. Use this when staging
    /// multiple changes for a single commit; the individual mutating methods on this
@@ -107,4 +107,19 @@ public interface ISiteCallAuditRepository
        DateTime stuckCutoff,
        DateTime intervalSince,
        CancellationToken ct = default);
+
+    /// <summary>
+    /// Computes a point-in-time <see cref="SiteCallNodeKpiSnapshot"/> per originating
+    /// node. Nodes with no <c>SiteCalls</c> rows at all are omitted; rows with a
+    /// <c>NULL</c> <c>SourceNode</c> are excluded. The stuck cutoff and interval
+    /// bounds are interpreted as in <see cref="ComputeKpisAsync"/>.
+    /// </summary>
+    /// <param name="stuckCutoff">UTC threshold for classifying a row as stuck.</param>
+    /// <param name="intervalSince">UTC start of the delivered/failed interval window.</param>
+    /// <param name="ct">Cancellation token.</param>
+    /// <returns>A task that resolves to a per-node KPI list; nodes with no rows are omitted.</returns>
+    Task<IReadOnlyList<SiteCallNodeKpiSnapshot>> ComputePerNodeKpisAsync(
+        DateTime stuckCutoff,
+        DateTime intervalSince,
+        CancellationToken ct = default);
 }
@@ -164,3 +164,24 @@ public sealed record PerSiteSiteCallKpiResponse(
    bool Success,
    string? ErrorMessage,
    IReadOnlyList<SiteCallSiteKpiSnapshot> Sites);
+
+/// <summary>
+/// Site Calls UI -> Central: request for the per-node <c>SiteCalls</c>
+/// KPI breakdown. Mirrors <see cref="PerSiteSiteCallKpiRequest"/> but groups
+/// by <c>SourceNode</c> instead of <c>SourceSite</c>. Additive — does not
+/// change per-site behaviour.
+/// </summary>
+public sealed record PerNodeSiteCallKpiRequest(
+    string CorrelationId);
+
+/// <summary>
+/// Central -> Site Calls UI: per-node KPI breakdown for the Site Calls KPIs
+/// page. On a repository fault <see cref="Success"/> is <c>false</c>,
+/// <see cref="ErrorMessage"/> carries the cause, and <see cref="Nodes"/> is empty.
+/// Nodes with a <c>NULL</c> <c>SourceNode</c> are omitted.
+/// </summary>
+public sealed record PerNodeSiteCallKpiResponse(
+    string CorrelationId,
+    bool Success,
+    string? ErrorMessage,
+    IReadOnlyList<SiteCallNodeKpiSnapshot> Nodes);
@@ -83,3 +83,46 @@ public record RouteToSetAttributesResponse(
    bool Success,
    string? ErrorMessage,
    DateTimeOffset Timestamp);
+
+/// <summary>
+/// Request to block until a remote instance attribute reaches a target value
+/// (spec §6 — <c>Route.To("inst").WaitForAttribute(name, targetValue, timeout)</c>).
+/// Value-equality ONLY across the wire: <see cref="TargetValueEncoded"/> carries the
+/// canonical <c>AttributeValueCodec</c>-encoded target; there is no predicate and no
+/// quality flag in the comparison. The site evaluates equality and either matches or
+/// times out.
+/// </summary>
+/// <param name="ParentExecutionId">
+/// Audit Log #23 (ParentExecutionId): mirrors <see cref="RouteToCallRequest.ParentExecutionId"/>.
+/// For an inbound-API-routed wait this is the inbound request's per-request execution id;
+/// future site-side audit emission for routed waits can stamp it as <c>ParentExecutionId</c>
+/// so the inbound→site execution-tree link survives the wait path. Additive trailing
+/// member — null for the Central UI sandbox path or for callers built before the field existed.
+/// </param>
+public record RouteToWaitForAttributeRequest(
+    string CorrelationId,
+    string InstanceUniqueName,
+    string AttributeName,
+    string? TargetValueEncoded,
+    TimeSpan Timeout,
+    DateTimeOffset Timestamp,
+    Guid? ParentExecutionId = null);
+
+/// <summary>
+/// Response from a remote attribute wait. <see cref="Success"/>/<see cref="ErrorMessage"/>
+/// convey the routing-level outcome (e.g. instance-not-found); <see cref="Matched"/>,
+/// <see cref="TimedOut"/>, <see cref="Value"/>, and <see cref="Quality"/> convey the wait
+/// outcome itself. When <see cref="Success"/> is <c>true</c>, exactly one of
+/// <see cref="Matched"/>/<see cref="TimedOut"/> holds: <see cref="Matched"/> means the
+/// attribute reached the target value (with <see cref="Value"/>/<see cref="Quality"/>
+/// captured at the match), <see cref="TimedOut"/> means the deadline elapsed first.
+/// </summary>
+public record RouteToWaitForAttributeResponse(
+    string CorrelationId,
+    bool Matched,
+    object? Value,
+    string? Quality,
+    bool TimedOut,
+    bool Success,
+    string? ErrorMessage,
+    DateTimeOffset Timestamp);
@@ -0,0 +1,82 @@
+namespace ZB.MOM.WW.ScadaBridge.Commons.Messages.Instance;
+
+/// <summary>
+/// Request to wait, event-driven, until an attribute reaches a value (or any
+/// value satisfying a predicate), bounded by a timeout — the backing protocol for
+/// the script-facing <c>Attributes.WaitAsync</c> helper.
+///
+/// <para>
+/// <b>Site-local only.</b> The optional <see cref="Predicate"/> is a non-serializable
+/// in-process delegate, so this message MUST flow only within a single site node's
+/// actor system (script execution → Instance Actor). It is never sent across the
+/// ClusterClient / gRPC boundary. The value-equality form (<see cref="TargetValueEncoded"/>)
+/// would serialize, but the routed/inbound variant is deliberately out of scope here.
+/// </para>
+/// </summary>
+/// <param name="CorrelationId">Per-wait correlation id; keys the waiter registry and the timeout self-message.</param>
+/// <param name="InstanceName">The instance this wait targets.</param>
+/// <param name="AttributeName">The attribute to watch — already scope-resolved by the accessor.</param>
+/// <param name="TargetValueEncoded">
+/// The codec-encoded target value (<c>AttributeValueCodec.Encode(target)</c>). A
+/// match compares the codec-encoded form of the current value against this string.
+/// When both this and <see cref="Predicate"/> are null the wait matches on ANY change.
+/// </param>
+/// <param name="Predicate">
+/// Site-local predicate tested against the raw (decoded) current value. Mutually
+/// exclusive with <see cref="TargetValueEncoded"/> — null when the encoded target is used.
+/// </param>
+/// <param name="Timeout">How long to wait before self-evicting with a timeout reply.</param>
+/// <param name="OccurredAtUtc">When the request was issued (UTC).</param>
+/// <param name="RequireGoodQuality">
+/// Quality-gated ("Good"-only) mode (spec §4.2): when <see langword="true"/>, a
+/// match additionally requires the attribute quality to be exactly
+/// <c>"Good"</c> (<see cref="System.StringComparison.Ordinal"/>) — a value that
+/// reaches the target / satisfies the predicate at Bad/Uncertain quality is NOT a
+/// match and the waiter stays pending until the value satisfies the test at Good
+/// quality (or times out). Defaults to <see langword="false"/> (quality-agnostic:
+/// the match tests the value only). Trailing/defaulted so existing positional
+/// constructions compile unchanged.
+/// </param>
+public record WaitForAttributeRequest(
+    string CorrelationId,
+    string InstanceName,
+    string AttributeName,
+    string? TargetValueEncoded,
+    Func<object?, bool>? Predicate,
+    TimeSpan Timeout,
+    DateTimeOffset OccurredAtUtc,
+    bool RequireGoodQuality = false);
+
+/// <summary>
+/// Reply to a <see cref="WaitForAttributeRequest"/>. Exactly one of
+/// <see cref="Matched"/> / <see cref="TimedOut"/> is set on the happy paths;
+/// <see cref="ErrorMessage"/> is populated on the failure paths (per-instance
+/// waiter cap exceeded, or the match predicate threw).
+/// </summary>
+/// <param name="CorrelationId">Echoes the request's correlation id.</param>
+/// <param name="Matched">True when the attribute reached the target/predicate within the timeout.</param>
+/// <param name="Value">The matched value (null on timeout / error).</param>
+/// <param name="Quality">
+/// The attribute quality at match time; <see langword="null"/> on the non-match
+/// paths (timeout / error / cap-exceeded), matching the nullable
+/// <see cref="ErrorMessage"/> convention.
+/// </param>
+/// <param name="TimedOut">True when the timeout fired before a match.</param>
+/// <param name="ErrorMessage">
+/// Non-null only when the wait failed/refused — the per-instance waiter cap was
+/// exceeded, or the match predicate threw (<c>"Wait predicate threw: …"</c>).
+/// </param>
+public record WaitForAttributeResponse(
+    string CorrelationId,
+    bool Matched,
+    object? Value,
+    string? Quality,
+    bool TimedOut,
+    string? ErrorMessage = null);
+
+/// <summary>
+/// Internal self-message scheduled by the Instance Actor to fire a waiter's
+/// timeout. Site-local only; never crosses a cluster boundary.
+/// </summary>
+/// <param name="CorrelationId">The waiter whose timeout fired.</param>
+public record WaitForAttributeTimeout(string CorrelationId);
@@ -159,3 +159,23 @@ public record PerSiteNotificationKpiResponse(
    bool Success,
    string? ErrorMessage,
    IReadOnlyList<SiteNotificationKpiSnapshot> Sites);
+
+/// <summary>
+/// Outbox UI -> Central: request for the per-node notification outbox KPI breakdown.
+/// Mirrors <see cref="PerSiteNotificationKpiRequest"/> but groups by <c>SourceNode</c>
+/// instead of <c>SourceSiteId</c>. Additive — does not change per-site behaviour.
+/// </summary>
+public record PerNodeNotificationKpiRequest(
+    string CorrelationId);
+
+/// <summary>
+/// Central -> Outbox UI: per-node KPI breakdown for the Notification KPIs page.
+/// On a repository fault <see cref="Success"/> is <c>false</c>, <see cref="ErrorMessage"/>
+/// carries the cause, and <see cref="Nodes"/> is empty. Nodes with a <c>NULL</c>
+/// <c>SourceNode</c> are omitted.
+/// </summary>
+public record PerNodeNotificationKpiResponse(
+    string CorrelationId,
+    bool Success,
+    string? ErrorMessage,
+    IReadOnlyList<NodeNotificationKpiSnapshot> Nodes);
@@ -0,0 +1,37 @@
+namespace ZB.MOM.WW.ScadaBridge.Commons.Types.Audit;
+
+/// <summary>
+/// Point-in-time <c>SiteCalls</c> metrics scoped to a single originating node. The
+/// per-node counterpart of <see cref="SiteCallSiteKpiSnapshot"/>; surfaced in the
+/// per-node breakdown table on the Site Calls KPIs page. Mirrors
+/// <see cref="ZB.MOM.WW.ScadaBridge.Commons.Types.Notifications.NodeNotificationKpiSnapshot"/>.
+/// </summary>
+/// <param name="SourceNode">
+/// The node identifier these metrics are scoped to (e.g. <c>node-a</c>,
+/// <c>node-b</c>). Rows with a <c>NULL</c> <c>SourceNode</c> are omitted.
+/// </param>
+/// <param name="BufferedCount">Count of this node's non-terminal rows (<c>TerminalAtUtc IS NULL</c>).</param>
+/// <param name="ParkedCount">Count of this node's rows in the <c>Parked</c> status.</param>
+/// <param name="FailedLastInterval">
+/// Count of this node's <c>Failed</c> rows whose <c>TerminalAtUtc</c> is at or
+/// after the "since" timestamp.
+/// </param>
+/// <param name="DeliveredLastInterval">
+/// Count of this node's <c>Delivered</c> rows whose <c>TerminalAtUtc</c> is at
+/// or after the "since" timestamp.
+/// </param>
+/// <param name="OldestPendingAge">
+/// Age of this node's oldest non-terminal row, or <c>null</c> when it has none.
+/// </param>
+/// <param name="StuckCount">
+/// Count of this node's non-terminal rows whose <c>CreatedAtUtc</c> is older
+/// than the stuck cutoff.
+/// </param>
+public sealed record SiteCallNodeKpiSnapshot(
+    string SourceNode,
+    int BufferedCount,
+    int ParkedCount,
+    int FailedLastInterval,
+    int DeliveredLastInterval,
+    TimeSpan? OldestPendingAge,
+    int StuckCount);
@@ -0,0 +1,30 @@
+namespace ZB.MOM.WW.ScadaBridge.Commons.Types.Notifications;
+
+/// <summary>
+/// Point-in-time notification-outbox metrics scoped to a single originating node.
+/// The per-node counterpart of <see cref="SiteNotificationKpiSnapshot"/>; surfaced
+/// in the per-node breakdown table on the Notification KPIs page.
+/// </summary>
+/// <param name="SourceNode">
+/// The node identifier these metrics are scoped to (e.g. <c>node-a</c>,
+/// <c>node-b</c>). Rows with a <c>NULL</c> <c>SourceNode</c> are omitted.
+/// </param>
+/// <param name="QueueDepth">Count of this node's non-terminal rows (Pending + Retrying).</param>
+/// <param name="StuckCount">
+/// Count of this node's non-terminal rows whose <c>CreatedAt</c> is older than the stuck cutoff.
+/// </param>
+/// <param name="ParkedCount">Count of this node's rows in the Parked status.</param>
+/// <param name="DeliveredLastInterval">
+/// Count of this node's Delivered rows whose <c>DeliveredAt</c> is at or after the
+/// "delivered since" timestamp.
+/// </param>
+/// <param name="OldestPendingAge">
+/// Age of this node's oldest non-terminal row, or <c>null</c> when it has none.
+/// </param>
+public record NodeNotificationKpiSnapshot(
+    string SourceNode,
+    int QueueDepth,
+    int StuckCount,
+    int ParkedCount,
+    int DeliveredLastInterval,
+    TimeSpan? OldestPendingAge);
@@ -0,0 +1,21 @@
+namespace ZB.MOM.WW.ScadaBridge.Commons.Types;
+
+/// <summary>
+/// Rich result of an <c>Attributes.WaitForAsync</c> wait (spec §3) — the full
+/// outcome of waiting for an attribute to reach a value / satisfy a predicate /
+/// change at all, bounded by a timeout. The <c>Attributes.WaitAsync</c> helpers
+/// surface only <see cref="Matched"/>; <c>WaitForAsync</c> returns this struct so
+/// a script can also read the matched <see cref="Value"/>, its <see cref="Quality"/>,
+/// and distinguish a genuine timeout (<see cref="TimedOut"/>) from a non-match.
+/// </summary>
+/// <param name="Matched">
+/// <see langword="true"/> when the attribute reached the target / satisfied the
+/// predicate within the timeout (and, in quality-gated mode, at "Good" quality).
+/// </param>
+/// <param name="Value">The matched value; <see langword="null"/> on timeout / error.</param>
+/// <param name="Quality">
+/// The attribute quality at match time; <see langword="null"/> on the non-match
+/// paths (timeout / error / cap-exceeded).
+/// </param>
+/// <param name="TimedOut"><see langword="true"/> when the timeout fired before a match.</param>
+public readonly record struct WaitResult(bool Matched, object? Value, string? Quality, bool TimedOut);
@@ -144,6 +144,7 @@ public class SiteCommunicationActor : ReceiveActor, IWithTimers
        Receive<RouteToCallRequest>(msg => _deploymentManagerProxy.Forward(msg));
        Receive<RouteToGetAttributesRequest>(msg => _deploymentManagerProxy.Forward(msg));
        Receive<RouteToSetAttributesRequest>(msg => _deploymentManagerProxy.Forward(msg));
+        Receive<RouteToWaitForAttributeRequest>(msg => _deploymentManagerProxy.Forward(msg));

        // OPC UA Tag Browser (interactive design-time query) — forward to the
        // Deployment Manager singleton, which always lands on the active site
@@ -445,6 +445,25 @@ public class CommunicationService
            envelope, _options.IntegrationTimeout, cancellationToken);
    }

+    /// <summary>
+    /// Routes an inbound API wait-for-attribute request to a site (spec §6).
+    /// </summary>
+    /// <param name="siteId">The target site identifier.</param>
+    /// <param name="request">The wait-for-attribute route request.</param>
+    /// <param name="cancellationToken">Cancellation token.</param>
+    /// <returns>The wait-for-attribute route response.</returns>
+    public async Task<RouteToWaitForAttributeResponse> RouteToWaitForAttributeAsync(
+        string siteId, RouteToWaitForAttributeRequest request, CancellationToken cancellationToken = default)
+    {
+        var envelope = new SiteEnvelope(siteId, request);
+        // A wait legitimately blocks up to request.Timeout on the site, so the cluster
+        // Ask must be bounded by the WAIT deadline (plus integration-timeout slack for
+        // the round trip), not the generic IntegrationTimeout used by the other routes.
+        var askTimeout = request.Timeout + _options.IntegrationTimeout;
+        return await GetActor().Ask<RouteToWaitForAttributeResponse>(
+            envelope, askTimeout, cancellationToken);
+    }
+
    // ── Notification Outbox (central-local actor — Asked directly, no SiteEnvelope) ──

    /// <summary>
@@ -525,6 +544,22 @@ public class CommunicationService
            request, _options.QueryTimeout, cancellationToken);
    }

+    /// <summary>
+    /// Gets per-node KPI metrics for the notification outbox.
+    /// Groups by <c>SourceNode</c> (e.g. <c>node-a</c>/<c>node-b</c>); rows with
+    /// a <c>NULL</c> node are omitted. Additive alongside
+    /// <see cref="GetPerSiteNotificationKpisAsync"/>.
+    /// </summary>
+    /// <param name="request">The per-node notification KPI request.</param>
+    /// <param name="cancellationToken">Cancellation token.</param>
+    /// <returns>The per-node notification KPI response.</returns>
+    public async Task<PerNodeNotificationKpiResponse> GetPerNodeNotificationKpisAsync(
+        PerNodeNotificationKpiRequest request, CancellationToken cancellationToken = default)
+    {
+        return await GetNotificationOutbox().Ask<PerNodeNotificationKpiResponse>(
+            request, _options.QueryTimeout, cancellationToken);
+    }
+
    // ── Site Call Audit (central-local actor — Asked directly, no SiteEnvelope) ──

    /// <summary>
@@ -579,6 +614,21 @@ public class CommunicationService
            request, _options.QueryTimeout, cancellationToken);
    }

+    /// <summary>
+    /// Gets per-node KPI metrics for site calls. Groups by <c>SourceNode</c>
+    /// (e.g. <c>node-a</c>/<c>node-b</c>); rows with a <c>NULL</c> node are
+    /// omitted. Additive alongside <see cref="GetPerSiteSiteCallKpisAsync"/>.
+    /// </summary>
+    /// <param name="request">The per-node site call KPI request.</param>
+    /// <param name="cancellationToken">Cancellation token.</param>
+    /// <returns>The per-node site call KPI response.</returns>
+    public async Task<PerNodeSiteCallKpiResponse> GetPerNodeSiteCallKpisAsync(
+        PerNodeSiteCallKpiRequest request, CancellationToken cancellationToken = default)
+    {
+        return await GetSiteCallAudit().Ask<PerNodeSiteCallKpiResponse>(
+            request, _options.QueryTimeout, cancellationToken);
+    }
+
    /// <summary>
    /// Task 5 (#22): relays an operator Retry of a parked cached call to its
    /// owning site. The <c>SiteCallAuditActor</c> is Asked directly (it is
@@ -370,6 +370,99 @@ VALUES
        return rowsDeleted;
    }

+    /// <inheritdoc />
+    public async Task<long> PurgeChannelOlderThanAsync(
+        string channel,
+        DateTime threshold,
+        int batchSize,
+        CancellationToken ct = default)
+    {
+        if (string.IsNullOrWhiteSpace(channel))
+        {
+            throw new ArgumentException("Channel must be a non-empty channel name.", nameof(channel));
+        }
+
+        if (batchSize <= 0)
+        {
+            throw new ArgumentOutOfRangeException(nameof(batchSize), batchSize, "Batch size must be > 0.");
+        }
+
+        var thresholdUtc = DateTime.SpecifyKind(threshold.ToUniversalTime(), DateTimeKind.Utc);
+
+        // M5.5 (T3) per-channel retention override purge. This is the ONLY DELETE
+        // against dbo.AuditLog in the codebase and it runs on the purge/maintenance
+        // path, NOT the append-only writer role (which has INSERT + SELECT only — see
+        // the DENY UPDATE/DENY DELETE grants in CollapseAuditLogToCanonical). The
+        // AuditLog append-only CI guard (AuditLogAppendOnlyGuardTests) is intentionally
+        // widened to allow ONLY the single marked DELETE below; any other UPDATE/DELETE
+        // targeting AuditLog still trips the guard.
+        //
+        // Bounded + idempotent: DELETE TOP (@batch) caps the log/lock footprint per
+        // statement; the loop repeats until a batch deletes zero rows, so re-running
+        // after a crash mid-loop simply resumes. Category is the canonical
+        // channel-name column (e.g. 'ApiOutbound'); Action holds "{channel}.{kind}" so
+        // it is NOT the right column to match a bare channel name against.
+        //
+        // The trailing AUDIT-PURGE-ALLOWED marker on the DELETE line below is the
+        // single narrow exemption the append-only CI guard (AuditLogAppendOnlyGuardTests)
+        // recognizes; any other UPDATE/DELETE targeting AuditLog still trips the guard.
+        const string deleteBatchSql =
+            "DELETE TOP (@batch) FROM dbo.AuditLog WHERE Category = @channel AND OccurredAtUtc < @threshold;"; // AUDIT-PURGE-ALLOWED: per-channel retention override (M5.5 T3), maintenance path
+
+        long totalDeleted = 0;
+
+        var conn = _context.Database.GetDbConnection();
+        var openedHere = false;
+        if (conn.State != System.Data.ConnectionState.Open)
+        {
+            await conn.OpenAsync(ct).ConfigureAwait(false);
+            openedHere = true;
+        }
+
+        try
+        {
+            while (true)
+            {
+                ct.ThrowIfCancellationRequested();
+
+                await using var cmd = conn.CreateCommand();
+                cmd.CommandText = deleteBatchSql;
+
+                var pBatch = cmd.CreateParameter();
+                pBatch.ParameterName = "@batch";
+                pBatch.Value = batchSize;
+                cmd.Parameters.Add(pBatch);
+
+                var pChannel = cmd.CreateParameter();
+                pChannel.ParameterName = "@channel";
+                pChannel.Value = channel;
+                cmd.Parameters.Add(pChannel);
+
+                var pThreshold = cmd.CreateParameter();
+                pThreshold.ParameterName = "@threshold";
+                pThreshold.Value = thresholdUtc;
+                cmd.Parameters.Add(pThreshold);
+
+                var rows = await cmd.ExecuteNonQueryAsync(ct).ConfigureAwait(false);
+                if (rows <= 0)
+                {
+                    break;
+                }
+
+                totalDeleted += rows;
+            }
+        }
+        finally
+        {
+            if (openedHere)
+            {
+                await conn.CloseAsync().ConfigureAwait(false);
+            }
+        }
+
+        return totalDeleted;
+    }
+
    /// <inheritdoc />
    public async Task<IReadOnlyList<DateTime>> GetPartitionBoundariesOlderThanAsync(
        DateTime threshold,
@@ -716,6 +809,102 @@ VALUES
            .ToListAsync(ct);
    }

+    /// <inheritdoc />
+    public async Task<long> BackfillSourceNodeAsync(
+        string sentinel,
+        DateTime before,
+        int batchSize,
+        CancellationToken ct = default)
+    {
+        if (string.IsNullOrWhiteSpace(sentinel))
+        {
+            throw new ArgumentException("Sentinel must be a non-empty value.", nameof(sentinel));
+        }
+
+        if (batchSize <= 0)
+        {
+            throw new ArgumentOutOfRangeException(nameof(batchSize), batchSize, "Batch size must be > 0.");
+        }
+
+        var beforeUtc = DateTime.SpecifyKind(before.ToUniversalTime(), DateTimeKind.Utc);
+
+        // M5.6 (T5) SourceNode sentinel backfill. This is the ONE sanctioned UPDATE
+        // against dbo.AuditLog in the codebase. It touches ONLY rows where
+        // SourceNode IS NULL AND OccurredAtUtc < @before — rows that pre-date the
+        // M5.6 feature and whose node-of-origin is UNKNOWABLE. The sentinel (default
+        // "unknown") makes that explicit. ExecutionId/ParentExecutionId are PERSISTED
+        // COMPUTED columns derived from DetailsJson — mutating DetailsJson is forbidden
+        // under the append-only invariant, so those stay NULL on pre-feature rows.
+        //
+        // Maintenance path (NOT the writer role): runs on the same connection used for
+        // SwitchOutPartitionAsync (partition-switch DDL), which requires a role that
+        // holds UPDATE — the append-only scadabridge_audit_writer role has only
+        // INSERT + SELECT.
+        //
+        // Bounded + idempotent: UPDATE TOP (@batch) caps the log/lock footprint per
+        // statement; the loop exits when a batch updates 0 rows. Re-running after a
+        // crash simply resumes where it left off.
+        //
+        // The trailing AUDIT-PURGE-ALLOWED marker on the UPDATE line below is the
+        // single narrow exemption the append-only CI guard (AuditLogAppendOnlyGuardTests)
+        // recognises for an UPDATE; any other UPDATE targeting AuditLog still trips the guard.
+        const string updateBatchSql =
+            "UPDATE TOP (@batch) dbo.AuditLog SET SourceNode = @sentinel WHERE SourceNode IS NULL AND OccurredAtUtc < @before;"; // AUDIT-PURGE-ALLOWED: SourceNode sentinel backfill (M5.6 T5), maintenance path
+
+        long totalUpdated = 0;
+
+        var conn = _context.Database.GetDbConnection();
+        var openedHere = false;
+        if (conn.State != System.Data.ConnectionState.Open)
+        {
+            await conn.OpenAsync(ct).ConfigureAwait(false);
+            openedHere = true;
+        }
+
+        try
+        {
+            while (true)
+            {
+                ct.ThrowIfCancellationRequested();
+
+                await using var cmd = conn.CreateCommand();
+                cmd.CommandText = updateBatchSql;
+
+                var pBatch = cmd.CreateParameter();
+                pBatch.ParameterName = "@batch";
+                pBatch.Value = batchSize;
+                cmd.Parameters.Add(pBatch);
+
+                var pSentinel = cmd.CreateParameter();
+                pSentinel.ParameterName = "@sentinel";
+                pSentinel.Value = sentinel;
+                cmd.Parameters.Add(pSentinel);
+
+                var pBefore = cmd.CreateParameter();
+                pBefore.ParameterName = "@before";
+                pBefore.Value = beforeUtc;
+                cmd.Parameters.Add(pBefore);
+
+                var rows = await cmd.ExecuteNonQueryAsync(ct).ConfigureAwait(false);
+                if (rows <= 0)
+                {
+                    break;
+                }
+
+                totalUpdated += rows;
+            }
+        }
+        finally
+        {
+            if (openedHere)
+            {
+                await conn.CloseAsync().ConfigureAwait(false);
+            }
+        }
+
+        return totalUpdated;
+    }
+
    /// <summary>
    /// Splits a <c>STRING_AGG</c> comma-joined value into a distinct, ordered
    /// list. A null/empty aggregate (a stub node with no rows) yields an empty
@@ -300,6 +300,63 @@ VALUES
                : null)).ToList();
    }

+    /// <inheritdoc />
+    public async Task<IReadOnlyList<NodeNotificationKpiSnapshot>> ComputePerNodeKpisAsync(
+        DateTimeOffset stuckCutoff, DateTimeOffset deliveredSince, CancellationToken cancellationToken = default)
+    {
+        var now = DateTimeOffset.UtcNow;
+
+        // Exclude rows with NULL SourceNode (legacy / unstamped) — per-node KPIs
+        // are only meaningful when the node identity is known.
+        var queueDepth = await CountByNodeAsync(
+            n => (n.Status == NotificationStatus.Pending || n.Status == NotificationStatus.Retrying)
+                && n.SourceNode != null,
+            cancellationToken);
+
+        var stuck = await CountByNodeAsync(
+            n => (n.Status == NotificationStatus.Pending || n.Status == NotificationStatus.Retrying)
+                && n.CreatedAt < stuckCutoff
+                && n.SourceNode != null,
+            cancellationToken);
+
+        var parked = await CountByNodeAsync(
+            n => n.Status == NotificationStatus.Parked && n.SourceNode != null,
+            cancellationToken);
+
+        var delivered = await CountByNodeAsync(
+            n => n.Status == NotificationStatus.Delivered
+                && n.DeliveredAt != null && n.DeliveredAt >= deliveredSince
+                && n.SourceNode != null,
+            cancellationToken);
+
+        // Oldest non-terminal CreatedAt per node — same in-memory reduction
+        // pattern as ComputePerSiteKpisAsync (DateTimeOffset converter makes
+        // a SQL Min awkward).
+        var oldest = (await _context.Notifications
+                .Where(n => (n.Status == NotificationStatus.Pending
+                    || n.Status == NotificationStatus.Retrying)
+                    && n.SourceNode != null)
+                .Select(n => new { n.SourceNode, n.CreatedAt })
+                .ToListAsync(cancellationToken))
+            .GroupBy(x => x.SourceNode!)
+            .ToDictionary(g => g.Key, g => g.Min(x => x.CreatedAt));
+
+        var nodeNames = queueDepth.Keys
+            .Concat(stuck.Keys).Concat(parked.Keys).Concat(delivered.Keys)
+            .Distinct()
+            .OrderBy(n => n, StringComparer.Ordinal);
+
+        return nodeNames.Select(node => new NodeNotificationKpiSnapshot(
+            SourceNode: node,
+            QueueDepth: queueDepth.GetValueOrDefault(node),
+            StuckCount: stuck.GetValueOrDefault(node),
+            ParkedCount: parked.GetValueOrDefault(node),
+            DeliveredLastInterval: delivered.GetValueOrDefault(node),
+            OldestPendingAge: oldest.TryGetValue(node, out var createdAt)
+                ? now - createdAt
+                : null)).ToList();
+    }
+
    /// <summary>Counts notification rows matching <paramref name="predicate"/>, grouped by source site.</summary>
    private async Task<Dictionary<string, int>> CountBySiteAsync(
        System.Linq.Expressions.Expression<Func<Notification, bool>> predicate,
@@ -312,6 +369,22 @@ VALUES
            .ToDictionaryAsync(x => x.Site, x => x.Count, cancellationToken);
    }

+    /// <summary>
+    /// Counts notification rows matching <paramref name="predicate"/>, grouped by source node.
+    /// Only rows with a non-null <c>SourceNode</c> should be included; the predicate is
+    /// responsible for enforcing that guard.
+    /// </summary>
+    private async Task<Dictionary<string, int>> CountByNodeAsync(
+        System.Linq.Expressions.Expression<Func<Notification, bool>> predicate,
+        CancellationToken cancellationToken)
+    {
+        return await _context.Notifications
+            .Where(predicate)
+            .GroupBy(n => n.SourceNode!)
+            .Select(g => new { Node = g.Key, Count = g.Count() })
+            .ToDictionaryAsync(x => x.Node, x => x.Count, cancellationToken);
+    }
+
    /// <inheritdoc />
    public async Task<int> SaveChangesAsync(CancellationToken cancellationToken = default)
        => await _context.SaveChangesAsync(cancellationToken);
@@ -324,6 +324,61 @@ ORDER BY CreatedAtUtc DESC, TrackedOperationId DESC;";
            StuckCount: stuck.GetValueOrDefault(site))).ToList();
    }

+    /// <inheritdoc />
+    public async Task<IReadOnlyList<SiteCallNodeKpiSnapshot>> ComputePerNodeKpisAsync(
+        DateTime stuckCutoff, DateTime intervalSince, CancellationToken ct = default)
+    {
+        var now = DateTime.UtcNow;
+
+        // Exclude rows with NULL SourceNode — per-node KPIs are only meaningful
+        // when the node identity is known. Each predicate guards n.SourceNode != null
+        // so the GROUP BY key is always non-null.
+        var buffered = await CountByNodeAsync(
+            s => s.TerminalAtUtc == null && s.SourceNode != null, ct);
+
+        var parked = await CountByNodeAsync(
+            s => s.Status == StatusParked && s.SourceNode != null, ct);
+
+        var failed = await CountByNodeAsync(
+            s => s.Status == StatusFailed
+                && s.TerminalAtUtc != null && s.TerminalAtUtc >= intervalSince
+                && s.SourceNode != null, ct);
+
+        var delivered = await CountByNodeAsync(
+            s => s.Status == StatusDelivered
+                && s.TerminalAtUtc != null && s.TerminalAtUtc >= intervalSince
+                && s.SourceNode != null, ct);
+
+        var stuck = await CountByNodeAsync(
+            s => s.TerminalAtUtc == null && s.CreatedAtUtc < stuckCutoff
+                && s.SourceNode != null, ct);
+
+        // Oldest non-terminal CreatedAtUtc per node — server-side GROUP BY MIN.
+        var oldest = (await _context.SiteCalls
+                .Where(s => s.TerminalAtUtc == null && s.SourceNode != null)
+                .GroupBy(s => s.SourceNode!)
+                .Select(g => new { Node = g.Key, Oldest = g.Min(s => s.CreatedAtUtc) })
+                .ToListAsync(ct))
+            .ToDictionary(x => x.Node, x => x.Oldest);
+
+        var nodeNames = buffered.Keys
+            .Concat(parked.Keys).Concat(failed.Keys)
+            .Concat(delivered.Keys).Concat(stuck.Keys)
+            .Distinct()
+            .OrderBy(n => n, StringComparer.Ordinal);
+
+        return nodeNames.Select(node => new SiteCallNodeKpiSnapshot(
+            SourceNode: node,
+            BufferedCount: buffered.GetValueOrDefault(node),
+            ParkedCount: parked.GetValueOrDefault(node),
+            FailedLastInterval: failed.GetValueOrDefault(node),
+            DeliveredLastInterval: delivered.GetValueOrDefault(node),
+            OldestPendingAge: oldest.TryGetValue(node, out var createdAt)
+                ? now - createdAt
+                : null,
+            StuckCount: stuck.GetValueOrDefault(node))).ToList();
+    }
+
    /// <summary>Counts <c>SiteCalls</c> rows matching <paramref name="predicate"/>, grouped by source site.</summary>
    private async Task<Dictionary<string, int>> CountBySiteAsync(
        System.Linq.Expressions.Expression<Func<SiteCall, bool>> predicate,
@@ -336,6 +391,22 @@ ORDER BY CreatedAtUtc DESC, TrackedOperationId DESC;";
            .ToDictionaryAsync(x => x.Site, x => x.Count, ct);
    }

+    /// <summary>
+    /// Counts <c>SiteCalls</c> rows matching <paramref name="predicate"/>, grouped by source node.
+    /// Only rows with a non-null <c>SourceNode</c> should be included; the predicate is
+    /// responsible for enforcing that guard.
+    /// </summary>
+    private async Task<Dictionary<string, int>> CountByNodeAsync(
+        System.Linq.Expressions.Expression<Func<SiteCall, bool>> predicate,
+        CancellationToken ct)
+    {
+        return await _context.SiteCalls
+            .Where(predicate)
+            .GroupBy(s => s.SourceNode!)
+            .Select(g => new { Node = g.Key, Count = g.Count() })
+            .ToDictionaryAsync(x => x.Node, x => x.Count, ct);
+    }
+
    private static int GetRankOrThrow(string status)
    {
        if (!StatusRank.TryGetValue(status, out var rank))
@@ -35,4 +35,9 @@ public sealed class CommunicationServiceInstanceRouter : IInstanceRouter
    public Task<RouteToSetAttributesResponse> RouteToSetAttributesAsync(
        string siteId, RouteToSetAttributesRequest request, CancellationToken cancellationToken) =>
        _communicationService.RouteToSetAttributesAsync(siteId, request, cancellationToken);
+
+    /// <inheritdoc />
+    public Task<RouteToWaitForAttributeResponse> RouteToWaitForAttributeAsync(
+        string siteId, RouteToWaitForAttributeRequest request, CancellationToken cancellationToken) =>
+        _communicationService.RouteToWaitForAttributeAsync(siteId, request, cancellationToken);
 }
@@ -34,4 +34,12 @@ public interface IInstanceRouter
    /// <returns>A task that resolves to the set-attributes response from the target site.</returns>
    Task<RouteToSetAttributesResponse> RouteToSetAttributesAsync(
        string siteId, RouteToSetAttributesRequest request, CancellationToken cancellationToken);
+
+    /// <summary>Routes a wait-for-attribute request to the specified site (spec §6).</summary>
+    /// <param name="siteId">Target site identifier.</param>
+    /// <param name="request">The wait-for-attribute request to route (value-equality only).</param>
+    /// <param name="cancellationToken">Cancellation token for the routed call.</param>
+    /// <returns>A task that resolves to the wait-for-attribute response from the target site.</returns>
+    Task<RouteToWaitForAttributeResponse> RouteToWaitForAttributeAsync(
+        string siteId, RouteToWaitForAttributeRequest request, CancellationToken cancellationToken);
 }
@@ -6,6 +6,7 @@ using Microsoft.AspNetCore.Http;
 using Microsoft.Extensions.Logging;
 using Microsoft.Extensions.Options;
 using ZB.MOM.WW.Audit;
+using ZB.MOM.WW.ScadaBridge.AuditLog.Central;
 using ZB.MOM.WW.ScadaBridge.AuditLog.Configuration;
 using ZB.MOM.WW.ScadaBridge.Commons.Interfaces.Services;
 using ZB.MOM.WW.ScadaBridge.Commons.Types.Audit;
@@ -95,6 +96,7 @@ public sealed class AuditWriteMiddleware
    private readonly ILogger<AuditWriteMiddleware> _logger;
    private readonly IOptionsMonitor<AuditLogOptions> _options;
    private readonly IAuditActorAccessor? _actorAccessor;
+    private readonly IAuditInboundCeilingHitsCounter _ceilingHitsCounter;

    /// <summary>
    /// Initializes the middleware with its required dependencies.
@@ -110,18 +112,26 @@ public sealed class AuditWriteMiddleware
    /// construct the middleware; when absent, actor resolution falls back to the
    /// stashed API-key name only.
    /// </param>
+    /// <param name="ceilingHitsCounter">
+    /// M5.3 (T7, optional): incremented whenever an inbound request or response
+    /// body is truncated at <see cref="AuditLogOptions.InboundMaxBytes"/>. Optional
+    /// so existing tests and composition roots without the central health snapshot
+    /// wired still construct without the counter; a NoOp is used when absent.
+    /// </param>
    public AuditWriteMiddleware(
        RequestDelegate next,
        ICentralAuditWriter auditWriter,
        ILogger<AuditWriteMiddleware> logger,
        IOptionsMonitor<AuditLogOptions> options,
-        IAuditActorAccessor? actorAccessor = null)
+        IAuditActorAccessor? actorAccessor = null,
+        IAuditInboundCeilingHitsCounter? ceilingHitsCounter = null)
    {
        _next = next ?? throw new ArgumentNullException(nameof(next));
        _auditWriter = auditWriter ?? throw new ArgumentNullException(nameof(auditWriter));
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
        _options = options ?? throw new ArgumentNullException(nameof(options));
        _actorAccessor = actorAccessor;
+        _ceilingHitsCounter = ceilingHitsCounter ?? new NoOpAuditInboundCeilingHitsCounter();
    }

    /// <summary>
@@ -133,9 +143,11 @@ public sealed class AuditWriteMiddleware
    {
        var sw = Stopwatch.StartNew();

-        // Per-request hot read of the inbound cap so a live config change
+        // Per-request hot read of the options snapshot so a live config change
        // picks up on the next request without re-resolving the singleton.
-        var cap = _options.CurrentValue.InboundMaxBytes;
+        // InboundMaxBytes is read once here and passed to the capture helpers.
+        var opts = _options.CurrentValue;
+        var cap = opts.InboundMaxBytes;

        // Audit Log #23 (ParentExecutionId): mint the inbound request's per-request
        // ExecutionId ONCE, here at the start of the request, and stash it on
@@ -163,9 +175,20 @@ public sealed class AuditWriteMiddleware
        // ReadBufferedRequestBodyAsync's own ContentLength is 0 short-circuit
        // returns (null, false) for the bodyless case anyway, so the audit row
        // is unchanged.
+        //
+        // M5.3 (T7): check if the matched method/target has SkipBodyCapture set.
+        // The route value is resolved BEFORE the pipeline runs (route matching
+        // has already bound {methodName} at this point), so we can skip the
+        // EnableBuffering allocation and body read up front.
+        var methodNameForOverride = ctx.Request.RouteValues.TryGetValue("methodName", out var rv)
+            && rv is string mn && !string.IsNullOrWhiteSpace(mn) ? mn : null;
+        var skipBody = methodNameForOverride != null
+            && opts.PerTargetOverrides.TryGetValue(methodNameForOverride, out var perTarget)
+            && perTarget.SkipBodyCapture;
+
        var requestBody = (string?)null;
        var requestTruncated = false;
-        if (RequestHasBody(ctx.Request))
+        if (!skipBody && RequestHasBody(ctx.Request))
        {
            ctx.Request.EnableBuffering();
            (requestBody, requestTruncated) =
@@ -200,15 +223,25 @@ public sealed class AuditWriteMiddleware
            // The forwarding wrapper has already written every byte to the
            // original sink; this just pulls back the bounded UTF-8 string.
            ctx.Response.Body = originalResponseBody;
-            var (responseBody, responseTruncated) = captureStream.GetCapturedBody();
+            var (capturedResponseBody, capturedResponseTruncated) = captureStream.GetCapturedBody();
+            // M5.3 (T7): if SkipBodyCapture is set, discard the captured response
+            // body (the request body was never captured above). The row + headers
+            // still emit with null RequestSummary / ResponseSummary.
+            // Truncation flags are also cleared so ceiling-hit counter is not
+            // bumped for methods that deliberately opt out of body capture.
+            var responseBody = skipBody ? null : capturedResponseBody;
+            var responseTruncated = skipBody ? false : capturedResponseTruncated;

            EmitInboundAudit(
                ctx,
+                opts,
                sw.ElapsedMilliseconds,
                thrown,
                requestBody,
                responseBody,
-                requestTruncated || responseTruncated);
+                requestTruncated || responseTruncated,
+                requestTruncated,
+                responseTruncated);
        }
    }

@@ -219,11 +252,14 @@ public sealed class AuditWriteMiddleware
    /// </summary>
    private void EmitInboundAudit(
        HttpContext ctx,
+        AuditLogOptions opts,
        long durationMs,
        Exception? thrown,
        string? requestBody,
        string? responseBody,
-        bool payloadTruncated)
+        bool payloadTruncated,
+        bool requestTruncated = false,
+        bool responseTruncated = false)
    {
        try
        {
@@ -243,10 +279,43 @@ public sealed class AuditWriteMiddleware
            var actor = isAuthFailure ? null : ResolveActor(ctx);
            var methodName = ResolveMethodName(ctx);

+            // M5.3 (T7): increment the ceiling-hits counter once per request
+            // that hit the cap on EITHER the request or response body.
+            if (requestTruncated || responseTruncated)
+            {
+                try { _ceilingHitsCounter.Increment(); } catch { /* swallow per §7 */ }
+            }
+
+            // M5.3 (T7): capture request headers into Extra JSON alongside the
+            // existing remoteIp / userAgent provenance fields. The header
+            // collection is run through the SAME header-redaction list
+            // (AuditLogOptions.HeaderRedactList) that the ScadaBridgeAuditRedactor
+            // applies to RequestSummary / ResponseSummary — auth/sensitive
+            // headers are redacted before they land in the row. Uses the SAME
+            // options snapshot captured at request start (passed in as opts) as
+            // the SkipBodyCapture / PerTargetOverrides decisions, so a mid-request
+            // live-reload can't split the body-capture and header-redaction
+            // verdicts across two different snapshots.
+            var redactSet = new HashSet<string>(
+                opts.HeaderRedactList,
+                StringComparer.OrdinalIgnoreCase);
+
+            var headerDict = new Dictionary<string, string>(StringComparer.Ordinal);
+            foreach (var header in ctx.Request.Headers)
+            {
+                // Redact headers whose name appears in the HeaderRedactList —
+                // the same "<redacted>" marker used by ScadaBridgeAuditRedactor.
+                var value = redactSet.Contains(header.Key)
+                    ? "<redacted>"
+                    : header.Value.ToString();
+                headerDict[header.Key] = value;
+            }
+
            var extra = JsonSerializer.Serialize(new
            {
                remoteIp = ctx.Connection.RemoteIpAddress?.ToString(),
                userAgent = ctx.Request.Headers.UserAgent.ToString(),
+                requestHeaders = headerDict,
            });

            var evt = ScadaBridgeAuditEventFactory.Create(
@@ -205,6 +205,47 @@ public class RouteTarget
        return response.Values;
    }

+    /// <summary>
+    /// Blocks until a remote instance attribute reaches <paramref name="targetValue"/>
+    /// or <paramref name="timeout"/> elapses (spec §6). Value-equality ONLY across the
+    /// wire: the target is canonically encoded via <see cref="AttributeValueCodec"/> and
+    /// the site evaluates equality — there is no predicate and no quality flag in the
+    /// comparison.
+    /// </summary>
+    /// <param name="attributeName">Name of the attribute to wait on.</param>
+    /// <param name="targetValue">Target value the attribute must equal for the wait to match.</param>
+    /// <param name="timeout">Maximum time to wait for the attribute to reach the target value.</param>
+    /// <param name="cancellationToken">Optional cancellation token; defaults to the method deadline.</param>
+    /// <returns>A task that resolves to <c>true</c> if the attribute reached the target value, <c>false</c> if the wait timed out.</returns>
+    public async Task<bool> WaitForAttribute(
+        string attributeName,
+        object? targetValue,
+        TimeSpan timeout,
+        CancellationToken cancellationToken = default)
+    {
+        var token = Effective(cancellationToken);
+        var siteId = await ResolveSiteAsync(token);
+
+        // Audit Log #23 (ParentExecutionId): mirrors the Call path — stamp the
+        // spawning inbound request's ExecutionId so future site-side audit
+        // emission for routed waits can record this wait's parent. CorrelationId
+        // is the per-operation lifecycle id, freshly minted per routed wait.
+        var request = new RouteToWaitForAttributeRequest(
+            Guid.NewGuid().ToString(), _instanceCode, attributeName,
+            AttributeValueCodec.Encode(targetValue), timeout, DateTimeOffset.UtcNow,
+            _parentExecutionId);
+
+        var response = await _instanceRouter.RouteToWaitForAttributeAsync(siteId, request, token);
+
+        if (!response.Success)
+        {
+            throw new InvalidOperationException(
+                response.ErrorMessage ?? "Remote attribute wait failed");
+        }
+
+        return response.Matched;
+    }
+
    /// <summary>
    /// Sets a single attribute value on the remote instance.
    /// </summary>
@@ -18,13 +18,17 @@ namespace ZB.MOM.WW.ScadaBridge.ManagementService;

 /// <summary>
 /// Minimal-API endpoints exposing the central Audit Log (#23) over HTTP for the
-/// ScadaBridge CLI (M8). Two routes:
+/// ScadaBridge CLI (M8). Three routes:
 /// <list type="bullet">
 ///   <item><c>GET /api/audit/query</c> — keyset-paged JSON page, gated on the
 ///   <see cref="AuthorizationPolicies.OperationalAudit"/> permission.</item>
 ///   <item><c>GET /api/audit/export</c> — streamed bulk export (csv / jsonl;
 ///   parquet returns HTTP 501), gated on the
 ///   <see cref="AuthorizationPolicies.AuditExport"/> permission.</item>
+///   <item><c>GET /api/audit/tree</c> — execution-chain tree rooted at the
+///   topmost ancestor of a given <c>executionId</c>, returned as a JSON array
+///   of <see cref="ExecutionTreeNode"/>; gated on
+///   <see cref="AuthorizationPolicies.OperationalAudit"/>.</item>
 /// </list>
 ///
 /// <para>
@@ -85,8 +89,16 @@ public static class AuditEndpoints
        Converters = { new JsonStringEnumConverter() },
    };

+    /// <summary>Default sentinel written by the backfill endpoint when the caller omits <c>sentinel</c>.</summary>
+    public const string DefaultBackfillSentinel = "unknown";
+
+    /// <summary>Default batch size for the backfill endpoint when the caller omits <c>batchSize</c>.</summary>
+    public const int DefaultBackfillBatchSize = 5000;
+
    /// <summary>
-    /// Registers the <c>/api/audit/query</c> and <c>/api/audit/export</c> minimal-API endpoints.
+    /// Registers the <c>/api/audit/query</c>, <c>/api/audit/export</c>,
+    /// <c>/api/audit/tree</c>, and <c>POST /api/audit/backfill-source-node</c>
+    /// minimal-API endpoints.
    /// </summary>
    /// <param name="endpoints">The endpoint route builder to register routes on.</param>
    /// <returns>The same <paramref name="endpoints"/> builder, for chaining.</returns>
@@ -94,6 +106,8 @@ public static class AuditEndpoints
    {
        endpoints.MapGet("/api/audit/query", (Delegate)HandleQuery);
        endpoints.MapGet("/api/audit/export", (Delegate)HandleExport);
+        endpoints.MapGet("/api/audit/tree", (Delegate)HandleTree);
+        endpoints.MapPost("/api/audit/backfill-source-node", (Delegate)HandleBackfillSourceNode);
        return endpoints;
    }

@@ -232,6 +246,177 @@ public static class AuditEndpoints
        return Results.Empty;
    }

+    // ─────────────────────────────────────────────────────────────────────
+    // GET /api/audit/tree
+    // ─────────────────────────────────────────────────────────────────────
+
+    /// <summary>
+    /// Handles <c>GET /api/audit/tree?executionId=...</c>: authenticates, checks the
+    /// OperationalAudit permission, and returns the full execution-chain tree rooted at
+    /// the topmost ancestor of the supplied <c>executionId</c>. The response is a JSON
+    /// array of <see cref="ExecutionTreeNode"/> objects (empty array when the id is
+    /// not found). Returns HTTP 400 when <c>executionId</c> is absent or not a valid
+    /// GUID.
+    /// </summary>
+    /// <param name="context">The HTTP context for the current request.</param>
+    /// <returns>A task that resolves to the HTTP result (200 JSON array, 400, 401, or 403).</returns>
+    internal static async Task<IResult> HandleTree(HttpContext context)
+    {
+        var auth = await AuthenticateAsync(context);
+        if (auth.Failure is not null)
+        {
+            return auth.Failure;
+        }
+
+        if (!HasAnyRole(auth.User!, AuthorizationPolicies.OperationalAuditRoles))
+        {
+            return Forbidden("OperationalAudit");
+        }
+
+        var raw = context.Request.Query["executionId"].ToString();
+        if (string.IsNullOrWhiteSpace(raw) || !Guid.TryParse(raw, out var executionId))
+        {
+            return Results.Json(
+                new { error = "Missing or invalid 'executionId' query parameter (expected a GUID).", code = "BAD_REQUEST" },
+                statusCode: 400);
+        }
+
+        var repo = context.RequestServices.GetRequiredService<IAuditLogRepository>();
+        var nodes = await repo.GetExecutionTreeAsync(executionId, context.RequestAborted);
+
+        return Results.Json(nodes, JsonOptions);
+    }
+
+    // ─────────────────────────────────────────────────────────────────────
+    // POST /api/audit/backfill-source-node
+    // ─────────────────────────────────────────────────────────────────────
+
+    /// <summary>
+    /// Handles <c>POST /api/audit/backfill-source-node</c>: authenticates (Admin role
+    /// required), reads the JSON body for <c>sentinel</c> / <c>before</c> /
+    /// <c>batchSize</c>, and calls
+    /// <see cref="IAuditLogRepository.BackfillSourceNodeAsync"/> on the maintenance
+    /// path.
+    ///
+    /// <para>
+    /// <b>Auth.</b> Admin-only — backfilling the SourceNode column is a one-time ops
+    /// procedure that mutates the AuditLog table via the maintenance path (NOT the
+    /// append-only writer role). Restricted to <see cref="AuthorizationPolicies.AuditExportRoles"/>
+    /// (Administrator) so it is never accessible to Viewer-role users.
+    /// </para>
+    ///
+    /// <para>
+    /// <b>Request body.</b>
+    /// <code>
+    /// {
+    ///   "sentinel":  "unknown",   // optional; default "unknown"
+    ///   "before":    "2026-01-01T00:00:00Z",  // required ISO-8601 UTC
+    ///   "batchSize": 5000         // optional; default 5000
+    /// }
+    /// </code>
+    /// </para>
+    ///
+    /// <para>
+    /// <b>Response (200).</b>
+    /// <code>{ "rowsUpdated": 12345, "sentinel": "unknown", "before": "2026-01-01T00:00:00Z" }</code>
+    /// </para>
+    /// </summary>
+    /// <param name="context">The HTTP context for the current request.</param>
+    /// <returns>A task that resolves to the HTTP result (200 JSON, 400, 401, or 403).</returns>
+    internal static async Task<IResult> HandleBackfillSourceNode(HttpContext context)
+    {
+        var auth = await AuthenticateAsync(context);
+        if (auth.Failure is not null)
+        {
+            return auth.Failure;
+        }
+
+        // Admin-only: backfilling is a one-time ops procedure on the maintenance path.
+        if (!HasAnyRole(auth.User!, AuthorizationPolicies.AuditExportRoles))
+        {
+            return Forbidden("Administrator");
+        }
+
+        string bodyText;
+        try
+        {
+            using var reader = new System.IO.StreamReader(context.Request.Body);
+            bodyText = await reader.ReadToEndAsync(context.RequestAborted);
+        }
+        catch (OperationCanceledException)
+        {
+            return Results.Json(new { error = "Request cancelled.", code = "CANCELLED" }, statusCode: 499);
+        }
+
+        string sentinel = DefaultBackfillSentinel;
+        DateTime? beforeUtc = null;
+        int batchSize = DefaultBackfillBatchSize;
+
+        if (!string.IsNullOrWhiteSpace(bodyText))
+        {
+            try
+            {
+                using var doc = System.Text.Json.JsonDocument.Parse(bodyText);
+                var root = doc.RootElement;
+
+                if (root.TryGetProperty("sentinel", out var sentinelEl))
+                {
+                    var s = sentinelEl.GetString();
+                    if (!string.IsNullOrWhiteSpace(s))
+                    {
+                        sentinel = s.Trim();
+                    }
+                }
+
+                if (root.TryGetProperty("before", out var beforeEl))
+                {
+                    if (DateTime.TryParse(
+                        beforeEl.GetString(),
+                        System.Globalization.CultureInfo.InvariantCulture,
+                        System.Globalization.DateTimeStyles.AssumeUniversal | System.Globalization.DateTimeStyles.AdjustToUniversal,
+                        out var parsed))
+                    {
+                        beforeUtc = DateTime.SpecifyKind(parsed, DateTimeKind.Utc);
+                    }
+                    else
+                    {
+                        return Results.Json(
+                            new { error = "Invalid 'before' value; expected ISO-8601 UTC datetime.", code = "BAD_REQUEST" },
+                            statusCode: 400);
+                    }
+                }
+
+                if (root.TryGetProperty("batchSize", out var batchEl) && batchEl.TryGetInt32(out var b) && b > 0)
+                {
+                    batchSize = b;
+                }
+            }
+            catch (System.Text.Json.JsonException)
+            {
+                return Results.Json(
+                    new { error = "Request body must be valid JSON.", code = "BAD_REQUEST" },
+                    statusCode: 400);
+            }
+        }
+
+        if (beforeUtc is null)
+        {
+            return Results.Json(
+                new { error = "Required field 'before' (ISO-8601 UTC datetime) is missing.", code = "BAD_REQUEST" },
+                statusCode: 400);
+        }
+
+        var repo = context.RequestServices.GetRequiredService<IAuditLogRepository>();
+        var rowsUpdated = await repo.BackfillSourceNodeAsync(sentinel, beforeUtc.Value, batchSize, context.RequestAborted);
+
+        return Results.Json(new
+        {
+            rowsUpdated,
+            sentinel,
+            before = beforeUtc.Value.ToString("O", System.Globalization.CultureInfo.InvariantCulture),
+        }, JsonOptions);
+    }
+
    /// <summary>
    /// Streams every matching row as RFC 4180 CSV, paging the repository with its
    /// keyset cursor and flushing after each page so a large export starts
@@ -122,6 +122,7 @@ public class NotificationOutboxActor : ReceiveActor, IWithTimers
        Receive<DiscardNotificationRequest>(HandleDiscard);
        Receive<NotificationKpiRequest>(HandleKpiRequest);
        Receive<PerSiteNotificationKpiRequest>(HandlePerSiteKpiRequest);
+        Receive<PerNodeNotificationKpiRequest>(HandlePerNodeKpiRequest);
    }

    /// <inheritdoc />
@@ -1081,6 +1082,38 @@ public class NotificationOutboxActor : ReceiveActor, IWithTimers
        return new PerSiteNotificationKpiResponse(correlationId, Success: true, ErrorMessage: null, sites);
    }

+    /// <summary>
+    /// Handles a per-node KPI request, computing the per-source-node outbox metrics with the
+    /// same stuck cutoff and delivered window as <see cref="HandleKpiRequest"/>. Additive
+    /// alongside <see cref="HandlePerSiteKpiRequest"/> — does not change per-site behaviour.
+    /// </summary>
+    private void HandlePerNodeKpiRequest(PerNodeNotificationKpiRequest request)
+    {
+        var sender = Sender;
+        var now = DateTimeOffset.UtcNow;
+        var stuckCutoff = StuckCutoff(now);
+        var deliveredSince = now - _options.DeliveredKpiWindow;
+
+        ComputePerNodeKpisAsync(request.CorrelationId, stuckCutoff, deliveredSince).PipeTo(
+            sender,
+            success: response => response,
+            failure: ex => new PerNodeNotificationKpiResponse(
+                request.CorrelationId,
+                Success: false,
+                ErrorMessage: ex.GetBaseException().Message,
+                Nodes: Array.Empty<NodeNotificationKpiSnapshot>()));
+    }
+
+    private async Task<PerNodeNotificationKpiResponse> ComputePerNodeKpisAsync(
+        string correlationId, DateTimeOffset stuckCutoff, DateTimeOffset deliveredSince)
+    {
+        using var scope = _serviceProvider.CreateScope();
+        var repository = scope.ServiceProvider.GetRequiredService<INotificationOutboxRepository>();
+        var nodes = await repository.ComputePerNodeKpisAsync(stuckCutoff, deliveredSince);
+
+        return new PerNodeNotificationKpiResponse(correlationId, Success: true, ErrorMessage: null, nodes);
+    }
+
    /// <summary>
    /// The instant before which a still-pending notification counts as stuck — <paramref name="now"/>
    /// offset back by <see cref="NotificationOutboxOptions.StuckAgeThreshold"/>.
@@ -239,6 +239,7 @@ public class SiteCallAuditActor : ReceiveActor
        Receive<SiteCallDetailRequest>(HandleDetail);
        Receive<SiteCallKpiRequest>(HandleKpi);
        Receive<PerSiteSiteCallKpiRequest>(HandlePerSiteKpi);
+        Receive<PerNodeSiteCallKpiRequest>(HandlePerNodeKpi);

        // Task 5 (#22): central→site Retry/Discard relay for parked cached calls.
        Receive<RegisterCentralCommunication>(msg =>
@@ -817,6 +818,47 @@ public class SiteCallAuditActor : ReceiveActor
        }
    }

+    /// <summary>
+    /// Handles a per-node KPI request, using the same stuck cutoff and
+    /// interval bound as <see cref="HandleKpi"/>. Additive alongside
+    /// <see cref="HandlePerSiteKpi"/> — does not change per-site behaviour.
+    /// </summary>
+    private void HandlePerNodeKpi(PerNodeSiteCallKpiRequest request)
+    {
+        var sender = Sender;
+        var now = DateTime.UtcNow;
+        var stuckCutoff = now - _options.StuckAgeThreshold;
+        var intervalSince = now - _options.KpiInterval;
+
+        PerNodeKpiAsync(request.CorrelationId, stuckCutoff, intervalSince).PipeTo(
+            sender,
+            success: response => response,
+            failure: ex => new PerNodeSiteCallKpiResponse(
+                request.CorrelationId,
+                Success: false,
+                ErrorMessage: ex.GetBaseException().Message,
+                Nodes: Array.Empty<SiteCallNodeKpiSnapshot>()));
+    }
+
+    private async Task<PerNodeSiteCallKpiResponse> PerNodeKpiAsync(
+        string correlationId, DateTime stuckCutoff, DateTime intervalSince)
+    {
+        var (scope, repository) = ResolveRepository();
+        try
+        {
+            var nodes = await repository
+                .ComputePerNodeKpisAsync(stuckCutoff, intervalSince)
+                .ConfigureAwait(false);
+
+            return new PerNodeSiteCallKpiResponse(
+                correlationId, Success: true, ErrorMessage: null, nodes);
+        }
+        finally
+        {
+            scope?.Dispose();
+        }
+    }
+
    // ── Task 5: central→site Retry/Discard relay ──

    /// <summary>
@@ -571,7 +571,20 @@ public class AlarmActor : ReceiveActor
    /// Passes the firing alarm's level/priority/message so the script can
    /// branch on severity via the <c>Alarm</c> global.
    /// </summary>
-    private void SpawnAlarmExecution(AlarmLevel level, int priority, string message)
+    /// <param name="level">The firing alarm severity level.</param>
+    /// <param name="priority">The firing alarm priority.</param>
+    /// <param name="message">The firing alarm message.</param>
+    /// <param name="parentExecutionId">
+    /// Audit Log #23 (M5.4 — ParentExecutionId tag-cascade): the execution id of
+    /// the context that fired this alarm, recorded as the on-trigger script run's
+    /// <c>ParentExecutionId</c> so the alarm-triggered run chains under its firing
+    /// context in the audit tree. The alarm subsystem currently has no Guid-typed
+    /// firing id, so the only call sites pass <c>null</c> (the on-trigger run is a
+    /// root). The parameter exists so a future firing-id can flow without
+    /// touching the actor wiring.
+    /// </param>
+    private void SpawnAlarmExecution(
+        AlarmLevel level, int priority, string message, Guid? parentExecutionId = null)
    {
        if (_onTriggerCompiledScript == null) return;

@@ -591,7 +604,9 @@ public class AlarmActor : ReceiveActor
            _options,
            _logger,
            // M2.5 (#9): per-script timeout from the on-trigger script (null = global).
-            _onTriggerExecutionTimeoutSeconds));
+            _onTriggerExecutionTimeoutSeconds,
+            // Audit Log #23 (M5.4): the firing context's execution id (null today).
+            parentExecutionId));

        Context.ActorOf(props, executionId);
    }
@@ -29,6 +29,14 @@ public class AlarmExecutionActor : ReceiveActor
    /// <param name="options">Site runtime configuration options, including the execution timeout.</param>
    /// <param name="logger">Logger for execution diagnostics.</param>
    /// <param name="executionTimeoutSeconds">M2.5 (#9): the on-trigger script's per-script execution timeout in seconds. Null or non-positive falls back to the global <see cref="SiteRuntimeOptions.ScriptExecutionTimeoutSeconds"/>.</param>
+    /// <param name="parentExecutionId">
+    /// Audit Log #23 (M5.4 — ParentExecutionId tag-cascade): the execution id of
+    /// the context that fired this alarm, threaded into the on-trigger script's
+    /// <see cref="ScriptRuntimeContext"/> as its <c>ParentExecutionId</c> so the
+    /// alarm-triggered run chains under its firing context. Null today (no
+    /// Guid-typed firing id exists yet) — the run is a root, but the plumbing
+    /// is in place for a future firing id.
+    /// </param>
    public AlarmExecutionActor(
        string alarmName,
        string instanceName,
@@ -42,7 +50,9 @@ public class AlarmExecutionActor : ReceiveActor
        ILogger logger,
        // M2.5 (#9): per-script execution timeout override (seconds) for the
        // alarm on-trigger script. Null or non-positive falls back to the global.
-        int? executionTimeoutSeconds = null)
+        int? executionTimeoutSeconds = null,
+        // Audit Log #23 (M5.4): the firing context's execution id (null today).
+        Guid? parentExecutionId = null)
    {
        var self = Self;
        var parent = Context.Parent;
@@ -51,7 +61,7 @@ public class AlarmExecutionActor : ReceiveActor
            alarmName, instanceName, level, priority, message,
            compiledScript, instanceActor,
            sharedScriptLibrary, options, self, parent, logger,
-            executionTimeoutSeconds);
+            executionTimeoutSeconds, parentExecutionId);
    }

    private static void ExecuteAlarmScript(
@@ -67,7 +77,8 @@ public class AlarmExecutionActor : ReceiveActor
        IActorRef self,
        IActorRef parent,
        ILogger logger,
-        int? executionTimeoutSeconds)
+        int? executionTimeoutSeconds,
+        Guid? parentExecutionId)
    {
        // M2.5 (#9): per-script timeout overrides the global default. A null or
        // non-positive per-script value (≤ 0) falls back to the global.
@@ -95,7 +106,19 @@ public class AlarmExecutionActor : ReceiveActor
                    options.MaxScriptCallDepth,
                    timeout,
                    instanceName,
-                    logger);
+                    logger,
+                    // Audit Log #23 (M5.4 — ParentExecutionId tag-cascade): the
+                    // alarm on-trigger run mints its own fresh ExecutionId (the
+                    // ctor's `?? NewGuid()` fallback) and records the firing
+                    // context's id as its ParentExecutionId — null today, so the
+                    // run is a root, but the plumbing exists for a future
+                    // firing id.
+                    parentExecutionId: parentExecutionId,
+                    // WaitForAttribute (spec §4.4): thread the alarm on-trigger
+                    // script's per-script execution-timeout token so a
+                    // Attributes.WaitAsync inside an on-trigger script is bounded
+                    // by the same script deadline.
+                    scriptTimeoutToken: cts.Token);

                var globals = new ScriptGlobals
                {
@@ -149,6 +149,7 @@ public class DeploymentManagerActor : ReceiveActor, IWithTimers
        Receive<RouteToCallRequest>(RouteInboundApiCall);
        Receive<RouteToGetAttributesRequest>(RouteInboundApiGetAttributes);
        Receive<RouteToSetAttributesRequest>(RouteInboundApiSetAttributes);
+        Receive<RouteToWaitForAttributeRequest>(RouteInboundApiWaitForAttribute);

        // OPC UA Tag Browser — singleton-only re-forward to local /user/dcl-manager.
        // BrowseNodeCommand is routed to this singleton (active node) by
@@ -1078,6 +1079,45 @@ public class DeploymentManagerActor : ReceiveActor, IWithTimers
        }).PipeTo(sender);
    }

+    /// <summary>
+    /// Spec §6 (WD-2b): unpacks a routed <see cref="RouteToWaitForAttributeRequest"/>
+    /// (inbound-API <c>Route.To().WaitForAttribute()</c>) into the deployed
+    /// Instance Actor's site-local <see cref="WaitForAttributeRequest"/> and relays
+    /// the result back. Value-equality only across the wire — the predicate is null
+    /// and <c>RequireGoodQuality</c> is left at its default. The Ask is bounded by the
+    /// wait timeout plus slack (NOT a fixed 30s), since the wait legitimately blocks
+    /// for up to <see cref="RouteToWaitForAttributeRequest.Timeout"/>.
+    /// </summary>
+    private void RouteInboundApiWaitForAttribute(RouteToWaitForAttributeRequest request)
+    {
+        if (!_instanceActors.TryGetValue(request.InstanceUniqueName, out var instanceActor))
+        {
+            Sender.Tell(new RouteToWaitForAttributeResponse(
+                request.CorrelationId, false, null, null, false,
+                false, $"Instance '{request.InstanceUniqueName}' not found on this site.",
+                DateTimeOffset.UtcNow));
+            return;
+        }
+
+        var sender = Sender;
+        // Routed waits are value-equality only (predicate null); RequireGoodQuality left at default.
+        var inner = new WaitForAttributeRequest(
+            request.CorrelationId, request.InstanceUniqueName, request.AttributeName,
+            request.TargetValueEncoded, null, request.Timeout, DateTimeOffset.UtcNow);
+
+        // Ask bounded by the WAIT timeout + slack — NOT a fixed 30s (the wait legitimately blocks up to request.Timeout).
+        instanceActor.Ask<WaitForAttributeResponse>(inner, request.Timeout + TimeSpan.FromSeconds(5))
+            .ContinueWith(t => t.IsCompletedSuccessfully
+                ? new RouteToWaitForAttributeResponse(
+                    request.CorrelationId, t.Result.Matched, t.Result.Value, t.Result.Quality, t.Result.TimedOut,
+                    true, null, DateTimeOffset.UtcNow)
+                : new RouteToWaitForAttributeResponse(
+                    request.CorrelationId, false, null, null, false,
+                    false, t.Exception?.GetBaseException().Message ?? "Attribute wait timed out",
+                    DateTimeOffset.UtcNow))
+            .PipeTo(sender);
+    }
+
    /// <summary>
    /// Writes attribute values on a deployed instance for a Route.To().SetAttribute(s)
    /// call (or a central Test Run bound to the instance). Each write is Ask'd to the
@@ -68,6 +68,18 @@ public class InstanceActor : ReceiveActor
    // mirroring the rest of the actor's by-name dictionaries).
    private readonly Dictionary<string, ResolvedAttribute> _resolvedAttributeByName = new();

+    // WaitForAttribute (spec §4.2): one-shot waiter registry keyed by the
+    // request CorrelationId. Each entry holds the watched attribute name, the
+    // match test (decoded target equality OR a site-local predicate), the
+    // original Sender to reply to, and the scheduled-timeout handle so a match
+    // can cancel it. Single-threaded actor access — no locking needed.
+    private readonly Dictionary<string, PendingWait> _attributeWaiters = new();
+
+    // WaitForAttribute: defensive per-instance cap so a script leaking waiters
+    // in a loop cannot grow the registry without bound. Exceeding it refuses the
+    // wait with an error reply rather than registering.
+    private const int MaxAttributeWaiters = 100;
+
    // DCL manager actor reference for subscribing to tag values
    private readonly IActorRef? _dclManager;
    // Maps each tag path to every attribute canonical name that references it.
@@ -170,6 +182,12 @@ public class InstanceActor : ReceiveActor
        // WP-22/23: Handle attribute value changes from DCL (Tell pattern)
        Receive<AttributeValueChanged>(HandleAttributeValueChanged);

+        // WaitForAttribute (spec §4.2): event-driven "wait for value" waiter
+        // registration + its scheduled-timeout self-message. Both flow only
+        // site-locally (the predicate variant carries a non-serializable delegate).
+        Receive<WaitForAttributeRequest>(HandleWaitForAttribute);
+        Receive<WaitForAttributeTimeout>(HandleWaitForAttributeTimeout);
+
        // Handle tag value updates from DCL — convert to AttributeValueChanged
        Receive<TagValueUpdate>(HandleTagValueUpdate);
        Receive<SubscribeTagsResponse>(_ => { }); // Ack from DCL subscribe — no action needed
@@ -519,6 +537,114 @@ public class InstanceActor : ReceiveActor
        PublishAndNotifyChildren(changed);
    }

+    /// <summary>
+    /// WaitForAttribute (spec §4.2): registers a one-shot event-driven waiter for
+    /// an attribute to reach a value (encoded-equality), satisfy a site-local
+    /// predicate, or change at all. The current-value fast-path and the
+    /// change-handling in <see cref="HandleAttributeValueChanged"/> both run on
+    /// this single-threaded actor, so a value that flips between "read current"
+    /// and "register" cannot be missed (spec §5).
+    /// </summary>
+    private void HandleWaitForAttribute(WaitForAttributeRequest req)
+    {
+        // Capture the sender immediately — Sender is invalid once we schedule /
+        // return and a later message arrives.
+        var replyer = Sender;
+
+        // Build the match test: explicit predicate wins; else null encoded target
+        // means "any change"; else compare the codec-encoded current value to the
+        // encoded target (avoids needing the attribute's DataType to decode).
+        Func<object?, bool> test;
+        if (req.Predicate is not null)
+        {
+            test = req.Predicate;
+        }
+        else if (req.TargetValueEncoded is null)
+        {
+            test = _ => true;
+        }
+        else
+        {
+            var target = req.TargetValueEncoded;
+            test = v => string.Equals(
+                AttributeValueCodec.Encode(v), target, StringComparison.Ordinal);
+        }
+
+        // Fast path: the current value already satisfies the test → reply now.
+        // A script-supplied predicate (or the codec-equality lambda) runs on the
+        // actor thread; guard it so a throwing predicate cannot crash the actor or
+        // leak a never-resolved waiter. On throw: reply non-matched + ErrorMessage
+        // and return WITHOUT registering (no timeout scheduled).
+        if (_attributes.TryGetValue(req.AttributeName, out var current))
+        {
+            // Effective quality used for BOTH the §4.2 quality gate and the match
+            // reply — the same `?? "Good"` default the reply has always used.
+            _attributeQualities.TryGetValue(req.AttributeName, out var fastQuality);
+            var effectiveQuality = fastQuality ?? "Good";
+
+            bool fastMatch;
+            try
+            {
+                // §4.2 quality gate ANDed with the value test, both INSIDE the guard:
+                // in quality-gated mode a value already at target but at Bad/Uncertain
+                // quality is NOT a fast match — it falls through to register + schedule
+                // the timeout like any other pending waiter (do NOT fast-reply matched).
+                fastMatch =
+                    (!req.RequireGoodQuality
+                        || string.Equals(effectiveQuality, "Good", StringComparison.Ordinal))
+                    && test(current);
+            }
+            catch (Exception ex)
+            {
+                _logger.LogWarning(ex,
+                    "WaitForAttribute predicate threw on the fast-path for {Instance}.{Attribute}; refusing the wait",
+                    _instanceUniqueName, req.AttributeName);
+                replyer.Tell(new WaitForAttributeResponse(
+                    req.CorrelationId, Matched: false, null, null, TimedOut: false,
+                    ErrorMessage: "Wait predicate threw: " + ex.Message));
+                return;
+            }
+
+            if (fastMatch)
+            {
+                replyer.Tell(new WaitForAttributeResponse(
+                    req.CorrelationId, Matched: true, current, effectiveQuality, TimedOut: false));
+                return;
+            }
+        }
+
+        // Defensive cap: refuse rather than register if the instance already has
+        // too many concurrent waiters (guards against a script leaking waiters).
+        if (_attributeWaiters.Count >= MaxAttributeWaiters)
+        {
+            replyer.Tell(new WaitForAttributeResponse(
+                req.CorrelationId, Matched: false, null, null, TimedOut: false,
+                ErrorMessage: "Too many concurrent attribute waiters on this instance"));
+            return;
+        }
+
+        // Register and schedule the self-evicting timeout (NativeAlarmActor idiom).
+        var handle = Context.System.Scheduler.ScheduleTellOnceCancelable(
+            req.Timeout, Self, new WaitForAttributeTimeout(req.CorrelationId), Self);
+
+        _attributeWaiters[req.CorrelationId] =
+            new PendingWait(req.AttributeName, test, replyer, handle, req.RequireGoodQuality);
+    }
+
+    /// <summary>
+    /// WaitForAttribute (spec §4.2): the scheduled timeout fired for a waiter that
+    /// never matched. If still registered (a match would have removed + canceled
+    /// it), reply TimedOut and evict it.
+    /// </summary>
+    private void HandleWaitForAttributeTimeout(WaitForAttributeTimeout msg)
+    {
+        if (_attributeWaiters.Remove(msg.CorrelationId, out var pending))
+        {
+            pending.Replyer.Tell(new WaitForAttributeResponse(
+                msg.CorrelationId, Matched: false, null, null, TimedOut: true));
+        }
+    }
+
    /// <summary>
    /// Handles tag value updates from DCL. Maps the tag path back to the attribute
    /// canonical name and converts to an AttributeValueChanged for unified processing.
@@ -556,9 +682,14 @@ public class InstanceActor : ReceiveActor
                    _attributeQualities[attrName] = "Bad";
                    _attributeTimestamps[attrName] = update.Timestamp;
                    var currentValue = _attributes.GetValueOrDefault(attrName);
+                    // WaitForAttribute (spec §4.2): quality-only republish — the
+                    // stored value is UNCHANGED (we publish the OLD currentValue, only
+                    // the quality flips to Bad). Do NOT evaluate waiters, or an
+                    // "any-change" / unchanged-value-equality waiter would fire on a
+                    // non-change.
                    PublishAndNotifyChildren(new AttributeValueChanged(
                        _instanceUniqueName, update.TagPath, attrName,
-                        currentValue, "Bad", update.Timestamp));
+                        currentValue, "Bad", update.Timestamp), evaluateWaiters: false);
                }
                continue;
            }
@@ -908,7 +1039,17 @@ public class InstanceActor : ReceiveActor
    /// Publishes attribute change to stream and notifies child Script/Alarm actors.
    /// WP-22: Tell for attribute notifications (fire-and-forget, never blocks).
    /// </summary>
-    private void PublishAndNotifyChildren(AttributeValueChanged changed)
+    /// <param name="changed">The attribute change to publish.</param>
+    /// <param name="evaluateWaiters">
+    /// WaitForAttribute (spec §4.2): when <c>true</c> (the default), registered
+    /// <c>Attributes.WaitAsync</c> waiters on this attribute are re-evaluated against
+    /// <paramref name="changed"/>'s value. Pass <c>false</c> on republish/quality-only
+    /// paths that do NOT assign a new value to <c>_attributes[name]</c> (e.g. the
+    /// List-coerce-failure Bad-quality republish, which publishes the OLD value) —
+    /// otherwise an "any-change" waiter (or a waiter whose target equals the unchanged
+    /// value) would spuriously fire even though nothing actually changed.
+    /// </param>
+    private void PublishAndNotifyChildren(AttributeValueChanged changed, bool evaluateWaiters = true)
    {
        // WP-23: Publish to site-wide stream
        _streamManager?.PublishAttributeValueChanged(changed);
@@ -924,6 +1065,83 @@ public class InstanceActor : ReceiveActor
        {
            alarmActor.Tell(changed);
        }
+
+        // WaitForAttribute (spec §4.2): re-evaluate any waiters on THIS attribute —
+        // but ONLY when this publish reflects a real value change (evaluateWaiters).
+        // The genuine value-change paths (HandleAttributeValueChanged, the scalar
+        // DCL update path, HandleSetStaticAttributeCore) call it AFTER assigning
+        // _attributes[name], so changed.Value is the just-applied current value.
+        // Republish/quality-only paths (List-coerce-failure Bad-quality, which
+        // publishes the OLD value) pass evaluateWaiters:false so an "any-change" or
+        // unchanged-value-equality waiter does not spuriously fire (spec §4.2).
+        // Iterate a snapshot so satisfied waiters can be removed during the loop;
+        // each match cancels its scheduled timeout (so no stray WaitForAttributeTimeout
+        // follows) and replies Matched=true.
+        if (evaluateWaiters)
+            ResolveMatchedWaiters(changed);
+    }
+
+    /// <summary>
+    /// WaitForAttribute (spec §4.2): fires every registered waiter on
+    /// <paramref name="changed"/>'s attribute whose test now passes against the
+    /// just-applied value — cancelling its timeout, replying Matched, and removing
+    /// it from the registry. A no-op when there are no waiters.
+    ///
+    /// <para>
+    /// Each waiter's match test runs inside a per-waiter try/catch: a throwing
+    /// script-supplied predicate (or codec lambda) must NOT abort the loop and
+    /// strand sibling waiters on the same attribute, nor leave the throwing waiter
+    /// registered with a live scheduled timeout. On throw we cancel that waiter's
+    /// timeout, reply non-matched + ErrorMessage, remove it, and continue.
+    /// </para>
+    /// </summary>
+    private void ResolveMatchedWaiters(AttributeValueChanged changed)
+    {
+        if (_attributeWaiters.Count == 0)
+            return;
+
+        // Snapshot the candidate waiters on THIS attribute. Iterating a snapshot
+        // (and NOT evaluating the test inside the LINQ filter) keeps removal mid-loop
+        // safe and ensures one throwing test cannot abort materialization for siblings.
+        var candidates = _attributeWaiters
+            .Where(kvp => kvp.Value.AttributeName == changed.AttributeName)
+            .ToList();
+
+        foreach (var (cid, pending) in candidates)
+        {
+            bool matched;
+            try
+            {
+                // §4.2 quality gate ANDed with the value test, both INSIDE the guard:
+                // in quality-gated mode a value reaching the target at Bad/Uncertain
+                // quality is NOT a match — the waiter stays pending until it satisfies
+                // the test at Good quality (or times out).
+                matched =
+                    (!pending.RequireGoodQuality
+                        || string.Equals(changed.Quality, "Good", StringComparison.Ordinal))
+                    && pending.Test(changed.Value);
+            }
+            catch (Exception ex)
+            {
+                _logger.LogWarning(ex,
+                    "WaitForAttribute predicate threw while resolving waiter {CorrelationId} on {Instance}.{Attribute}; evicting it",
+                    cid, _instanceUniqueName, changed.AttributeName);
+                pending.Timeout.Cancel();
+                pending.Replyer.Tell(new WaitForAttributeResponse(
+                    cid, Matched: false, null, null, TimedOut: false,
+                    ErrorMessage: "Wait predicate threw: " + ex.Message));
+                _attributeWaiters.Remove(cid);
+                continue;
+            }
+
+            if (!matched)
+                continue;
+
+            pending.Timeout.Cancel();
+            pending.Replyer.Tell(new WaitForAttributeResponse(
+                cid, Matched: true, changed.Value, changed.Quality, TimedOut: false));
+            _attributeWaiters.Remove(cid);
+        }
    }

    /// <summary>
@@ -1202,4 +1420,23 @@ public class InstanceActor : ReceiveActor
    /// Internal message for async override loading result.
    /// </summary>
    internal record LoadOverridesResult(Dictionary<string, string> Overrides, string? Error);
+
+    /// <summary>
+    /// WaitForAttribute (spec §4.2): one registered, not-yet-satisfied waiter.
+    /// </summary>
+    /// <param name="AttributeName">The attribute this waiter watches (scope-resolved).</param>
+    /// <param name="Test">The match test (decoded-target equality OR site-local predicate OR any-change).</param>
+    /// <param name="Replyer">The original sender to reply to on match / timeout.</param>
+    /// <param name="Timeout">The scheduled timeout handle, canceled on match.</param>
+    /// <param name="RequireGoodQuality">
+    /// Quality-gated ("Good"-only) mode (spec §4.2): when <c>true</c>, the resolve
+    /// loop additionally requires <c>changed.Quality == "Good"</c> before the test
+    /// can match.
+    /// </param>
+    private sealed record PendingWait(
+        string AttributeName,
+        Func<object?, bool> Test,
+        IActorRef Replyer,
+        ICancelable Timeout,
+        bool RequireGoodQuality);
 }
@@ -221,7 +221,12 @@ public class ScriptExecutionActor : ReceiveActor
                    // M2.12 (#25): thread the singleton site event logger so
                    // recursion-limit violations at CallScript/CallShared emit a
                    // script Error site event in addition to ILogger.LogError.
-                    siteEventLogger: siteEventLogger);
+                    siteEventLogger: siteEventLogger,
+                    // WaitForAttribute (spec §4.3/§4.4): thread the per-script
+                    // execution-timeout token so Attributes.WaitAsync's Ask is
+                    // bounded by the script's own ExecutionTimeoutSeconds — a
+                    // shorter script deadline wins over the wait's own timeout.
+                    scriptTimeoutToken: cts.Token);

                var globals = new ScriptGlobals
                {
@@ -73,6 +73,107 @@ public class AttributeAccessor
    /// <returns>A task that represents the asynchronous operation.</returns>
    public Task SetAsync(string key, object? value)
        => _ctx.SetAttribute(Resolve(key), AttributeValueCodec.Encode(value) ?? string.Empty);
+
+    /// <summary>
+    /// WaitForAttribute (spec §3-§5): waits event-driven until the attribute equals
+    /// <paramref name="targetValue"/> (value-equality, codec-normalized), bounded by
+    /// <paramref name="timeout"/>. Returns <c>true</c> if matched within the timeout,
+    /// <c>false</c> on timeout (no throw). Honors the script's execution-timeout token.
+    /// Scope/composition path resolution (<see cref="Resolve"/>) is applied just like
+    /// <see cref="GetAsync"/> / <see cref="SetAsync"/>.
+    ///
+    /// <para>
+    /// <b>Quality-agnostic by default (spec §4.2):</b> matching tests the VALUE, not
+    /// the quality — a value arriving at Bad quality still satisfies the wait. Pass
+    /// <paramref name="requireGoodQuality"/><c>:true</c> for quality-gated ("Good"-only)
+    /// matching: a value reaching the target at Bad/Uncertain quality is ignored and
+    /// the wait holds until the target is reached at "Good" quality (or times out).
+    /// </para>
+    ///
+    /// <para>
+    /// Passing a <b>null</b> <paramref name="targetValue"/> means "match on any change":
+    /// the wait then matches the next value the attribute receives — and matches
+    /// IMMEDIATELY (fast-path) if the attribute already holds any value at registration.
+    /// </para>
+    /// </summary>
+    /// <param name="key">The attribute key (scope-resolved before the wait is registered).</param>
+    /// <param name="targetValue">
+    /// The value to wait for (codec-encoded for comparison); <c>null</c> means
+    /// "match on any change" (matches immediately if the attribute already has a value).
+    /// </param>
+    /// <param name="timeout">How long to wait before returning false.</param>
+    /// <param name="requireGoodQuality">
+    /// <c>true</c> for quality-gated ("Good"-only) matching (spec §4.2); defaults to
+    /// <c>false</c> (quality-agnostic — Bad/Uncertain-quality transients still match).
+    /// </param>
+    /// <returns><c>true</c> on match within the timeout; <c>false</c> on timeout.</returns>
+    public Task<bool> WaitAsync(string key, object? targetValue, TimeSpan timeout, bool requireGoodQuality = false)
+        => _ctx.WaitAttribute(Resolve(key), AttributeValueCodec.Encode(targetValue), null, timeout, requireGoodQuality);
+
+    /// <summary>
+    /// WaitForAttribute (spec §3-§5): predicate form — waits event-driven until
+    /// <paramref name="predicate"/> returns <c>true</c> for the attribute's current
+    /// value, bounded by <paramref name="timeout"/>. Site-local only (the predicate
+    /// is an in-process delegate). Returns <c>true</c> if matched within the timeout,
+    /// <c>false</c> on timeout (no throw). Scope/composition path resolution applies.
+    ///
+    /// <para>
+    /// <b>Quality-agnostic by default (spec §4.2):</b> the predicate is tested against
+    /// the VALUE, regardless of quality — a value arriving at Bad quality still
+    /// satisfies the wait if the predicate passes. Pass <paramref name="requireGoodQuality"/>
+    /// <c>:true</c> for quality-gated ("Good"-only) matching: a value satisfying the
+    /// predicate at Bad/Uncertain quality is ignored until it does so at "Good" quality.
+    /// </para>
+    /// </summary>
+    /// <param name="key">The attribute key (scope-resolved before the wait is registered).</param>
+    /// <param name="predicate">The site-local predicate tested against the current value.</param>
+    /// <param name="timeout">How long to wait before returning false.</param>
+    /// <param name="requireGoodQuality">
+    /// <c>true</c> for quality-gated ("Good"-only) matching (spec §4.2); defaults to
+    /// <c>false</c> (quality-agnostic).
+    /// </param>
+    /// <returns><c>true</c> on match within the timeout; <c>false</c> on timeout.</returns>
+    public Task<bool> WaitAsync(string key, Func<object?, bool> predicate, TimeSpan timeout, bool requireGoodQuality = false)
+        => _ctx.WaitAttribute(Resolve(key), null, predicate, timeout, requireGoodQuality);
+
+    /// <summary>
+    /// WaitForAttribute (spec §3): richer value-equality form — like
+    /// <see cref="WaitAsync(string, object?, TimeSpan, bool)"/> but returns the full
+    /// <see cref="WaitResult"/> (matched flag + matched value + quality + timed-out
+    /// flag) instead of a bare bool. Scope/composition path resolution
+    /// (<see cref="Resolve"/>) is applied to <paramref name="key"/> just like the
+    /// other accessors. Never throws on timeout — a timeout yields
+    /// <c>WaitResult { Matched = false, TimedOut = true }</c>.
+    /// </summary>
+    /// <param name="key">The attribute key (scope-resolved before the wait is registered).</param>
+    /// <param name="targetValue">
+    /// The value to wait for (codec-encoded for comparison); <c>null</c> means
+    /// "match on any change".
+    /// </param>
+    /// <param name="timeout">How long to wait before returning a timed-out result.</param>
+    /// <param name="requireGoodQuality">
+    /// <c>true</c> for quality-gated ("Good"-only) matching (spec §4.2); defaults to <c>false</c>.
+    /// </param>
+    /// <returns>The full <see cref="WaitResult"/> for the wait.</returns>
+    public Task<WaitResult> WaitForAsync(string key, object? targetValue, TimeSpan timeout, bool requireGoodQuality = false)
+        => _ctx.WaitAttributeFull(Resolve(key), AttributeValueCodec.Encode(targetValue), null, timeout, requireGoodQuality);
+
+    /// <summary>
+    /// WaitForAttribute (spec §3): richer predicate form — like
+    /// <see cref="WaitAsync(string, Func{object?, bool}, TimeSpan, bool)"/> but returns
+    /// the full <see cref="WaitResult"/>. Site-local only (the predicate is an
+    /// in-process delegate). Scope/composition path resolution applies. Never throws
+    /// on timeout (<c>WaitResult { Matched = false, TimedOut = true }</c>).
+    /// </summary>
+    /// <param name="key">The attribute key (scope-resolved before the wait is registered).</param>
+    /// <param name="predicate">The site-local predicate tested against the current value.</param>
+    /// <param name="timeout">How long to wait before returning a timed-out result.</param>
+    /// <param name="requireGoodQuality">
+    /// <c>true</c> for quality-gated ("Good"-only) matching (spec §4.2); defaults to <c>false</c>.
+    /// </param>
+    /// <returns>The full <see cref="WaitResult"/> for the wait.</returns>
+    public Task<WaitResult> WaitForAsync(string key, Func<object?, bool> predicate, TimeSpan timeout, bool requireGoodQuality = false)
+        => _ctx.WaitAttributeFull(Resolve(key), null, predicate, timeout, requireGoodQuality);
 }

 /// <summary>
@@ -46,6 +46,16 @@ public class ScriptRuntimeContext
    private readonly ILogger _logger;
    private readonly string _instanceName;

+    /// <summary>
+    /// WaitForAttribute (spec §4.3): the per-script execution-timeout token from
+    /// the owning <c>ScriptExecutionActor</c>/<c>AlarmExecutionActor</c>
+    /// (<c>cts.Token</c>). Bounds the <c>Attributes.WaitAsync</c> Ask so a script
+    /// that hits its own <c>ExecutionTimeoutSeconds</c> abandons the wait. Defaults
+    /// to <see cref="CancellationToken.None"/> for contexts that do not thread one
+    /// (legacy callers / tests / the alarm path when it has no CTS).
+    /// </summary>
+    private readonly CancellationToken _scriptTimeoutToken;
+
    /// <summary>
    /// WP-13: External system client for ExternalSystem.Call/CachedCall.
    /// </summary>
@@ -194,6 +204,13 @@ public class ScriptRuntimeContext
    /// <c>ILogger.LogError</c> + throw. When null the existing behaviour is
    /// unchanged; all existing callers and tests remain source-compatible.
    /// </param>
+    /// <param name="scriptTimeoutToken">
+    /// WaitForAttribute (spec §4.3): the per-script execution-timeout token
+    /// (<c>cts.Token</c> on the owning execution actor) used to bound
+    /// <c>Attributes.WaitAsync</c>. Defaults to
+    /// <see cref="CancellationToken.None"/> for callers / tests that do not
+    /// thread one — those waits are bounded only by their own timeout.
+    /// </param>
    public ScriptRuntimeContext(
        IActorRef instanceActor,
        IActorRef self,
@@ -215,7 +232,8 @@ public class ScriptRuntimeContext
        Guid? executionId = null,
        Guid? parentExecutionId = null,
        string? sourceNode = null,
-        ISiteEventLogger? siteEventLogger = null)
+        ISiteEventLogger? siteEventLogger = null,
+        CancellationToken scriptTimeoutToken = default)
    {
        _instanceActor = instanceActor;
        _self = self;
@@ -245,6 +263,66 @@ public class ScriptRuntimeContext
        _parentExecutionId = parentExecutionId;
        // M2.12 (#25): optional — null when not wired (tests / AlarmExecutionActor).
        _siteEventLogger = siteEventLogger;
+        // WaitForAttribute (spec §4.3): default(CancellationToken) == None when
+        // not threaded in — the WaitAsync Ask is then bounded only by its own timeout.
+        _scriptTimeoutToken = scriptTimeoutToken;
+    }
+
+    /// <summary>
+    /// Audit Log #23 (M5.4): this run's own per-execution id. Exposed so a
+    /// nested <c>Scripts.CallShared</c> can record it as the spawned shared
+    /// script's <c>ParentExecutionId</c>, forming a true execution tree.
+    /// </summary>
+    internal Guid ExecutionId => _executionId;
+
+    /// <summary>
+    /// Audit Log #23 (M5.4): the spawning execution's id for this run (null for
+    /// a root run). Exposed for test assertions on the execution tree.
+    /// </summary>
+    internal Guid? ParentExecutionId => _parentExecutionId;
+
+    /// <summary>
+    /// Audit Log #23 (M5.4 — ParentExecutionId tag-cascade): builds a child
+    /// <see cref="ScriptRuntimeContext"/> for an inline <c>Scripts.CallShared</c>
+    /// invocation. The shared script runs inline (no actor hop) but is modelled
+    /// as its OWN execution node in the audit tree: it mints a fresh
+    /// <see cref="_executionId"/> and records THIS run's <see cref="_executionId"/>
+    /// as its <c>ParentExecutionId</c>, so <c>B → CallShared(C)</c> yields
+    /// <c>C.ParentExecutionId == B.ExecutionId</c>. Every other dependency
+    /// (actors, gateways, audit writer, site id, source node, call-depth) is
+    /// carried over verbatim from this context.
+    /// </summary>
+    /// <param name="childCallDepth">The recursion depth of the shared-script call.</param>
+    internal ScriptRuntimeContext CreateChildContextForSharedScript(int childCallDepth)
+    {
+        return new ScriptRuntimeContext(
+            _instanceActor,
+            _self,
+            _sharedScriptLibrary,
+            childCallDepth,
+            _maxCallDepth,
+            _askTimeout,
+            _instanceName,
+            _logger,
+            _externalSystemClient,
+            _databaseGateway,
+            _storeAndForward,
+            _siteCommunicationActor,
+            _siteId,
+            _sourceScript,
+            _auditWriter,
+            _operationTrackingStore,
+            _cachedForwarder,
+            // Fresh execution id for the shared-script run (omit so the ctor mints one)…
+            executionId: null,
+            // …parented to THIS run's execution id (the spawner).
+            parentExecutionId: _executionId,
+            sourceNode: _sourceNode,
+            siteEventLogger: _siteEventLogger,
+            // WaitForAttribute (spec §4.3): an inline shared-script call shares the
+            // parent run's execution-timeout token so a WaitAsync inside the shared
+            // script is bounded by the SAME script deadline.
+            scriptTimeoutToken: _scriptTimeoutToken);
    }

    /// <summary>
@@ -307,6 +385,115 @@ public class ScriptRuntimeContext
        return response.Value;
    }

+    /// <summary>
+    /// WaitForAttribute (spec §3-§5): waits event-driven for an attribute to reach
+    /// a value (encoded-equality), satisfy a site-local predicate, or change at all,
+    /// bounded by <paramref name="timeout"/>. Returns <c>true</c> if matched within
+    /// the timeout, <c>false</c> on timeout — NEVER throws on timeout. The backing
+    /// <c>Attributes.WaitAsync</c> for the accessor.
+    ///
+    /// <para>
+    /// The Ask is bounded by the script's own execution-timeout token (§4.3): a
+    /// script that hits its <c>ExecutionTimeoutSeconds</c> abandons the wait. The
+    /// Ask timeout is the wait timeout plus a small <see cref="_askTimeout"/> slack
+    /// so the InstanceActor's own scheduled timeout reply is the authoritative path
+    /// for the false/timed-out outcome, not the Ask deadline.
+    /// </para>
+    ///
+    /// <para>
+    /// <b>Quality-agnostic by default (spec §4.2):</b> a value arriving at Bad
+    /// quality still satisfies the wait — the match tests the value, not the quality.
+    /// A quality-gated ("Good"-only) mode is a planned enhancement, deferred per spec §4.2.
+    /// </para>
+    ///
+    /// <para>
+    /// <b>Never throws on timeout.</b> An <see cref="Akka.Actor.AskTimeoutException"/>
+    /// (the pathological case where the InstanceActor's authoritative timeout reply
+    /// never arrives — actor stopped/restarted) is caught and surfaced as <c>false</c>,
+    /// matching the timeout contract. An <see cref="OperationCanceledException"/> /
+    /// <see cref="TaskCanceledException"/> from the script-deadline token is NOT caught
+    /// — it propagates to abort the script (intended §4.3 behaviour).
+    /// </para>
+    /// </summary>
+    /// <param name="name">The scope-resolved attribute name to wait on.</param>
+    /// <param name="targetValueEncoded">
+    /// The codec-encoded target value; null (with null <paramref name="predicate"/>)
+    /// means "any change".
+    /// </param>
+    /// <param name="predicate">Site-local predicate; null when the encoded target is used.</param>
+    /// <param name="timeout">How long to wait before returning false.</param>
+    /// <param name="requireGoodQuality">
+    /// Quality-gated ("Good"-only) mode (spec §4.2): when <see langword="true"/>, a
+    /// value reaching the target / satisfying the predicate at Bad/Uncertain quality
+    /// is NOT a match — the wait holds until the value satisfies the test at Good
+    /// quality (or times out). Defaults to <see langword="false"/> (quality-agnostic).
+    /// </param>
+    /// <returns><c>true</c> on match within the timeout; <c>false</c> on timeout.</returns>
+    public async Task<bool> WaitAttribute(
+        string name, string? targetValueEncoded, Func<object?, bool>? predicate, TimeSpan timeout,
+        bool requireGoodQuality = false)
+        => (await WaitInternal(name, targetValueEncoded, predicate, timeout, requireGoodQuality)).Matched;
+
+    /// <summary>
+    /// WaitForAttribute (spec §3): the richer overload backing <c>Attributes.WaitForAsync</c>
+    /// — identical semantics to <see cref="WaitAttribute"/> but surfaces the full
+    /// <see cref="WaitResult"/> (matched flag + matched value + quality + timed-out
+    /// flag) instead of a bare bool. Never throws on timeout (see <see cref="WaitInternal"/>).
+    /// </summary>
+    /// <param name="name">The scope-resolved attribute name to wait on.</param>
+    /// <param name="targetValueEncoded">The codec-encoded target value; null (with null predicate) means "any change".</param>
+    /// <param name="predicate">Site-local predicate; null when the encoded target is used.</param>
+    /// <param name="timeout">How long to wait before returning a timed-out result.</param>
+    /// <param name="requireGoodQuality">Quality-gated ("Good"-only) mode (spec §4.2); defaults to <see langword="false"/>.</param>
+    /// <returns>The full <see cref="WaitResult"/> — on timeout: <c>Matched:false, TimedOut:true</c>.</returns>
+    public async Task<WaitResult> WaitAttributeFull(
+        string name, string? targetValueEncoded, Func<object?, bool>? predicate, TimeSpan timeout,
+        bool requireGoodQuality = false)
+    {
+        var r = await WaitInternal(name, targetValueEncoded, predicate, timeout, requireGoodQuality);
+        return new WaitResult(r.Matched, r.Value, r.Quality, r.TimedOut);
+    }
+
+    /// <summary>
+    /// Shared core for <see cref="WaitAttribute"/> / <see cref="WaitAttributeFull"/>:
+    /// builds the <see cref="WaitForAttributeRequest"/> (incl. the §4.2
+    /// <paramref name="requireGoodQuality"/> flag), Asks the InstanceActor bounded by
+    /// the script's execution-timeout token, and returns the full response. An
+    /// <see cref="AskTimeoutException"/> (the pathological case where the actor's own
+    /// authoritative timeout reply never arrives — actor stopped/restarted) is caught
+    /// and surfaced as a synthetic non-matched/timed-out response, preserving the
+    /// "never throw on timeout" contract. An <see cref="OperationCanceledException"/> /
+    /// <see cref="TaskCanceledException"/> from the script-deadline token is NOT caught
+    /// — it propagates to abort the script (§4.3).
+    /// </summary>
+    private async Task<WaitForAttributeResponse> WaitInternal(
+        string name, string? targetValueEncoded, Func<object?, bool>? predicate, TimeSpan timeout,
+        bool requireGoodQuality)
+    {
+        var cid = Guid.NewGuid().ToString();
+        var req = new WaitForAttributeRequest(
+            cid, _instanceName, name, targetValueEncoded, predicate, timeout, DateTimeOffset.UtcNow,
+            requireGoodQuality);
+
+        try
+        {
+            return await _instanceActor.Ask<WaitForAttributeResponse>(
+                req, timeout + _askTimeout, _scriptTimeoutToken);
+        }
+        catch (AskTimeoutException)
+        {
+            // Pathological: the InstanceActor's own scheduled timeout reply never
+            // arrived (e.g. the actor stopped/restarted under us). The helper's
+            // contract is "false on timeout, never throw" — so synthesize a
+            // non-matched/timed-out response rather than leaking the Ask exception.
+            // OperationCanceledException / TaskCanceledException from the
+            // script-deadline token are deliberately NOT caught here: they must
+            // propagate to abort the script (§4.3).
+            return new WaitForAttributeResponse(
+                cid, Matched: false, null, null, TimedOut: true);
+        }
+    }
+
    /// <summary>
    /// Sets an attribute value. For data-connected attributes the Instance Actor
    /// forwards the write to the DCL, which writes the physical device; the
@@ -366,7 +553,14 @@ public class ScriptRuntimeContext
            scriptName,
            ScriptArgs.Normalize(parameters),
            nextDepth,
-            correlationId);
+            correlationId,
+            // Audit Log #23 (M5.4 — ParentExecutionId tag-cascade): the child
+            // script run is a NEW execution spawned BY this run. Its parent is
+            // THIS run's own ExecutionId — NOT the inherited _parentExecutionId.
+            // So A → CallScript(B) yields B.ParentExecutionId == A.ExecutionId,
+            // building a true multi-level execution tree rather than flattening
+            // every nested call under the original inbound spawner.
+            ParentExecutionId: _executionId);

        // Ask the Instance Actor, which routes to the appropriate Script Actor
        var result = await _instanceActor.Ask<ScriptCallResult>(request, _askTimeout);
@@ -526,8 +720,14 @@ public class ScriptRuntimeContext
                throw new InvalidOperationException(msg);
            }

+            // Audit Log #23 (M5.4 — ParentExecutionId tag-cascade): the shared
+            // script runs inline, but is modelled as its OWN execution node — a
+            // child context mints a fresh ExecutionId parented to the caller's
+            // ExecutionId, so its audit rows chain under the calling run.
+            var childContext = _context.CreateChildContextForSharedScript(nextDepth);
+
            return await _library.ExecuteAsync(
-                scriptName, _context, ScriptArgs.Normalize(parameters), cancellationToken);
+                scriptName, childContext, ScriptArgs.Normalize(parameters), cancellationToken);
        }
    }