merge: integrate WaitAsync/M5-audit (parallel session) with galaxy array-write + inbound-timeout fixes

This commit is contained in:
Joseph Doherty
2026-06-17 09:28:15 -04:00
88 changed files with 7714 additions and 169 deletions
@@ -39,10 +39,12 @@ namespace ZB.MOM.WW.ScadaBridge.AuditLog.Central;
public sealed class AuditCentralHealthSnapshot
: IAuditCentralHealthSnapshot,
ICentralAuditWriteFailureCounter,
IAuditRedactionFailureCounter
IAuditRedactionFailureCounter,
IAuditInboundCeilingHitsCounter
{
private int _centralAuditWriteFailures;
private int _auditRedactionFailure;
private int _auditInboundCeilingHits;
private readonly ConcurrentDictionary<string, bool> _stalled = new();
/// <inheritdoc/>
@@ -53,6 +55,10 @@ public sealed class AuditCentralHealthSnapshot
public int AuditRedactionFailure =>
Interlocked.CompareExchange(ref _auditRedactionFailure, 0, 0);
/// <inheritdoc/>
public int AuditInboundCeilingHits =>
Interlocked.CompareExchange(ref _auditInboundCeilingHits, 0, 0);
/// <inheritdoc/>
public IReadOnlyDictionary<string, bool> SiteAuditTelemetryStalled =>
new Dictionary<string, bool>(_stalled);
@@ -78,4 +84,8 @@ public sealed class AuditCentralHealthSnapshot
/// <inheritdoc/>
void IAuditRedactionFailureCounter.Increment() =>
Interlocked.Increment(ref _auditRedactionFailure);
/// <inheritdoc/>
void IAuditInboundCeilingHitsCounter.Increment() =>
Interlocked.Increment(ref _auditInboundCeilingHits);
}
@@ -167,6 +167,9 @@ public class AuditLogPurgeActor : ReceiveActor
if (boundaries.Count == 0)
{
// No whole-month partitions are eligible, but per-channel overrides may
// still expire rows earlier than the global window — run them below.
await RunPerChannelOverridesAsync(repository).ConfigureAwait(false);
return;
}
@@ -202,6 +205,80 @@ public class AuditLogPurgeActor : ReceiveActor
sw.ElapsedMilliseconds);
}
}
// M5.5 (T3): after the channel-blind global partition switch-out, apply any
// per-channel retention overrides that are SHORTER than the global window via
// a bounded, batched row DELETE on the same maintenance path. The global
// switch-out has already dropped whole months older than RetentionDays; these
// deletes only ever expire rows EARLIER than that, so they run last and are a
// strict tightening.
await RunPerChannelOverridesAsync(repository).ConfigureAwait(false);
}
/// <summary>
/// M5.5 (T3): runs each per-channel retention override whose window is strictly
/// shorter than the global <see cref="AuditLogOptions.RetentionDays"/>, deleting
/// rows of that channel older than the channel-specific threshold via a bounded,
/// batched maintenance-path DELETE. Each channel runs inside its own try/catch so
/// one bad channel does not abandon the others on the same tick, mirroring the
/// per-boundary error isolation of the partition switch-out loop.
/// </summary>
/// <param name="repository">The repository resolved for this tick's DI scope.</param>
private async Task RunPerChannelOverridesAsync(IAuditLogRepository repository)
{
var overrides = _auditOptions.PerChannelRetentionDays;
if (overrides is null || overrides.Count == 0)
{
return;
}
var globalDays = _auditOptions.RetentionDays;
foreach (var (channel, days) in overrides)
{
// Only act when the per-channel window is strictly shorter than the global
// one. Equal/longer windows are already covered by the global partition
// switch-out, so a row DELETE would be redundant work (and a longer window
// is meaningless — the partition is dropped on the global schedule).
if (days >= globalDays)
{
continue;
}
var channelThreshold = DateTime.UtcNow - TimeSpan.FromDays(days);
var sw = Stopwatch.StartNew();
try
{
var rowsDeleted = await repository
.PurgeChannelOlderThanAsync(channel, channelThreshold, _purgeOptions.ChannelPurgeBatchSize)
.ConfigureAwait(false);
sw.Stop();
if (rowsDeleted > 0)
{
_logger.LogInformation(
"Purged {RowsDeleted} AuditLog rows for channel {Channel} older than {Threshold:o} " +
"(per-channel override {Days}d < global {GlobalDays}d) in {DurationMs} ms.",
rowsDeleted,
channel,
channelThreshold,
days,
globalDays,
sw.ElapsedMilliseconds);
}
}
catch (Exception ex)
{
sw.Stop();
_logger.LogError(
ex,
"Failed to apply per-channel retention override for channel {Channel} " +
"({Days}d); other channels continue. Elapsed {DurationMs} ms.",
channel,
days,
sw.ElapsedMilliseconds);
}
}
}
/// <summary>Self-tick triggering a purge pass across all eligible partitions.</summary>
@@ -28,6 +28,24 @@ public sealed class AuditLogPurgeOptions
/// <summary>Period of the purge tick in hours (default 24).</summary>
public int IntervalHours { get; set; } = 24;
/// <summary>
/// M5.5 (T3): batch size for the per-channel retention-override row DELETE
/// (<see cref="ZB.MOM.WW.ScadaBridge.Commons.Interfaces.Repositories.IAuditLogRepository.PurgeChannelOlderThanAsync"/>).
/// Each <c>DELETE TOP (@batch)</c> caps the transaction-log and lock footprint
/// per statement; the repository loops batches until no rows remain. Default
/// 5000 keeps individual deletes short on a busy central DB while still draining
/// a large backlog within a tick. Clamped to a sane minimum in
/// <see cref="ChannelPurgeBatchSize"/>.
/// </summary>
public int ChannelPurgeBatchSizeConfigured { get; set; } = 5000;
/// <summary>
/// Resolves the effective per-channel purge batch size, clamped to at least 1 so
/// a misconfigured <c>0</c>/negative value cannot make the repository's DELETE
/// loop spin or throw.
/// </summary>
public int ChannelPurgeBatchSize => ChannelPurgeBatchSizeConfigured < 1 ? 1 : ChannelPurgeBatchSizeConfigured;
/// <summary>
/// Test-only override for finer control over the tick cadence than
/// whole-hour resolution allows. When non-null, takes precedence over
@@ -50,6 +50,17 @@ public interface IAuditCentralHealthSnapshot
/// </summary>
int AuditRedactionFailure { get; }
/// <summary>
/// Count of inbound request/response body truncations at the
/// <see cref="ZB.MOM.WW.ScadaBridge.AuditLog.Configuration.AuditLogOptions.InboundMaxBytes"/>
/// ceiling since process start. Incremented by
/// <see cref="ZB.MOM.WW.ScadaBridge.InboundAPI.Middleware.AuditWriteMiddleware"/>
/// whenever either the request or response body exceeds the cap and is
/// truncated in the audit copy. A sustained non-zero count can indicate
/// callers sending unexpectedly large bodies.
/// </summary>
int AuditInboundCeilingHits { get; }
/// <summary>
/// Per-site latched stalled state: <c>true</c> when the
/// <see cref="SiteAuditReconciliationActor"/> has observed two
@@ -0,0 +1,24 @@
namespace ZB.MOM.WW.ScadaBridge.AuditLog.Central;
/// <summary>
/// Audit Log (#23) M5.3 (T7) counter sink incremented by
/// <see cref="ZB.MOM.WW.ScadaBridge.InboundAPI.Middleware.AuditWriteMiddleware"/>
/// whenever an inbound request or response body is truncated at the
/// <see cref="ZB.MOM.WW.ScadaBridge.AuditLog.Configuration.AuditLogOptions.InboundMaxBytes"/>
/// ceiling. Mirrors the <see cref="ICentralAuditWriteFailureCounter"/> shape:
/// one-method, NoOp default, must-never-abort-the-user-facing-action invariant.
/// </summary>
/// <remarks>
/// A ceiling hit is a normal operational event (the caller sent a large
/// body) rather than a failure, but surfacing a cumulative count lets
/// operators detect over-size callers early. The
/// <see cref="AuditCentralHealthSnapshot"/> production implementation
/// accumulates the count via an <c>Interlocked</c> field alongside
/// <see cref="ICentralAuditWriteFailureCounter"/> and
/// <see cref="ZB.MOM.WW.ScadaBridge.AuditLog.Payload.IAuditRedactionFailureCounter"/>.
/// </remarks>
public interface IAuditInboundCeilingHitsCounter
{
/// <summary>Increment the inbound body-ceiling hit counter by one.</summary>
void Increment();
}
@@ -0,0 +1,13 @@
namespace ZB.MOM.WW.ScadaBridge.AuditLog.Central;
/// <summary>
/// Default <see cref="IAuditInboundCeilingHitsCounter"/> binding used when
/// the central health snapshot is not wired (e.g. site composition roots,
/// test harnesses that have no health dashboard). All increments are silently
/// dropped — correct for environments that have no audit KPI surface.
/// </summary>
public sealed class NoOpAuditInboundCeilingHitsCounter : IAuditInboundCeilingHitsCounter
{
/// <inheritdoc/>
public void Increment() { }
}
@@ -37,6 +37,33 @@ public sealed class AuditLogOptions
/// <summary>Central retention window in days (default 365, range [30, 3650]).</summary>
public int RetentionDays { get; set; } = 365;
/// <summary>
/// M5.5 (T3) per-channel retention overrides, keyed by the canonical channel name
/// (the <see cref="AuditChannel"/> enum name — e.g. <c>ApiOutbound</c>,
/// <c>DbOutbound</c>, <c>Notification</c>, <c>ApiInbound</c>). The value is a
/// retention window in days that MUST be SHORTER than or equal to the global
/// <see cref="RetentionDays"/>.
/// </summary>
/// <remarks>
/// <para>
/// The global <see cref="RetentionDays"/> window is enforced by month-partition
/// switch-out, which is channel-blind: it can only drop a whole month once every
/// row in it is older than the global window. A per-channel override therefore
/// can only ever expire rows EARLIER than the global purge would — never later
/// (a longer per-channel window is meaningless because the partition switch-out
/// would already have dropped the month). Overrides shorter than the global window
/// are honoured by the purge actor as a bounded, batched row DELETE on the
/// maintenance path (see <c>AuditLogPurgeActor</c>); the append-only writer/ingest
/// role is unaffected.
/// </para>
/// <para>
/// Each value is validated to be in <c>[30, RetentionDays]</c> by
/// <c>AuditLogOptionsValidator</c>; keys that are not recognized
/// <see cref="AuditChannel"/> names are rejected.
/// </para>
/// </remarks>
public Dictionary<string, int> PerChannelRetentionDays { get; set; } = new();
/// <summary>
/// Per-body byte ceiling applied to <see cref="AuditEvent.RequestSummary"/> and
/// <see cref="AuditEvent.ResponseSummary"/> for <see cref="AuditChannel.ApiInbound"/> rows
@@ -1,4 +1,5 @@
using ZB.MOM.WW.Configuration;
using ZB.MOM.WW.ScadaBridge.Commons.Types.Enums;
namespace ZB.MOM.WW.ScadaBridge.AuditLog.Configuration;
@@ -52,5 +53,27 @@ public sealed class AuditLogOptionsValidator : OptionsValidatorBase<AuditLogOpti
!(options.InboundMaxBytes < MinInboundMaxBytes || options.InboundMaxBytes > MaxInboundMaxBytes),
$"AuditLog:{nameof(AuditLogOptions.InboundMaxBytes)} ({options.InboundMaxBytes}) " +
$"must be in [{MinInboundMaxBytes}, {MaxInboundMaxBytes}] bytes.");
// M5.5 (T3): per-channel retention overrides. Each entry must be keyed by a
// recognized AuditChannel name and carry a window in [MinRetentionDays,
// RetentionDays] — i.e. SHORTER than or equal to the global window. A longer
// per-channel window is meaningless under month-partition switch-out (governed
// by the global window), so it is rejected rather than silently ignored.
foreach (var (channelKey, days) in options.PerChannelRetentionDays)
{
builder.RequireThat(
Enum.TryParse<AuditChannel>(channelKey, ignoreCase: false, out _),
$"AuditLog:{nameof(AuditLogOptions.PerChannelRetentionDays)} key '{channelKey}' " +
$"is not a recognized channel name. Valid keys: {string.Join(", ", Enum.GetNames<AuditChannel>())}.");
// Valid when days is within [MinRetentionDays, RetentionDays] inclusive.
// The lower bound matches the global RetentionDays floor; the upper bound
// is the configured global window (longer is meaningless — see remarks).
builder.RequireThat(
!(days < MinRetentionDays || days > options.RetentionDays),
$"AuditLog:{nameof(AuditLogOptions.PerChannelRetentionDays)}['{channelKey}'] ({days}) " +
$"must be in [{MinRetentionDays}, {nameof(AuditLogOptions.RetentionDays)}={options.RetentionDays}] days " +
"— a per-channel window must be shorter than or equal to the global retention window.");
}
}
}
@@ -25,4 +25,15 @@ public sealed class PerTargetRedactionOverride
/// rows.
/// </summary>
public string? RedactSqlParamsMatching { get; set; }
/// <summary>
/// When <c>true</c>, the inbound API audit row for this target records
/// request/response headers and metadata (status, duration, actor, etc.)
/// but the request and response body strings are omitted
/// (<c>RequestSummary</c> / <c>ResponseSummary</c> are left null). The
/// audit row itself is always emitted — only the body content is suppressed.
/// Null (the default, equivalent to <c>false</c>) means body capture
/// proceeds normally up to <see cref="AuditLogOptions.InboundMaxBytes"/>.
/// </summary>
public bool SkipBodyCapture { get; set; }
}
@@ -200,6 +200,13 @@ public static class ServiceCollectionExtensions
// surface on the central dashboard.
services.TryAddSingleton<ICentralAuditWriteFailureCounter, NoOpCentralAuditWriteFailureCounter>();
// M5.3 (T7): inbound body-ceiling hit counter — NoOp default for
// site/test roots. AddAuditLogCentralMaintenance replaces this binding
// with the AuditCentralHealthSnapshot implementation so ceiling-hit
// counts surface on the central dashboard alongside write-failure and
// redaction-failure counters.
services.TryAddSingleton<IAuditInboundCeilingHitsCounter, NoOpAuditInboundCeilingHitsCounter>();
// M4 Bundle B: central direct-write audit writer used by
// NotificationOutboxActor (Bundle B) and Inbound API (Bundle C/D) to
// emit AuditLog rows that originate ON central, not via site telemetry.
@@ -383,6 +390,12 @@ public static class ServiceCollectionExtensions
// HealthMetricsAuditRedactionFailureCounter shape one-for-one.
services.Replace(ServiceDescriptor.Singleton<IAuditRedactionFailureCounter,
CentralAuditRedactionFailureCounter>());
// M5.3 (T7): replace the NoOp IAuditInboundCeilingHitsCounter with the
// AuditCentralHealthSnapshot so ceiling-hit counts surface on the
// central dashboard. Same singleton-forward pattern as
// ICentralAuditWriteFailureCounter above.
services.Replace(ServiceDescriptor.Singleton<IAuditInboundCeilingHitsCounter>(
sp => sp.GetRequiredService<AuditCentralHealthSnapshot>()));
return services;
}
@@ -0,0 +1,113 @@
using System.Text;
using System.Text.Json;
namespace ZB.MOM.WW.ScadaBridge.CLI.Commands;
/// <summary>
/// Arguments for an <c>audit backfill-source-node</c> invocation.
/// </summary>
public sealed class AuditBackfillSourceNodeArgs
{
/// <summary>
/// Value written into <c>SourceNode</c> for NULL rows (default <c>"unknown"</c>).
/// </summary>
public string Sentinel { get; set; } = "unknown";
/// <summary>
/// Only rows with <c>OccurredAtUtc</c> strictly before this UTC datetime are
/// eligible. Required — must be an ISO-8601 UTC datetime.
/// </summary>
public string Before { get; set; } = string.Empty;
/// <summary>
/// Maximum rows updated per batch (default 5000). Caps the per-transaction
/// log footprint; the loop repeats until no rows remain.
/// </summary>
public int BatchSize { get; set; } = 5000;
}
/// <summary>
/// Pure helpers for the <c>audit backfill-source-node</c> subcommand (Audit Log
/// #23 M5.6 T5). Builds the request body, POSTs to
/// <c>/api/audit/backfill-source-node</c>, and renders the result. Kept separate
/// from the command wiring so each piece is unit-testable without standing up the
/// command tree.
/// </summary>
public static class AuditBackfillHelpers
{
private static readonly JsonSerializerOptions JsonWriteOptions = new()
{
WriteIndented = true,
};
/// <summary>
/// Builds the JSON request body for <c>POST /api/audit/backfill-source-node</c>.
/// </summary>
/// <param name="args">The backfill arguments.</param>
/// <returns>A JSON string suitable for the request body.</returns>
public static string BuildRequestBody(AuditBackfillSourceNodeArgs args)
{
var obj = new
{
sentinel = args.Sentinel,
before = args.Before,
batchSize = args.BatchSize,
};
return JsonSerializer.Serialize(obj);
}
/// <summary>
/// Executes the backfill: POSTs <c>/api/audit/backfill-source-node</c> and
/// prints the result. Returns the process exit code (0 = success,
/// 1 = error, 2 = authorization failure).
/// </summary>
/// <param name="client">The management HTTP client.</param>
/// <param name="args">The backfill arguments.</param>
/// <param name="output">The output writer for results.</param>
/// <returns>A task that resolves to the process exit code.</returns>
public static async Task<int> RunBackfillAsync(
ManagementHttpClient client,
AuditBackfillSourceNodeArgs args,
TextWriter output)
{
var body = BuildRequestBody(args);
var response = await client.SendPostAsync(
"api/audit/backfill-source-node", body, TimeSpan.FromMinutes(10));
if (response.JsonData == null)
{
OutputFormatter.WriteError(
response.Error ?? "Backfill request failed.", response.ErrorCode ?? "ERROR");
return CommandHelpers.IsAuthorizationFailure(response) ? 2 : 1;
}
// Parse and display the result.
try
{
using var doc = JsonDocument.Parse(response.JsonData);
var root = doc.RootElement;
var rowsUpdated = root.TryGetProperty("rowsUpdated", out var r)
? r.GetInt64()
: 0L;
var sentinel = root.TryGetProperty("sentinel", out var s)
? s.GetString() ?? args.Sentinel
: args.Sentinel;
var before = root.TryGetProperty("before", out var b)
? b.GetString() ?? args.Before
: args.Before;
output.WriteLine($"SourceNode backfill complete.");
output.WriteLine($" rows updated : {rowsUpdated}");
output.WriteLine($" sentinel : {sentinel}");
output.WriteLine($" before : {before}");
}
catch (JsonException)
{
// Server returned success but non-JSON body — not expected; print raw.
output.WriteLine(response.JsonData);
}
output.Flush();
return 0;
}
}
@@ -6,13 +6,15 @@ namespace ZB.MOM.WW.ScadaBridge.CLI.Commands;
/// <summary>
/// The <c>scadabridge audit</c> command group (Audit Log #23 M8). Provides read access to
/// the centralized append-only Audit Log via the Bundle B REST endpoints
/// (<c>GET /api/audit/query</c>, <c>GET /api/audit/export</c>), plus a v1 no-op
/// <c>verify-chain</c> placeholder for the deferred hash-chain tamper-evidence feature.
/// (<c>GET /api/audit/query</c>, <c>GET /api/audit/export</c>,
/// <c>GET /api/audit/tree</c>), plus a v1 no-op <c>verify-chain</c> placeholder
/// for the deferred hash-chain tamper-evidence feature.
/// </summary>
public static class AuditCommands
{
/// <summary>
/// Builds the <c>audit</c> command group with query, export, and verify-chain sub-commands.
/// Builds the <c>audit</c> command group with query, export, tree, and verify-chain
/// sub-commands.
/// </summary>
/// <param name="urlOption">Global <c>--url</c> option for the management API endpoint.</param>
/// <param name="formatOption">Global <c>--format</c> option for output format.</param>
@@ -25,7 +27,9 @@ public static class AuditCommands
command.Add(BuildQuery(urlOption, formatOption, usernameOption, passwordOption));
command.Add(BuildExport(urlOption, formatOption, usernameOption, passwordOption));
command.Add(BuildTree(urlOption, formatOption, usernameOption, passwordOption));
command.Add(BuildVerifyChain(urlOption, formatOption, usernameOption, passwordOption));
command.Add(BuildBackfillSourceNode(urlOption, formatOption, usernameOption, passwordOption));
return command;
}
@@ -224,6 +228,44 @@ public static class AuditCommands
return cmd;
}
private static Command BuildTree(Option<string> urlOption, Option<string> formatOption, Option<string> usernameOption, Option<string> passwordOption)
{
var executionIdOption = new Option<string>("--execution-id")
{
Description = "Execution ID (GUID) to look up — may be any node in the chain",
Required = true,
};
var cmd = new Command("tree") { Description = "Display the full execution-chain tree for an audit execution" };
cmd.Add(executionIdOption);
cmd.SetAction(async (ParseResult result) =>
{
var connection = AuditCommandHelpers.ResolveConnection(result, urlOption, usernameOption, passwordOption);
if (connection.Error != null)
{
OutputFormatter.WriteError(connection.Error, connection.ErrorCode!);
return 1;
}
var rawId = result.GetValue(executionIdOption);
if (!Guid.TryParse(rawId, out var executionId))
{
OutputFormatter.WriteError(
$"Invalid execution ID '{rawId}'. Expected a GUID (e.g. 11111111-1111-1111-1111-111111111111).",
"INVALID_ARGUMENT");
return 1;
}
var format = AuditCommandHelpers.ResolveFormat(result, formatOption);
using var client = new ManagementHttpClient(connection.Url!, connection.Username!, connection.Password!);
return await AuditTreeHelpers.RunTreeAsync(client, executionId, format, Console.Out);
});
return cmd;
}
private static Command BuildVerifyChain(Option<string> urlOption, Option<string> formatOption, Option<string> usernameOption, Option<string> passwordOption)
{
var monthOption = new Option<string>("--month") { Description = "Month to verify (YYYY-MM)", Required = true };
@@ -247,4 +289,76 @@ public static class AuditCommands
});
return cmd;
}
/// <summary>
/// Builds the <c>audit backfill-source-node</c> sub-command (Audit Log #23 M5.6 T5).
/// Sets <c>SourceNode</c> on historical pre-feature rows whose <c>SourceNode IS NULL</c>
/// and <c>OccurredAtUtc</c> is older than <c>--before</c>, in batches. Admin-only.
/// </summary>
private static Command BuildBackfillSourceNode(Option<string> urlOption, Option<string> formatOption, Option<string> usernameOption, Option<string> passwordOption)
{
var sentinelOption = new Option<string>("--sentinel")
{
Description = "Value to write for pre-feature rows whose node-of-origin is unknown (default: unknown)",
};
sentinelOption.DefaultValueFactory = _ => "unknown";
var beforeOption = new Option<string>("--before")
{
Description = "ISO-8601 UTC datetime; only rows older than this date are eligible (required)",
Required = true,
};
var batchOption = new Option<int>("--batch")
{
Description = "Max rows updated per batch (default: 5000)",
};
batchOption.DefaultValueFactory = _ => 5000;
var cmd = new Command("backfill-source-node")
{
Description = "Set SourceNode to a sentinel value on pre-feature rows where it is NULL (admin-only, maintenance path)",
};
cmd.Add(sentinelOption);
cmd.Add(beforeOption);
cmd.Add(batchOption);
cmd.SetAction(async (ParseResult result) =>
{
var connection = AuditCommandHelpers.ResolveConnection(result, urlOption, usernameOption, passwordOption);
if (connection.Error != null)
{
OutputFormatter.WriteError(connection.Error, connection.ErrorCode!);
return 1;
}
var sentinel = result.GetValue(sentinelOption) ?? "unknown";
var before = result.GetValue(beforeOption)!;
var batch = result.GetValue(batchOption);
if (string.IsNullOrWhiteSpace(sentinel))
{
OutputFormatter.WriteError("--sentinel must be a non-empty string.", "INVALID_ARGUMENT");
return 1;
}
if (batch <= 0)
{
OutputFormatter.WriteError("--batch must be > 0.", "INVALID_ARGUMENT");
return 1;
}
var args = new AuditBackfillSourceNodeArgs
{
Sentinel = sentinel,
Before = before,
BatchSize = batch,
};
using var client = new ManagementHttpClient(connection.Url!, connection.Username!, connection.Password!);
return await AuditBackfillHelpers.RunBackfillAsync(client, args, Console.Out);
});
return cmd;
}
}
@@ -0,0 +1,208 @@
using System.Text;
using System.Text.Json;
namespace ZB.MOM.WW.ScadaBridge.CLI.Commands;
/// <summary>
/// Arguments for an <c>audit tree</c> invocation.
/// </summary>
public sealed class AuditTreeArgs
{
/// <summary>
/// The execution ID (GUID) to look up. May be any node in the chain — the
/// server walks to the root and returns the full tree.
/// </summary>
public string ExecutionId { get; set; } = string.Empty;
}
/// <summary>
/// Represents one execution node as returned by <c>GET /api/audit/tree</c>.
/// Property names match the server's camelCase JSON serialisation of
/// <c>ExecutionTreeNode</c>.
/// </summary>
internal sealed class AuditTreeNodeDto
{
public Guid ExecutionId { get; init; }
public Guid? ParentExecutionId { get; init; }
public int RowCount { get; init; }
public string[] Channels { get; init; } = Array.Empty<string>();
public string[] Statuses { get; init; } = Array.Empty<string>();
public string? SourceSiteId { get; init; }
public string? SourceInstanceId { get; init; }
public DateTime? FirstOccurredAtUtc { get; init; }
public DateTime? LastOccurredAtUtc { get; init; }
}
/// <summary>
/// Pure helpers for the <c>audit tree</c> subcommand: builds the query string,
/// calls <c>GET /api/audit/tree</c>, and renders the result as either an
/// indented ASCII tree (table format) or raw JSON. Kept separate from the
/// command wiring so each piece is unit-testable without standing up the
/// command tree.
/// </summary>
public static class AuditTreeHelpers
{
private static readonly JsonSerializerOptions JsonReadOptions = new()
{
PropertyNameCaseInsensitive = true,
};
private static readonly JsonSerializerOptions JsonWriteOptions = new()
{
PropertyNamingPolicy = JsonNamingPolicy.CamelCase,
WriteIndented = true,
};
/// <summary>
/// Builds the query string for <c>GET /api/audit/tree</c>.
/// </summary>
/// <param name="executionId">The execution ID GUID.</param>
/// <returns>A relative path + query string ready to append to the base URL.</returns>
public static string BuildUrl(Guid executionId)
=> $"api/audit/tree?executionId={executionId:D}";
/// <summary>
/// Executes the tree lookup: GETs <c>/api/audit/tree</c> and renders the result
/// in the requested format. Returns the process exit code (0 = success,
/// 1 = error, 2 = authorization failure).
/// </summary>
/// <param name="client">The management HTTP client.</param>
/// <param name="executionId">The execution ID to look up.</param>
/// <param name="format">"table" (default) or "json".</param>
/// <param name="output">The output writer for results.</param>
/// <returns>A task that resolves to the process exit code.</returns>
public static async Task<int> RunTreeAsync(
ManagementHttpClient client,
Guid executionId,
string format,
TextWriter output)
{
var url = BuildUrl(executionId);
var response = await client.SendGetAsync(url, TimeSpan.FromSeconds(30));
if (response.JsonData == null)
{
OutputFormatter.WriteError(
response.Error ?? "Audit tree request failed.", response.ErrorCode ?? "ERROR");
return CommandHelpers.IsAuthorizationFailure(response) ? 2 : 1;
}
var nodes = ParseNodes(response.JsonData);
if (format == "json")
{
WriteJson(nodes, output);
}
else
{
WriteTable(nodes, executionId, output);
}
output.Flush();
return 0;
}
/// <summary>
/// Parses the JSON array from the server into an array of
/// <see cref="AuditTreeNodeDto"/>.
/// </summary>
/// <param name="json">The raw JSON response body.</param>
/// <returns>An array of deserialized tree nodes (empty on parse failure).</returns>
internal static AuditTreeNodeDto[] ParseNodes(string json)
{
try
{
return JsonSerializer.Deserialize<AuditTreeNodeDto[]>(json, JsonReadOptions)
?? Array.Empty<AuditTreeNodeDto>();
}
catch (JsonException)
{
return Array.Empty<AuditTreeNodeDto>();
}
}
/// <summary>
/// Renders the nodes as pretty-printed JSON to <paramref name="output"/>.
/// </summary>
internal static void WriteJson(AuditTreeNodeDto[] nodes, TextWriter output)
{
output.WriteLine(JsonSerializer.Serialize(nodes, JsonWriteOptions));
}
/// <summary>
/// Renders the nodes as an indented ASCII tree. The root node (null
/// <c>ParentExecutionId</c>) is printed first; each child is indented
/// two spaces per depth level. The queried/entry-point node is marked
/// with <c> [*]</c>.
/// </summary>
internal static void WriteTable(
AuditTreeNodeDto[] nodes,
Guid queriedExecutionId,
TextWriter output)
{
if (nodes.Length == 0)
{
output.WriteLine("(no execution tree found)");
return;
}
// Build a parent → children lookup (keyed by non-null parent Guid).
// Nodes whose ParentExecutionId is null are roots and are not placed in
// the lookup; they are identified separately below.
var childrenOf = new Dictionary<Guid, List<AuditTreeNodeDto>>();
foreach (var node in nodes)
{
if (node.ParentExecutionId is { } parentId)
{
if (!childrenOf.ContainsKey(parentId))
childrenOf[parentId] = new List<AuditTreeNodeDto>();
childrenOf[parentId].Add(node);
}
}
// Identify roots: nodes whose ParentExecutionId is null, or whose parent
// is not present in the node set (stub-root case).
var nodeIds = new HashSet<Guid>(nodes.Select(n => n.ExecutionId));
var roots = nodes
.Where(n => n.ParentExecutionId == null || !nodeIds.Contains(n.ParentExecutionId.Value))
.ToList();
// Render depth-first.
var sb = new StringBuilder();
foreach (var root in roots)
{
RenderNode(root, depth: 0, childrenOf, queriedExecutionId, sb);
}
output.Write(sb.ToString());
}
private static void RenderNode(
AuditTreeNodeDto node,
int depth,
Dictionary<Guid, List<AuditTreeNodeDto>> childrenOf,
Guid queriedExecutionId,
StringBuilder sb)
{
var indent = new string(' ', depth * 2);
var marker = node.ExecutionId == queriedExecutionId ? " [*]" : string.Empty;
var channels = node.Channels.Length > 0 ? string.Join(",", node.Channels) : "-";
var statuses = node.Statuses.Length > 0 ? string.Join(",", node.Statuses) : "-";
var site = node.SourceSiteId ?? "-";
var instance = node.SourceInstanceId ?? "-";
var first = node.FirstOccurredAtUtc.HasValue
? node.FirstOccurredAtUtc.Value.ToString("yyyy-MM-ddTHH:mm:ssZ")
: "-";
sb.AppendLine(
$"{indent}{node.ExecutionId:D}{marker} rows={node.RowCount} channels=[{channels}] statuses=[{statuses}] site={site} instance={instance} first={first}");
if (childrenOf.TryGetValue(node.ExecutionId, out var children))
{
foreach (var child in children)
{
RenderNode(child, depth + 1, childrenOf, queriedExecutionId, sb);
}
}
}
}
@@ -142,6 +142,60 @@ public class ManagementHttpClient : IDisposable
return new ManagementResponse((int)httpResponse.StatusCode, null, error, code);
}
/// <summary>
/// Issues a plain HTTP <c>POST</c> against a REST endpoint (e.g. the audit
/// maintenance endpoints) with a JSON body and returns the response. Unlike
/// <see cref="SendCommandAsync"/>, this does not wrap the call in the
/// <c>POST /management</c> command envelope — these are plain REST resources.
/// Authentication (HTTP Basic) and the base address are shared.
/// </summary>
/// <param name="relativePath">Path relative to the base URL.</param>
/// <param name="body">The JSON body to send, or <c>null</c> for an empty body.</param>
/// <param name="timeout">The request timeout.</param>
/// <returns>A management response containing status and data.</returns>
public async Task<ManagementResponse> SendPostAsync(string relativePath, string? body, TimeSpan timeout)
{
using var cts = new CancellationTokenSource(timeout);
var content = new StringContent(body ?? "{}", Encoding.UTF8, "application/json");
HttpResponseMessage httpResponse;
try
{
httpResponse = await _httpClient.PostAsync(relativePath, content, cts.Token);
}
catch (TaskCanceledException)
{
return new ManagementResponse(504, null, "Request timed out.", "TIMEOUT");
}
catch (HttpRequestException ex)
{
return new ManagementResponse(0, null, $"Connection failed: {ex.Message}", "CONNECTION_FAILED");
}
var responseBody = await httpResponse.Content.ReadAsStringAsync(cts.Token);
if (httpResponse.IsSuccessStatusCode)
{
return new ManagementResponse((int)httpResponse.StatusCode, responseBody, null, null);
}
string? error = null;
string? code = null;
try
{
using var doc = JsonDocument.Parse(responseBody);
error = doc.RootElement.TryGetProperty("error", out var e) ? e.GetString() : responseBody;
code = doc.RootElement.TryGetProperty("code", out var c) ? c.GetString() : null;
}
catch
{
error = responseBody;
}
return new ManagementResponse((int)httpResponse.StatusCode, null, error, code);
}
/// <summary>
/// Issues a plain HTTP <c>GET</c> and returns the raw <see cref="HttpResponseMessage"/>
/// so the caller can stream the response body without buffering it in memory — used
+56 -13
View File
@@ -1269,15 +1269,18 @@ script-trust-boundary action: outbound API calls (sync + cached), outbound DB
operations (sync + cached), notifications, and inbound API calls. This is distinct
from the configuration-change audit trail exposed by [`audit-config`](#audit-config--configuration-change-audit-log).
The subcommands map directly onto the `GET /api/audit/query` and
`GET /api/audit/export` management endpoints. Filters and the result columns mirror
the Central UI **Audit** page, so a CLI query and a UI query with the same filters
return the same rows — CLI ↔ UI filter parity is intentional.
The subcommands map directly onto the `GET /api/audit/query`,
`GET /api/audit/export`, `GET /api/audit/tree`, and
`POST /api/audit/backfill-source-node` management endpoints. Filters and the
result columns mirror the Central UI **Audit** page, so a CLI query and a UI
query with the same filters return the same rows — CLI ↔ UI filter parity is
intentional.
**Permissions.** Querying requires the `OperationalAudit` permission (roles `Admin`,
`Audit`, or `AuditReadOnly`). Exporting requires the stricter `AuditExport` permission
(roles `Admin` or `Audit`) — read access does *not* imply export access. A request
without the required role returns exit code `2`.
**Permissions.** Querying and tree traversal require the `OperationalAudit`
permission (roles `Admin`, `Audit`, or `AuditReadOnly`). Exporting requires the
stricter `AuditExport` permission (roles `Admin` or `Audit`) — read access does
*not* imply export access. The `backfill-source-node` maintenance command requires
the `Admin` role. A request without the required role returns exit code `2`.
#### `audit query`
@@ -1342,6 +1345,46 @@ scadabridge --url <url> audit export --since <time> --until <time> --format <fmt
> Implemented` — Parquet archival is deferred to v1.x (see `Component-AuditLog.md`).
> Use `csv` or `jsonl`.
#### `audit tree` (M5.3 T8)
Display the full execution-chain tree for a given execution ID. The server walks
`ParentExecutionId` to find the root, then traverses downward to collect all
reachable executions in the chain.
```sh
scadabridge --url <url> audit tree --execution-id <guid> [--format table|json]
```
| Option | Required | Default | Description |
|--------|----------|---------|-------------|
| `--execution-id` | yes | — | Any `ExecutionId` in the chain (root or child) |
| `--format` | no | `json` | Output format: `json` (structured tree) or `table` (indented tree) |
The `--execution-id` can be any node in the chain — the server resolves the root
automatically. With `--format table` the tree is printed as an indented text
representation. With `--format json` (the default) a structured JSON tree is
returned, suitable for scripting. Backed by `GET /api/audit/tree?executionId=<guid>`.
Requires `OperationalAudit` permission.
#### `audit backfill-source-node` (M5.6 T5)
Set `SourceNode` to a sentinel value on pre-feature rows where `SourceNode IS NULL`
and `OccurredAtUtc` is older than `--before`. Admin-only maintenance command.
```sh
scadabridge --url <url> audit backfill-source-node --before <ISO-8601-UTC> [--sentinel <value>] [--batch <n>]
```
| Option | Required | Default | Description |
|--------|----------|---------|-------------|
| `--before` | yes | — | ISO-8601 UTC datetime; only rows older than this date are eligible |
| `--sentinel` | no | `unknown` | Value to write (must be non-empty) |
| `--batch` | no | `5000` | Max rows updated per batch; controls transaction size |
The command is idempotent — running it multiple times converges (only rows where
`SourceNode IS NULL` are eligible; already-set rows are untouched). Backed by
`POST /api/audit/backfill-source-node`. Requires `Admin` role.
#### `audit verify-chain`
Verify the audit log hash chain for a given month.
@@ -1354,11 +1397,11 @@ scadabridge --url <url> audit verify-chain --month <YYYY-MM>
|--------|----------|---------|-------------|
| `--month` | yes | — | Month to verify, `YYYY-MM` (e.g. `2026-05`) |
> **v1 no-op.** Hash-chain tamper-evidence is not enabled in this release. The
> subcommand validates the `--month` argument and prints a notice pointing at the
> v1.x roadmap in `Component-AuditLog.md`; it exits `0` without contacting the server.
> The command exists now so scripts and operator habits do not need to change when
> tamper-evidence ships.
> **v1 no-op.** Hash-chain tamper-evidence is not enabled in this release (T1
> deferred to v1.x). The subcommand validates the `--month` argument and prints a
> notice pointing at the v1.x roadmap in `Component-AuditLog.md`; it exits `0`
> without contacting the server. The command exists now so scripts and operator
> habits do not need to change when tamper-evidence ships.
---
@@ -58,3 +58,31 @@
{
<div class="text-muted small mb-3">Site Call KPIs unavailable: @ErrorMessage</div>
}
@* ── Per-node stuck/parked sub-table (T6: M5.2 per-node stuck-count KPIs) ── *@
@if (HasNodeBreakdown)
{
<div class="mb-3">
<div class="d-flex justify-content-between align-items-center mb-1">
<small class="text-muted">By node</small>
</div>
<table class="table table-sm table-borderless mb-0 site-call-kpi-node-table">
<thead class="table-light">
<tr>
<th class="small py-1">Node</th>
<th class="text-end small py-1">Stuck</th>
<th class="text-end small py-1">Parked</th>
</tr>
</thead>
<tbody>
@foreach (var n in PerNodeSnapshots!)
{
<tr @key="n.SourceNode">
<td class="small py-1"><code>@n.SourceNode</code></td>
<td class="text-end font-monospace small py-1 @(n.StuckCount > 0 ? "text-warning" : "")">@n.StuckCount</td>
<td class="text-end font-monospace small py-1 @(n.ParkedCount > 0 ? "text-danger" : "")">@n.ParkedCount</td>
</tr>
}
</tbody>
</table>
</div>
}
@@ -1,5 +1,6 @@
using Microsoft.AspNetCore.Components;
using ZB.MOM.WW.ScadaBridge.Commons.Messages.Audit;
using ZB.MOM.WW.ScadaBridge.Commons.Types.Audit;
namespace ZB.MOM.WW.ScadaBridge.CentralUI.Components.Health;
@@ -59,6 +60,24 @@ public partial class SiteCallKpiTiles
/// </summary>
[Parameter] public string? ErrorMessage { get; set; }
/// <summary>
/// Optional per-node KPI breakdown (T6: M5.2 per-node stuck-count KPIs).
/// When non-null and non-empty, a compact node-level stuck/parked sub-table
/// is rendered below the main tiles. <c>null</c> means the parent has not
/// loaded it yet or has opted out — the sub-table is suppressed entirely.
/// </summary>
[Parameter] public IReadOnlyList<SiteCallNodeKpiSnapshot>? PerNodeSnapshots { get; set; }
/// <summary>
/// True when <see cref="PerNodeSnapshots"/> is a successful query result.
/// Used to suppress the sub-table on a load failure.
/// </summary>
[Parameter] public bool PerNodeAvailable { get; set; }
/// <summary>Whether the per-node sub-table has data to render.</summary>
internal bool HasNodeBreakdown =>
PerNodeAvailable && PerNodeSnapshots is { Count: > 0 };
// ── Buffered tile ───────────────────────────────────────────────────────
private string BufferedDisplay =>
@@ -9,6 +9,7 @@
@using ZB.MOM.WW.ScadaBridge.HealthMonitoring
@using ZB.MOM.WW.ScadaBridge.Commons.Messages.Notification
@using ZB.MOM.WW.ScadaBridge.Commons.Messages.Audit
@using ZB.MOM.WW.ScadaBridge.Commons.Types.Audit
@using ZB.MOM.WW.ScadaBridge.Communication
@implements IDisposable
@inject ICentralHealthAggregator HealthAggregator
@@ -65,7 +66,9 @@
(buffered / stuck / parked). Refreshed alongside the site states. *@
<SiteCallKpiTiles Snapshot="@_siteCallKpi"
IsAvailable="@_siteCallKpiAvailable"
ErrorMessage="@_siteCallKpiError" />
ErrorMessage="@_siteCallKpiError"
PerNodeSnapshots="@_siteCallNodeKpis"
PerNodeAvailable="@_siteCallNodeKpiAvailable" />
@* Audit Log (#23) M7 Bundle E — three KPI tiles for the Audit channel
(volume / error rate / backlog). Refreshed alongside the site states. *@
@@ -378,6 +381,12 @@
private bool _siteCallKpiAvailable;
private string? _siteCallKpiError;
// Per-node Site Call KPI breakdown (T6: M5.2 per-node stuck-count KPIs).
// Passed to SiteCallKpiTiles as an optional sub-table.
private IReadOnlyList<SiteCallNodeKpiSnapshot> _siteCallNodeKpis =
Array.Empty<SiteCallNodeKpiSnapshot>();
private bool _siteCallNodeKpiAvailable;
private static bool SiteHasActiveErrors(SiteHealthState state)
{
var report = state.LatestReport;
@@ -415,7 +424,7 @@
{
_siteStates = HealthAggregator.GetAllSiteStates();
await LoadOutboxKpis();
await LoadSiteCallKpis();
await Task.WhenAll(LoadSiteCallKpis(), LoadSiteCallNodeKpis());
await LoadAuditKpis();
}
@@ -474,6 +483,30 @@
}
}
// Per-node site-call KPI loader (T6: M5.2). Best-effort; a fault silently
// suppresses the per-node sub-table rather than degrading the dashboard.
private async Task LoadSiteCallNodeKpis()
{
try
{
var response = await CommunicationService.GetPerNodeSiteCallKpisAsync(
new PerNodeSiteCallKpiRequest(Guid.NewGuid().ToString("N")));
if (response.Success)
{
_siteCallNodeKpis = response.Nodes;
_siteCallNodeKpiAvailable = true;
}
else
{
_siteCallNodeKpiAvailable = false;
}
}
catch
{
_siteCallNodeKpiAvailable = false;
}
}
// Tiles show the numeric KPI when available, or an em dash when the outbox
// KPI query failed — matching how the page renders other unavailable data.
private string OutboxTileValue(int value) =>
@@ -69,6 +69,51 @@
</div>
}
@* ── Per-node breakdown (T6: additive) ── *@
<h5 class="mb-2">Per-node breakdown</h5>
@if (_perNodeError != null)
{
<div class="alert alert-warning py-2">Per-node KPIs unavailable: @_perNodeError</div>
}
else if (_perNode.Count == 0)
{
<div class="card mb-3">
<div class="card-body text-center text-muted py-3">
<div class="small">No per-node activity (rows may have a null SourceNode).</div>
</div>
</div>
}
else
{
<div class="table-responsive mb-3">
<table class="table table-sm table-hover align-middle">
<thead class="table-light">
<tr>
<th>Node</th>
<th class="text-end">Queue Depth</th>
<th class="text-end">Stuck</th>
<th class="text-end">Parked</th>
<th class="text-end">Delivered (last interval)</th>
<th class="text-end">Oldest Pending Age</th>
</tr>
</thead>
<tbody>
@foreach (var n in _perNode)
{
<tr @key="n.SourceNode" class="@(n.StuckCount > 0 ? "table-warning" : "")">
<td><code>@n.SourceNode</code></td>
<td class="text-end font-monospace">@n.QueueDepth</td>
<td class="text-end font-monospace @(n.StuckCount > 0 ? "text-warning" : "")">@n.StuckCount</td>
<td class="text-end font-monospace @(n.ParkedCount > 0 ? "text-danger" : "")">@n.ParkedCount</td>
<td class="text-end font-monospace text-success">@n.DeliveredLastInterval</td>
<td class="text-end font-monospace">@FormatAge(n.OldestPendingAge)</td>
</tr>
}
</tbody>
</table>
</div>
}
@* ── Per-site breakdown ── *@
<h5 class="mb-2">Per-site breakdown</h5>
@if (_perSiteError != null)
@@ -124,6 +169,10 @@
private IReadOnlyList<SiteNotificationKpiSnapshot> _perSite = Array.Empty<SiteNotificationKpiSnapshot>();
private string? _perSiteError;
// ── Per-node (T6: M5.2 per-node stuck-count KPIs) ──
private IReadOnlyList<NodeNotificationKpiSnapshot> _perNode = Array.Empty<NodeNotificationKpiSnapshot>();
private string? _perNodeError;
private bool _loading;
protected override async Task OnInitializedAsync()
@@ -144,9 +193,9 @@
private async Task RefreshAll()
{
_loading = true;
// Race-free despite both tasks mutating component fields: Blazor Server runs
// Race-free despite all tasks mutating component fields: Blazor Server runs
// every continuation on the circuit's single-threaded synchronization context.
await Task.WhenAll(LoadGlobalKpis(), LoadPerSiteKpis());
await Task.WhenAll(LoadGlobalKpis(), LoadPerSiteKpis(), LoadPerNodeKpis());
_loading = false;
}
@@ -194,6 +243,28 @@
}
}
private async Task LoadPerNodeKpis()
{
try
{
var response = await CommunicationService.GetPerNodeNotificationKpisAsync(
new PerNodeNotificationKpiRequest(Guid.NewGuid().ToString("N")));
if (response.Success)
{
_perNode = response.Nodes;
_perNodeError = null;
}
else
{
_perNodeError = response.ErrorMessage ?? "Per-node KPI query failed.";
}
}
catch (Exception ex)
{
_perNodeError = $"Per-node KPI query failed: {ex.Message}";
}
}
private string SiteName(string siteId) =>
_sites.FirstOrDefault(s => s.SiteIdentifier == siteId)?.Name ?? siteId;
@@ -87,6 +87,42 @@ public interface IAuditLogRepository
/// <returns>A task that resolves to the approximate number of rows discarded by the partition switch.</returns>
Task<long> SwitchOutPartitionAsync(DateTime monthBoundary, CancellationToken ct = default);
/// <summary>
/// M5.5 (T3) per-channel retention override purge. Deletes <c>AuditLog</c> rows for a
/// single <paramref name="channel"/> (matched against the canonical
/// <c>Category</c> column — the bare channel name, e.g. <c>ApiOutbound</c>) whose
/// <c>OccurredAtUtc</c> is strictly older than <paramref name="threshold"/>, in
/// bounded batches of <paramref name="batchSize"/> rows, looping until no further
/// rows match. Returns the total number of rows deleted across all batches.
/// </summary>
/// <remarks>
/// <para>
/// <b>Maintenance path — NOT the writer role.</b> The append-only invariant binds
/// the <c>scadabridge_audit_writer</c> ingest role (INSERT + SELECT only). This row
/// DELETE runs on the purge/maintenance connection, the same path that performs the
/// global partition switch-out (also a destructive operation forbidden to the writer
/// role). Per-channel overrides can only ever expire rows EARLIER than the global
/// month-partition switch-out would — never later — so this is a strict tightening
/// of the retention window, applied AFTER the global purge on the same tick.
/// </para>
/// <para>
/// <b>Bounded + idempotent.</b> Each batch is a <c>DELETE TOP (@batch)</c> so the
/// transaction log and lock footprint stay bounded regardless of backlog. Re-running
/// the purge is a no-op once every eligible row is gone (the loop exits when a batch
/// deletes zero rows), so a crash mid-loop is recoverable by simply running again.
/// </para>
/// </remarks>
/// <param name="channel">Canonical channel name (the <c>Category</c> column value, e.g. <c>ApiOutbound</c>).</param>
/// <param name="threshold">Rows with <c>OccurredAtUtc</c> strictly older than this UTC datetime are deleted.</param>
/// <param name="batchSize">Maximum rows deleted per batch; must be &gt; 0.</param>
/// <param name="ct">Cancellation token.</param>
/// <returns>A task that resolves to the total number of rows deleted across all batches.</returns>
Task<long> PurgeChannelOlderThanAsync(
string channel,
DateTime threshold,
int batchSize,
CancellationToken ct = default);
/// <summary>
/// Returns the set of <c>pf_AuditLog_Month</c> partition lower-bound
/// boundaries whose partitions contain only rows with
@@ -201,4 +237,59 @@ public interface IAuditLogRepository
/// <param name="ct">Cancellation token.</param>
/// <returns>A task that resolves to the distinct, non-null source node names in ascending order.</returns>
Task<IReadOnlyList<string>> GetDistinctSourceNodesAsync(CancellationToken ct = default);
/// <summary>
/// M5.6 (T5) one-time operational backfill: sets <c>SourceNode</c> to
/// <paramref name="sentinel"/> on every row where <c>SourceNode IS NULL</c>
/// and <c>OccurredAtUtc &lt; <paramref name="before"/></c>, in bounded
/// batches of <paramref name="batchSize"/> rows, looping until no further
/// rows match. Returns the total number of rows updated across all batches.
/// </summary>
/// <remarks>
/// <para>
/// <b>Why a sentinel, not the real value.</b> <c>SourceNode</c> captures the
/// physical cluster node on which an event was emitted. For pre-feature rows
/// that were ingested before the column was stamped, the true node-of-origin
/// is UNKNOWABLE — the original emitter is long gone and there is no
/// retroactive way to determine it. Backfilling a configurable sentinel
/// (default <c>"unknown"</c>) makes it explicit that these rows pre-date the
/// feature rather than silently leaving them NULL (which the filter UI already
/// treats as "unresolved" but which an operator might mistake for a bug).
/// </para>
/// <para>
/// <b><c>ExecutionId</c> / <c>ParentExecutionId</c> cannot be backfilled.</b>
/// These are PERSISTED COMPUTED columns derived from <c>DetailsJson</c>. The
/// AuditLog append-only invariant forbids mutating <c>DetailsJson</c>, so
/// the computed values for pre-feature rows remain NULL permanently. This is
/// documented rather than coded — see the Ops Note in
/// <c>Component-AuditLog.md § Ops Notes — Historical Null Columns</c>.
/// </para>
/// <para>
/// <b>Maintenance path — NOT the writer role.</b> This UPDATE runs on the
/// purge/maintenance connection (the same path as
/// <see cref="SwitchOutPartitionAsync"/> and any per-channel purge), NOT the
/// append-only <c>scadabridge_audit_writer</c> role. The CI guard
/// (<c>AuditLogAppendOnlyGuardTests</c>) recognises the
/// <c>// AUDIT-PURGE-ALLOWED</c> marker on the UPDATE line and forgives
/// exactly this one sanctioned maintenance-path UPDATE; any other UPDATE
/// against <c>AuditLog</c> still trips the guard.
/// </para>
/// <para>
/// <b>Bounded + idempotent.</b> <c>UPDATE TOP (@batch)</c> caps the
/// transaction-log and lock footprint per statement. The loop exits when a
/// batch updates zero rows, so a crash mid-loop is recoverable by simply
/// running again; re-running after completion is a no-op (no NULL rows
/// remain for the given <paramref name="before"/> window).
/// </para>
/// </remarks>
/// <param name="sentinel">Value to write into <c>SourceNode</c> for pre-feature rows (e.g. <c>"unknown"</c>).</param>
/// <param name="before">Rows with <c>OccurredAtUtc</c> strictly older than this UTC datetime are eligible.</param>
/// <param name="batchSize">Maximum rows updated per batch; must be &gt; 0.</param>
/// <param name="ct">Cancellation token.</param>
/// <returns>A task that resolves to the total number of rows updated across all batches.</returns>
Task<long> BackfillSourceNodeAsync(
string sentinel,
DateTime before,
int batchSize,
CancellationToken ct = default);
}
@@ -100,6 +100,19 @@ public interface INotificationOutboxRepository
Task<IReadOnlyList<SiteNotificationKpiSnapshot>> ComputePerSiteKpisAsync(
DateTimeOffset stuckCutoff, DateTimeOffset deliveredSince, CancellationToken cancellationToken = default);
/// <summary>
/// Computes a point-in-time <see cref="NodeNotificationKpiSnapshot"/> per originating node.
/// Nodes with no notification rows at all are omitted; rows with a <c>NULL</c>
/// <c>SourceNode</c> are excluded. The stuck and delivered cutoffs are supplied by the
/// caller; the current time used for <c>OldestPendingAge</c> is captured inside the method.
/// </summary>
/// <param name="stuckCutoff">The time threshold for marking notifications as stuck.</param>
/// <param name="deliveredSince">The time threshold for counting delivered notifications.</param>
/// <param name="cancellationToken">Cancellation token.</param>
/// <returns>A list of per-node KPI snapshots, ordered by node name.</returns>
Task<IReadOnlyList<NodeNotificationKpiSnapshot>> ComputePerNodeKpisAsync(
DateTimeOffset stuckCutoff, DateTimeOffset deliveredSince, CancellationToken cancellationToken = default);
/// <summary>
/// Persists pending changes tracked on the underlying context. Use this when staging
/// multiple changes for a single commit; the individual mutating methods on this
@@ -107,4 +107,19 @@ public interface ISiteCallAuditRepository
DateTime stuckCutoff,
DateTime intervalSince,
CancellationToken ct = default);
/// <summary>
/// Computes a point-in-time <see cref="SiteCallNodeKpiSnapshot"/> per originating
/// node. Nodes with no <c>SiteCalls</c> rows at all are omitted; rows with a
/// <c>NULL</c> <c>SourceNode</c> are excluded. The stuck cutoff and interval
/// bounds are interpreted as in <see cref="ComputeKpisAsync"/>.
/// </summary>
/// <param name="stuckCutoff">UTC threshold for classifying a row as stuck.</param>
/// <param name="intervalSince">UTC start of the delivered/failed interval window.</param>
/// <param name="ct">Cancellation token.</param>
/// <returns>A task that resolves to a per-node KPI list; nodes with no rows are omitted.</returns>
Task<IReadOnlyList<SiteCallNodeKpiSnapshot>> ComputePerNodeKpisAsync(
DateTime stuckCutoff,
DateTime intervalSince,
CancellationToken ct = default);
}
@@ -164,3 +164,24 @@ public sealed record PerSiteSiteCallKpiResponse(
bool Success,
string? ErrorMessage,
IReadOnlyList<SiteCallSiteKpiSnapshot> Sites);
/// <summary>
/// Site Calls UI -> Central: request for the per-node <c>SiteCalls</c>
/// KPI breakdown. Mirrors <see cref="PerSiteSiteCallKpiRequest"/> but groups
/// by <c>SourceNode</c> instead of <c>SourceSite</c>. Additive — does not
/// change per-site behaviour.
/// </summary>
public sealed record PerNodeSiteCallKpiRequest(
string CorrelationId);
/// <summary>
/// Central -> Site Calls UI: per-node KPI breakdown for the Site Calls KPIs
/// page. On a repository fault <see cref="Success"/> is <c>false</c>,
/// <see cref="ErrorMessage"/> carries the cause, and <see cref="Nodes"/> is empty.
/// Nodes with a <c>NULL</c> <c>SourceNode</c> are omitted.
/// </summary>
public sealed record PerNodeSiteCallKpiResponse(
string CorrelationId,
bool Success,
string? ErrorMessage,
IReadOnlyList<SiteCallNodeKpiSnapshot> Nodes);
@@ -83,3 +83,46 @@ public record RouteToSetAttributesResponse(
bool Success,
string? ErrorMessage,
DateTimeOffset Timestamp);
/// <summary>
/// Request to block until a remote instance attribute reaches a target value
/// (spec §6 — <c>Route.To("inst").WaitForAttribute(name, targetValue, timeout)</c>).
/// Value-equality ONLY across the wire: <see cref="TargetValueEncoded"/> carries the
/// canonical <c>AttributeValueCodec</c>-encoded target; there is no predicate and no
/// quality flag in the comparison. The site evaluates equality and either matches or
/// times out.
/// </summary>
/// <param name="ParentExecutionId">
/// Audit Log #23 (ParentExecutionId): mirrors <see cref="RouteToCallRequest.ParentExecutionId"/>.
/// For an inbound-API-routed wait this is the inbound request's per-request execution id;
/// future site-side audit emission for routed waits can stamp it as <c>ParentExecutionId</c>
/// so the inbound→site execution-tree link survives the wait path. Additive trailing
/// member — null for the Central UI sandbox path or for callers built before the field existed.
/// </param>
public record RouteToWaitForAttributeRequest(
string CorrelationId,
string InstanceUniqueName,
string AttributeName,
string? TargetValueEncoded,
TimeSpan Timeout,
DateTimeOffset Timestamp,
Guid? ParentExecutionId = null);
/// <summary>
/// Response from a remote attribute wait. <see cref="Success"/>/<see cref="ErrorMessage"/>
/// convey the routing-level outcome (e.g. instance-not-found); <see cref="Matched"/>,
/// <see cref="TimedOut"/>, <see cref="Value"/>, and <see cref="Quality"/> convey the wait
/// outcome itself. When <see cref="Success"/> is <c>true</c>, exactly one of
/// <see cref="Matched"/>/<see cref="TimedOut"/> holds: <see cref="Matched"/> means the
/// attribute reached the target value (with <see cref="Value"/>/<see cref="Quality"/>
/// captured at the match), <see cref="TimedOut"/> means the deadline elapsed first.
/// </summary>
public record RouteToWaitForAttributeResponse(
string CorrelationId,
bool Matched,
object? Value,
string? Quality,
bool TimedOut,
bool Success,
string? ErrorMessage,
DateTimeOffset Timestamp);
@@ -0,0 +1,82 @@
namespace ZB.MOM.WW.ScadaBridge.Commons.Messages.Instance;
/// <summary>
/// Request to wait, event-driven, until an attribute reaches a value (or any
/// value satisfying a predicate), bounded by a timeout — the backing protocol for
/// the script-facing <c>Attributes.WaitAsync</c> helper.
///
/// <para>
/// <b>Site-local only.</b> The optional <see cref="Predicate"/> is a non-serializable
/// in-process delegate, so this message MUST flow only within a single site node's
/// actor system (script execution → Instance Actor). It is never sent across the
/// ClusterClient / gRPC boundary. The value-equality form (<see cref="TargetValueEncoded"/>)
/// would serialize, but the routed/inbound variant is deliberately out of scope here.
/// </para>
/// </summary>
/// <param name="CorrelationId">Per-wait correlation id; keys the waiter registry and the timeout self-message.</param>
/// <param name="InstanceName">The instance this wait targets.</param>
/// <param name="AttributeName">The attribute to watch — already scope-resolved by the accessor.</param>
/// <param name="TargetValueEncoded">
/// The codec-encoded target value (<c>AttributeValueCodec.Encode(target)</c>). A
/// match compares the codec-encoded form of the current value against this string.
/// When both this and <see cref="Predicate"/> are null the wait matches on ANY change.
/// </param>
/// <param name="Predicate">
/// Site-local predicate tested against the raw (decoded) current value. Mutually
/// exclusive with <see cref="TargetValueEncoded"/> — null when the encoded target is used.
/// </param>
/// <param name="Timeout">How long to wait before self-evicting with a timeout reply.</param>
/// <param name="OccurredAtUtc">When the request was issued (UTC).</param>
/// <param name="RequireGoodQuality">
/// Quality-gated ("Good"-only) mode (spec §4.2): when <see langword="true"/>, a
/// match additionally requires the attribute quality to be exactly
/// <c>"Good"</c> (<see cref="System.StringComparison.Ordinal"/>) — a value that
/// reaches the target / satisfies the predicate at Bad/Uncertain quality is NOT a
/// match and the waiter stays pending until the value satisfies the test at Good
/// quality (or times out). Defaults to <see langword="false"/> (quality-agnostic:
/// the match tests the value only). Trailing/defaulted so existing positional
/// constructions compile unchanged.
/// </param>
public record WaitForAttributeRequest(
string CorrelationId,
string InstanceName,
string AttributeName,
string? TargetValueEncoded,
Func<object?, bool>? Predicate,
TimeSpan Timeout,
DateTimeOffset OccurredAtUtc,
bool RequireGoodQuality = false);
/// <summary>
/// Reply to a <see cref="WaitForAttributeRequest"/>. Exactly one of
/// <see cref="Matched"/> / <see cref="TimedOut"/> is set on the happy paths;
/// <see cref="ErrorMessage"/> is populated on the failure paths (per-instance
/// waiter cap exceeded, or the match predicate threw).
/// </summary>
/// <param name="CorrelationId">Echoes the request's correlation id.</param>
/// <param name="Matched">True when the attribute reached the target/predicate within the timeout.</param>
/// <param name="Value">The matched value (null on timeout / error).</param>
/// <param name="Quality">
/// The attribute quality at match time; <see langword="null"/> on the non-match
/// paths (timeout / error / cap-exceeded), matching the nullable
/// <see cref="ErrorMessage"/> convention.
/// </param>
/// <param name="TimedOut">True when the timeout fired before a match.</param>
/// <param name="ErrorMessage">
/// Non-null only when the wait failed/refused — the per-instance waiter cap was
/// exceeded, or the match predicate threw (<c>"Wait predicate threw: …"</c>).
/// </param>
public record WaitForAttributeResponse(
string CorrelationId,
bool Matched,
object? Value,
string? Quality,
bool TimedOut,
string? ErrorMessage = null);
/// <summary>
/// Internal self-message scheduled by the Instance Actor to fire a waiter's
/// timeout. Site-local only; never crosses a cluster boundary.
/// </summary>
/// <param name="CorrelationId">The waiter whose timeout fired.</param>
public record WaitForAttributeTimeout(string CorrelationId);
@@ -159,3 +159,23 @@ public record PerSiteNotificationKpiResponse(
bool Success,
string? ErrorMessage,
IReadOnlyList<SiteNotificationKpiSnapshot> Sites);
/// <summary>
/// Outbox UI -> Central: request for the per-node notification outbox KPI breakdown.
/// Mirrors <see cref="PerSiteNotificationKpiRequest"/> but groups by <c>SourceNode</c>
/// instead of <c>SourceSiteId</c>. Additive — does not change per-site behaviour.
/// </summary>
public record PerNodeNotificationKpiRequest(
string CorrelationId);
/// <summary>
/// Central -> Outbox UI: per-node KPI breakdown for the Notification KPIs page.
/// On a repository fault <see cref="Success"/> is <c>false</c>, <see cref="ErrorMessage"/>
/// carries the cause, and <see cref="Nodes"/> is empty. Nodes with a <c>NULL</c>
/// <c>SourceNode</c> are omitted.
/// </summary>
public record PerNodeNotificationKpiResponse(
string CorrelationId,
bool Success,
string? ErrorMessage,
IReadOnlyList<NodeNotificationKpiSnapshot> Nodes);
@@ -0,0 +1,37 @@
namespace ZB.MOM.WW.ScadaBridge.Commons.Types.Audit;
/// <summary>
/// Point-in-time <c>SiteCalls</c> metrics scoped to a single originating node. The
/// per-node counterpart of <see cref="SiteCallSiteKpiSnapshot"/>; surfaced in the
/// per-node breakdown table on the Site Calls KPIs page. Mirrors
/// <see cref="ZB.MOM.WW.ScadaBridge.Commons.Types.Notifications.NodeNotificationKpiSnapshot"/>.
/// </summary>
/// <param name="SourceNode">
/// The node identifier these metrics are scoped to (e.g. <c>node-a</c>,
/// <c>node-b</c>). Rows with a <c>NULL</c> <c>SourceNode</c> are omitted.
/// </param>
/// <param name="BufferedCount">Count of this node's non-terminal rows (<c>TerminalAtUtc IS NULL</c>).</param>
/// <param name="ParkedCount">Count of this node's rows in the <c>Parked</c> status.</param>
/// <param name="FailedLastInterval">
/// Count of this node's <c>Failed</c> rows whose <c>TerminalAtUtc</c> is at or
/// after the "since" timestamp.
/// </param>
/// <param name="DeliveredLastInterval">
/// Count of this node's <c>Delivered</c> rows whose <c>TerminalAtUtc</c> is at
/// or after the "since" timestamp.
/// </param>
/// <param name="OldestPendingAge">
/// Age of this node's oldest non-terminal row, or <c>null</c> when it has none.
/// </param>
/// <param name="StuckCount">
/// Count of this node's non-terminal rows whose <c>CreatedAtUtc</c> is older
/// than the stuck cutoff.
/// </param>
public sealed record SiteCallNodeKpiSnapshot(
string SourceNode,
int BufferedCount,
int ParkedCount,
int FailedLastInterval,
int DeliveredLastInterval,
TimeSpan? OldestPendingAge,
int StuckCount);
@@ -0,0 +1,30 @@
namespace ZB.MOM.WW.ScadaBridge.Commons.Types.Notifications;
/// <summary>
/// Point-in-time notification-outbox metrics scoped to a single originating node.
/// The per-node counterpart of <see cref="SiteNotificationKpiSnapshot"/>; surfaced
/// in the per-node breakdown table on the Notification KPIs page.
/// </summary>
/// <param name="SourceNode">
/// The node identifier these metrics are scoped to (e.g. <c>node-a</c>,
/// <c>node-b</c>). Rows with a <c>NULL</c> <c>SourceNode</c> are omitted.
/// </param>
/// <param name="QueueDepth">Count of this node's non-terminal rows (Pending + Retrying).</param>
/// <param name="StuckCount">
/// Count of this node's non-terminal rows whose <c>CreatedAt</c> is older than the stuck cutoff.
/// </param>
/// <param name="ParkedCount">Count of this node's rows in the Parked status.</param>
/// <param name="DeliveredLastInterval">
/// Count of this node's Delivered rows whose <c>DeliveredAt</c> is at or after the
/// "delivered since" timestamp.
/// </param>
/// <param name="OldestPendingAge">
/// Age of this node's oldest non-terminal row, or <c>null</c> when it has none.
/// </param>
public record NodeNotificationKpiSnapshot(
string SourceNode,
int QueueDepth,
int StuckCount,
int ParkedCount,
int DeliveredLastInterval,
TimeSpan? OldestPendingAge);
@@ -0,0 +1,21 @@
namespace ZB.MOM.WW.ScadaBridge.Commons.Types;
/// <summary>
/// Rich result of an <c>Attributes.WaitForAsync</c> wait (spec §3) — the full
/// outcome of waiting for an attribute to reach a value / satisfy a predicate /
/// change at all, bounded by a timeout. The <c>Attributes.WaitAsync</c> helpers
/// surface only <see cref="Matched"/>; <c>WaitForAsync</c> returns this struct so
/// a script can also read the matched <see cref="Value"/>, its <see cref="Quality"/>,
/// and distinguish a genuine timeout (<see cref="TimedOut"/>) from a non-match.
/// </summary>
/// <param name="Matched">
/// <see langword="true"/> when the attribute reached the target / satisfied the
/// predicate within the timeout (and, in quality-gated mode, at "Good" quality).
/// </param>
/// <param name="Value">The matched value; <see langword="null"/> on timeout / error.</param>
/// <param name="Quality">
/// The attribute quality at match time; <see langword="null"/> on the non-match
/// paths (timeout / error / cap-exceeded).
/// </param>
/// <param name="TimedOut"><see langword="true"/> when the timeout fired before a match.</param>
public readonly record struct WaitResult(bool Matched, object? Value, string? Quality, bool TimedOut);
@@ -144,6 +144,7 @@ public class SiteCommunicationActor : ReceiveActor, IWithTimers
Receive<RouteToCallRequest>(msg => _deploymentManagerProxy.Forward(msg));
Receive<RouteToGetAttributesRequest>(msg => _deploymentManagerProxy.Forward(msg));
Receive<RouteToSetAttributesRequest>(msg => _deploymentManagerProxy.Forward(msg));
Receive<RouteToWaitForAttributeRequest>(msg => _deploymentManagerProxy.Forward(msg));
// OPC UA Tag Browser (interactive design-time query) — forward to the
// Deployment Manager singleton, which always lands on the active site
@@ -445,6 +445,25 @@ public class CommunicationService
envelope, _options.IntegrationTimeout, cancellationToken);
}
/// <summary>
/// Routes an inbound API wait-for-attribute request to a site (spec §6).
/// </summary>
/// <param name="siteId">The target site identifier.</param>
/// <param name="request">The wait-for-attribute route request.</param>
/// <param name="cancellationToken">Cancellation token.</param>
/// <returns>The wait-for-attribute route response.</returns>
public async Task<RouteToWaitForAttributeResponse> RouteToWaitForAttributeAsync(
string siteId, RouteToWaitForAttributeRequest request, CancellationToken cancellationToken = default)
{
var envelope = new SiteEnvelope(siteId, request);
// A wait legitimately blocks up to request.Timeout on the site, so the cluster
// Ask must be bounded by the WAIT deadline (plus integration-timeout slack for
// the round trip), not the generic IntegrationTimeout used by the other routes.
var askTimeout = request.Timeout + _options.IntegrationTimeout;
return await GetActor().Ask<RouteToWaitForAttributeResponse>(
envelope, askTimeout, cancellationToken);
}
// ── Notification Outbox (central-local actor — Asked directly, no SiteEnvelope) ──
/// <summary>
@@ -525,6 +544,22 @@ public class CommunicationService
request, _options.QueryTimeout, cancellationToken);
}
/// <summary>
/// Gets per-node KPI metrics for the notification outbox.
/// Groups by <c>SourceNode</c> (e.g. <c>node-a</c>/<c>node-b</c>); rows with
/// a <c>NULL</c> node are omitted. Additive alongside
/// <see cref="GetPerSiteNotificationKpisAsync"/>.
/// </summary>
/// <param name="request">The per-node notification KPI request.</param>
/// <param name="cancellationToken">Cancellation token.</param>
/// <returns>The per-node notification KPI response.</returns>
public async Task<PerNodeNotificationKpiResponse> GetPerNodeNotificationKpisAsync(
PerNodeNotificationKpiRequest request, CancellationToken cancellationToken = default)
{
return await GetNotificationOutbox().Ask<PerNodeNotificationKpiResponse>(
request, _options.QueryTimeout, cancellationToken);
}
// ── Site Call Audit (central-local actor — Asked directly, no SiteEnvelope) ──
/// <summary>
@@ -579,6 +614,21 @@ public class CommunicationService
request, _options.QueryTimeout, cancellationToken);
}
/// <summary>
/// Gets per-node KPI metrics for site calls. Groups by <c>SourceNode</c>
/// (e.g. <c>node-a</c>/<c>node-b</c>); rows with a <c>NULL</c> node are
/// omitted. Additive alongside <see cref="GetPerSiteSiteCallKpisAsync"/>.
/// </summary>
/// <param name="request">The per-node site call KPI request.</param>
/// <param name="cancellationToken">Cancellation token.</param>
/// <returns>The per-node site call KPI response.</returns>
public async Task<PerNodeSiteCallKpiResponse> GetPerNodeSiteCallKpisAsync(
PerNodeSiteCallKpiRequest request, CancellationToken cancellationToken = default)
{
return await GetSiteCallAudit().Ask<PerNodeSiteCallKpiResponse>(
request, _options.QueryTimeout, cancellationToken);
}
/// <summary>
/// Task 5 (#22): relays an operator Retry of a parked cached call to its
/// owning site. The <c>SiteCallAuditActor</c> is Asked directly (it is
@@ -370,6 +370,99 @@ VALUES
return rowsDeleted;
}
/// <inheritdoc />
public async Task<long> PurgeChannelOlderThanAsync(
string channel,
DateTime threshold,
int batchSize,
CancellationToken ct = default)
{
if (string.IsNullOrWhiteSpace(channel))
{
throw new ArgumentException("Channel must be a non-empty channel name.", nameof(channel));
}
if (batchSize <= 0)
{
throw new ArgumentOutOfRangeException(nameof(batchSize), batchSize, "Batch size must be > 0.");
}
var thresholdUtc = DateTime.SpecifyKind(threshold.ToUniversalTime(), DateTimeKind.Utc);
// M5.5 (T3) per-channel retention override purge. This is the ONLY DELETE
// against dbo.AuditLog in the codebase and it runs on the purge/maintenance
// path, NOT the append-only writer role (which has INSERT + SELECT only — see
// the DENY UPDATE/DENY DELETE grants in CollapseAuditLogToCanonical). The
// AuditLog append-only CI guard (AuditLogAppendOnlyGuardTests) is intentionally
// widened to allow ONLY the single marked DELETE below; any other UPDATE/DELETE
// targeting AuditLog still trips the guard.
//
// Bounded + idempotent: DELETE TOP (@batch) caps the log/lock footprint per
// statement; the loop repeats until a batch deletes zero rows, so re-running
// after a crash mid-loop simply resumes. Category is the canonical
// channel-name column (e.g. 'ApiOutbound'); Action holds "{channel}.{kind}" so
// it is NOT the right column to match a bare channel name against.
//
// The trailing AUDIT-PURGE-ALLOWED marker on the DELETE line below is the
// single narrow exemption the append-only CI guard (AuditLogAppendOnlyGuardTests)
// recognizes; any other UPDATE/DELETE targeting AuditLog still trips the guard.
const string deleteBatchSql =
"DELETE TOP (@batch) FROM dbo.AuditLog WHERE Category = @channel AND OccurredAtUtc < @threshold;"; // AUDIT-PURGE-ALLOWED: per-channel retention override (M5.5 T3), maintenance path
long totalDeleted = 0;
var conn = _context.Database.GetDbConnection();
var openedHere = false;
if (conn.State != System.Data.ConnectionState.Open)
{
await conn.OpenAsync(ct).ConfigureAwait(false);
openedHere = true;
}
try
{
while (true)
{
ct.ThrowIfCancellationRequested();
await using var cmd = conn.CreateCommand();
cmd.CommandText = deleteBatchSql;
var pBatch = cmd.CreateParameter();
pBatch.ParameterName = "@batch";
pBatch.Value = batchSize;
cmd.Parameters.Add(pBatch);
var pChannel = cmd.CreateParameter();
pChannel.ParameterName = "@channel";
pChannel.Value = channel;
cmd.Parameters.Add(pChannel);
var pThreshold = cmd.CreateParameter();
pThreshold.ParameterName = "@threshold";
pThreshold.Value = thresholdUtc;
cmd.Parameters.Add(pThreshold);
var rows = await cmd.ExecuteNonQueryAsync(ct).ConfigureAwait(false);
if (rows <= 0)
{
break;
}
totalDeleted += rows;
}
}
finally
{
if (openedHere)
{
await conn.CloseAsync().ConfigureAwait(false);
}
}
return totalDeleted;
}
/// <inheritdoc />
public async Task<IReadOnlyList<DateTime>> GetPartitionBoundariesOlderThanAsync(
DateTime threshold,
@@ -716,6 +809,102 @@ VALUES
.ToListAsync(ct);
}
/// <inheritdoc />
public async Task<long> BackfillSourceNodeAsync(
string sentinel,
DateTime before,
int batchSize,
CancellationToken ct = default)
{
if (string.IsNullOrWhiteSpace(sentinel))
{
throw new ArgumentException("Sentinel must be a non-empty value.", nameof(sentinel));
}
if (batchSize <= 0)
{
throw new ArgumentOutOfRangeException(nameof(batchSize), batchSize, "Batch size must be > 0.");
}
var beforeUtc = DateTime.SpecifyKind(before.ToUniversalTime(), DateTimeKind.Utc);
// M5.6 (T5) SourceNode sentinel backfill. This is the ONE sanctioned UPDATE
// against dbo.AuditLog in the codebase. It touches ONLY rows where
// SourceNode IS NULL AND OccurredAtUtc < @before — rows that pre-date the
// M5.6 feature and whose node-of-origin is UNKNOWABLE. The sentinel (default
// "unknown") makes that explicit. ExecutionId/ParentExecutionId are PERSISTED
// COMPUTED columns derived from DetailsJson — mutating DetailsJson is forbidden
// under the append-only invariant, so those stay NULL on pre-feature rows.
//
// Maintenance path (NOT the writer role): runs on the same connection used for
// SwitchOutPartitionAsync (partition-switch DDL), which requires a role that
// holds UPDATE — the append-only scadabridge_audit_writer role has only
// INSERT + SELECT.
//
// Bounded + idempotent: UPDATE TOP (@batch) caps the log/lock footprint per
// statement; the loop exits when a batch updates 0 rows. Re-running after a
// crash simply resumes where it left off.
//
// The trailing AUDIT-PURGE-ALLOWED marker on the UPDATE line below is the
// single narrow exemption the append-only CI guard (AuditLogAppendOnlyGuardTests)
// recognises for an UPDATE; any other UPDATE targeting AuditLog still trips the guard.
const string updateBatchSql =
"UPDATE TOP (@batch) dbo.AuditLog SET SourceNode = @sentinel WHERE SourceNode IS NULL AND OccurredAtUtc < @before;"; // AUDIT-PURGE-ALLOWED: SourceNode sentinel backfill (M5.6 T5), maintenance path
long totalUpdated = 0;
var conn = _context.Database.GetDbConnection();
var openedHere = false;
if (conn.State != System.Data.ConnectionState.Open)
{
await conn.OpenAsync(ct).ConfigureAwait(false);
openedHere = true;
}
try
{
while (true)
{
ct.ThrowIfCancellationRequested();
await using var cmd = conn.CreateCommand();
cmd.CommandText = updateBatchSql;
var pBatch = cmd.CreateParameter();
pBatch.ParameterName = "@batch";
pBatch.Value = batchSize;
cmd.Parameters.Add(pBatch);
var pSentinel = cmd.CreateParameter();
pSentinel.ParameterName = "@sentinel";
pSentinel.Value = sentinel;
cmd.Parameters.Add(pSentinel);
var pBefore = cmd.CreateParameter();
pBefore.ParameterName = "@before";
pBefore.Value = beforeUtc;
cmd.Parameters.Add(pBefore);
var rows = await cmd.ExecuteNonQueryAsync(ct).ConfigureAwait(false);
if (rows <= 0)
{
break;
}
totalUpdated += rows;
}
}
finally
{
if (openedHere)
{
await conn.CloseAsync().ConfigureAwait(false);
}
}
return totalUpdated;
}
/// <summary>
/// Splits a <c>STRING_AGG</c> comma-joined value into a distinct, ordered
/// list. A null/empty aggregate (a stub node with no rows) yields an empty
@@ -300,6 +300,63 @@ VALUES
: null)).ToList();
}
/// <inheritdoc />
public async Task<IReadOnlyList<NodeNotificationKpiSnapshot>> ComputePerNodeKpisAsync(
DateTimeOffset stuckCutoff, DateTimeOffset deliveredSince, CancellationToken cancellationToken = default)
{
var now = DateTimeOffset.UtcNow;
// Exclude rows with NULL SourceNode (legacy / unstamped) — per-node KPIs
// are only meaningful when the node identity is known.
var queueDepth = await CountByNodeAsync(
n => (n.Status == NotificationStatus.Pending || n.Status == NotificationStatus.Retrying)
&& n.SourceNode != null,
cancellationToken);
var stuck = await CountByNodeAsync(
n => (n.Status == NotificationStatus.Pending || n.Status == NotificationStatus.Retrying)
&& n.CreatedAt < stuckCutoff
&& n.SourceNode != null,
cancellationToken);
var parked = await CountByNodeAsync(
n => n.Status == NotificationStatus.Parked && n.SourceNode != null,
cancellationToken);
var delivered = await CountByNodeAsync(
n => n.Status == NotificationStatus.Delivered
&& n.DeliveredAt != null && n.DeliveredAt >= deliveredSince
&& n.SourceNode != null,
cancellationToken);
// Oldest non-terminal CreatedAt per node — same in-memory reduction
// pattern as ComputePerSiteKpisAsync (DateTimeOffset converter makes
// a SQL Min awkward).
var oldest = (await _context.Notifications
.Where(n => (n.Status == NotificationStatus.Pending
|| n.Status == NotificationStatus.Retrying)
&& n.SourceNode != null)
.Select(n => new { n.SourceNode, n.CreatedAt })
.ToListAsync(cancellationToken))
.GroupBy(x => x.SourceNode!)
.ToDictionary(g => g.Key, g => g.Min(x => x.CreatedAt));
var nodeNames = queueDepth.Keys
.Concat(stuck.Keys).Concat(parked.Keys).Concat(delivered.Keys)
.Distinct()
.OrderBy(n => n, StringComparer.Ordinal);
return nodeNames.Select(node => new NodeNotificationKpiSnapshot(
SourceNode: node,
QueueDepth: queueDepth.GetValueOrDefault(node),
StuckCount: stuck.GetValueOrDefault(node),
ParkedCount: parked.GetValueOrDefault(node),
DeliveredLastInterval: delivered.GetValueOrDefault(node),
OldestPendingAge: oldest.TryGetValue(node, out var createdAt)
? now - createdAt
: null)).ToList();
}
/// <summary>Counts notification rows matching <paramref name="predicate"/>, grouped by source site.</summary>
private async Task<Dictionary<string, int>> CountBySiteAsync(
System.Linq.Expressions.Expression<Func<Notification, bool>> predicate,
@@ -312,6 +369,22 @@ VALUES
.ToDictionaryAsync(x => x.Site, x => x.Count, cancellationToken);
}
/// <summary>
/// Counts notification rows matching <paramref name="predicate"/>, grouped by source node.
/// Only rows with a non-null <c>SourceNode</c> should be included; the predicate is
/// responsible for enforcing that guard.
/// </summary>
private async Task<Dictionary<string, int>> CountByNodeAsync(
System.Linq.Expressions.Expression<Func<Notification, bool>> predicate,
CancellationToken cancellationToken)
{
return await _context.Notifications
.Where(predicate)
.GroupBy(n => n.SourceNode!)
.Select(g => new { Node = g.Key, Count = g.Count() })
.ToDictionaryAsync(x => x.Node, x => x.Count, cancellationToken);
}
/// <inheritdoc />
public async Task<int> SaveChangesAsync(CancellationToken cancellationToken = default)
=> await _context.SaveChangesAsync(cancellationToken);
@@ -324,6 +324,61 @@ ORDER BY CreatedAtUtc DESC, TrackedOperationId DESC;";
StuckCount: stuck.GetValueOrDefault(site))).ToList();
}
/// <inheritdoc />
public async Task<IReadOnlyList<SiteCallNodeKpiSnapshot>> ComputePerNodeKpisAsync(
DateTime stuckCutoff, DateTime intervalSince, CancellationToken ct = default)
{
var now = DateTime.UtcNow;
// Exclude rows with NULL SourceNode — per-node KPIs are only meaningful
// when the node identity is known. Each predicate guards n.SourceNode != null
// so the GROUP BY key is always non-null.
var buffered = await CountByNodeAsync(
s => s.TerminalAtUtc == null && s.SourceNode != null, ct);
var parked = await CountByNodeAsync(
s => s.Status == StatusParked && s.SourceNode != null, ct);
var failed = await CountByNodeAsync(
s => s.Status == StatusFailed
&& s.TerminalAtUtc != null && s.TerminalAtUtc >= intervalSince
&& s.SourceNode != null, ct);
var delivered = await CountByNodeAsync(
s => s.Status == StatusDelivered
&& s.TerminalAtUtc != null && s.TerminalAtUtc >= intervalSince
&& s.SourceNode != null, ct);
var stuck = await CountByNodeAsync(
s => s.TerminalAtUtc == null && s.CreatedAtUtc < stuckCutoff
&& s.SourceNode != null, ct);
// Oldest non-terminal CreatedAtUtc per node — server-side GROUP BY MIN.
var oldest = (await _context.SiteCalls
.Where(s => s.TerminalAtUtc == null && s.SourceNode != null)
.GroupBy(s => s.SourceNode!)
.Select(g => new { Node = g.Key, Oldest = g.Min(s => s.CreatedAtUtc) })
.ToListAsync(ct))
.ToDictionary(x => x.Node, x => x.Oldest);
var nodeNames = buffered.Keys
.Concat(parked.Keys).Concat(failed.Keys)
.Concat(delivered.Keys).Concat(stuck.Keys)
.Distinct()
.OrderBy(n => n, StringComparer.Ordinal);
return nodeNames.Select(node => new SiteCallNodeKpiSnapshot(
SourceNode: node,
BufferedCount: buffered.GetValueOrDefault(node),
ParkedCount: parked.GetValueOrDefault(node),
FailedLastInterval: failed.GetValueOrDefault(node),
DeliveredLastInterval: delivered.GetValueOrDefault(node),
OldestPendingAge: oldest.TryGetValue(node, out var createdAt)
? now - createdAt
: null,
StuckCount: stuck.GetValueOrDefault(node))).ToList();
}
/// <summary>Counts <c>SiteCalls</c> rows matching <paramref name="predicate"/>, grouped by source site.</summary>
private async Task<Dictionary<string, int>> CountBySiteAsync(
System.Linq.Expressions.Expression<Func<SiteCall, bool>> predicate,
@@ -336,6 +391,22 @@ ORDER BY CreatedAtUtc DESC, TrackedOperationId DESC;";
.ToDictionaryAsync(x => x.Site, x => x.Count, ct);
}
/// <summary>
/// Counts <c>SiteCalls</c> rows matching <paramref name="predicate"/>, grouped by source node.
/// Only rows with a non-null <c>SourceNode</c> should be included; the predicate is
/// responsible for enforcing that guard.
/// </summary>
private async Task<Dictionary<string, int>> CountByNodeAsync(
System.Linq.Expressions.Expression<Func<SiteCall, bool>> predicate,
CancellationToken ct)
{
return await _context.SiteCalls
.Where(predicate)
.GroupBy(s => s.SourceNode!)
.Select(g => new { Node = g.Key, Count = g.Count() })
.ToDictionaryAsync(x => x.Node, x => x.Count, ct);
}
private static int GetRankOrThrow(string status)
{
if (!StatusRank.TryGetValue(status, out var rank))
@@ -35,4 +35,9 @@ public sealed class CommunicationServiceInstanceRouter : IInstanceRouter
public Task<RouteToSetAttributesResponse> RouteToSetAttributesAsync(
string siteId, RouteToSetAttributesRequest request, CancellationToken cancellationToken) =>
_communicationService.RouteToSetAttributesAsync(siteId, request, cancellationToken);
/// <inheritdoc />
public Task<RouteToWaitForAttributeResponse> RouteToWaitForAttributeAsync(
string siteId, RouteToWaitForAttributeRequest request, CancellationToken cancellationToken) =>
_communicationService.RouteToWaitForAttributeAsync(siteId, request, cancellationToken);
}
@@ -34,4 +34,12 @@ public interface IInstanceRouter
/// <returns>A task that resolves to the set-attributes response from the target site.</returns>
Task<RouteToSetAttributesResponse> RouteToSetAttributesAsync(
string siteId, RouteToSetAttributesRequest request, CancellationToken cancellationToken);
/// <summary>Routes a wait-for-attribute request to the specified site (spec §6).</summary>
/// <param name="siteId">Target site identifier.</param>
/// <param name="request">The wait-for-attribute request to route (value-equality only).</param>
/// <param name="cancellationToken">Cancellation token for the routed call.</param>
/// <returns>A task that resolves to the wait-for-attribute response from the target site.</returns>
Task<RouteToWaitForAttributeResponse> RouteToWaitForAttributeAsync(
string siteId, RouteToWaitForAttributeRequest request, CancellationToken cancellationToken);
}
@@ -6,6 +6,7 @@ using Microsoft.AspNetCore.Http;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
using ZB.MOM.WW.Audit;
using ZB.MOM.WW.ScadaBridge.AuditLog.Central;
using ZB.MOM.WW.ScadaBridge.AuditLog.Configuration;
using ZB.MOM.WW.ScadaBridge.Commons.Interfaces.Services;
using ZB.MOM.WW.ScadaBridge.Commons.Types.Audit;
@@ -95,6 +96,7 @@ public sealed class AuditWriteMiddleware
private readonly ILogger<AuditWriteMiddleware> _logger;
private readonly IOptionsMonitor<AuditLogOptions> _options;
private readonly IAuditActorAccessor? _actorAccessor;
private readonly IAuditInboundCeilingHitsCounter _ceilingHitsCounter;
/// <summary>
/// Initializes the middleware with its required dependencies.
@@ -110,18 +112,26 @@ public sealed class AuditWriteMiddleware
/// construct the middleware; when absent, actor resolution falls back to the
/// stashed API-key name only.
/// </param>
/// <param name="ceilingHitsCounter">
/// M5.3 (T7, optional): incremented whenever an inbound request or response
/// body is truncated at <see cref="AuditLogOptions.InboundMaxBytes"/>. Optional
/// so existing tests and composition roots without the central health snapshot
/// wired still construct without the counter; a NoOp is used when absent.
/// </param>
public AuditWriteMiddleware(
RequestDelegate next,
ICentralAuditWriter auditWriter,
ILogger<AuditWriteMiddleware> logger,
IOptionsMonitor<AuditLogOptions> options,
IAuditActorAccessor? actorAccessor = null)
IAuditActorAccessor? actorAccessor = null,
IAuditInboundCeilingHitsCounter? ceilingHitsCounter = null)
{
_next = next ?? throw new ArgumentNullException(nameof(next));
_auditWriter = auditWriter ?? throw new ArgumentNullException(nameof(auditWriter));
_logger = logger ?? throw new ArgumentNullException(nameof(logger));
_options = options ?? throw new ArgumentNullException(nameof(options));
_actorAccessor = actorAccessor;
_ceilingHitsCounter = ceilingHitsCounter ?? new NoOpAuditInboundCeilingHitsCounter();
}
/// <summary>
@@ -133,9 +143,11 @@ public sealed class AuditWriteMiddleware
{
var sw = Stopwatch.StartNew();
// Per-request hot read of the inbound cap so a live config change
// Per-request hot read of the options snapshot so a live config change
// picks up on the next request without re-resolving the singleton.
var cap = _options.CurrentValue.InboundMaxBytes;
// InboundMaxBytes is read once here and passed to the capture helpers.
var opts = _options.CurrentValue;
var cap = opts.InboundMaxBytes;
// Audit Log #23 (ParentExecutionId): mint the inbound request's per-request
// ExecutionId ONCE, here at the start of the request, and stash it on
@@ -163,9 +175,20 @@ public sealed class AuditWriteMiddleware
// ReadBufferedRequestBodyAsync's own ContentLength is 0 short-circuit
// returns (null, false) for the bodyless case anyway, so the audit row
// is unchanged.
//
// M5.3 (T7): check if the matched method/target has SkipBodyCapture set.
// The route value is resolved BEFORE the pipeline runs (route matching
// has already bound {methodName} at this point), so we can skip the
// EnableBuffering allocation and body read up front.
var methodNameForOverride = ctx.Request.RouteValues.TryGetValue("methodName", out var rv)
&& rv is string mn && !string.IsNullOrWhiteSpace(mn) ? mn : null;
var skipBody = methodNameForOverride != null
&& opts.PerTargetOverrides.TryGetValue(methodNameForOverride, out var perTarget)
&& perTarget.SkipBodyCapture;
var requestBody = (string?)null;
var requestTruncated = false;
if (RequestHasBody(ctx.Request))
if (!skipBody && RequestHasBody(ctx.Request))
{
ctx.Request.EnableBuffering();
(requestBody, requestTruncated) =
@@ -200,15 +223,25 @@ public sealed class AuditWriteMiddleware
// The forwarding wrapper has already written every byte to the
// original sink; this just pulls back the bounded UTF-8 string.
ctx.Response.Body = originalResponseBody;
var (responseBody, responseTruncated) = captureStream.GetCapturedBody();
var (capturedResponseBody, capturedResponseTruncated) = captureStream.GetCapturedBody();
// M5.3 (T7): if SkipBodyCapture is set, discard the captured response
// body (the request body was never captured above). The row + headers
// still emit with null RequestSummary / ResponseSummary.
// Truncation flags are also cleared so ceiling-hit counter is not
// bumped for methods that deliberately opt out of body capture.
var responseBody = skipBody ? null : capturedResponseBody;
var responseTruncated = skipBody ? false : capturedResponseTruncated;
EmitInboundAudit(
ctx,
opts,
sw.ElapsedMilliseconds,
thrown,
requestBody,
responseBody,
requestTruncated || responseTruncated);
requestTruncated || responseTruncated,
requestTruncated,
responseTruncated);
}
}
@@ -219,11 +252,14 @@ public sealed class AuditWriteMiddleware
/// </summary>
private void EmitInboundAudit(
HttpContext ctx,
AuditLogOptions opts,
long durationMs,
Exception? thrown,
string? requestBody,
string? responseBody,
bool payloadTruncated)
bool payloadTruncated,
bool requestTruncated = false,
bool responseTruncated = false)
{
try
{
@@ -243,10 +279,43 @@ public sealed class AuditWriteMiddleware
var actor = isAuthFailure ? null : ResolveActor(ctx);
var methodName = ResolveMethodName(ctx);
// M5.3 (T7): increment the ceiling-hits counter once per request
// that hit the cap on EITHER the request or response body.
if (requestTruncated || responseTruncated)
{
try { _ceilingHitsCounter.Increment(); } catch { /* swallow per §7 */ }
}
// M5.3 (T7): capture request headers into Extra JSON alongside the
// existing remoteIp / userAgent provenance fields. The header
// collection is run through the SAME header-redaction list
// (AuditLogOptions.HeaderRedactList) that the ScadaBridgeAuditRedactor
// applies to RequestSummary / ResponseSummary — auth/sensitive
// headers are redacted before they land in the row. Uses the SAME
// options snapshot captured at request start (passed in as opts) as
// the SkipBodyCapture / PerTargetOverrides decisions, so a mid-request
// live-reload can't split the body-capture and header-redaction
// verdicts across two different snapshots.
var redactSet = new HashSet<string>(
opts.HeaderRedactList,
StringComparer.OrdinalIgnoreCase);
var headerDict = new Dictionary<string, string>(StringComparer.Ordinal);
foreach (var header in ctx.Request.Headers)
{
// Redact headers whose name appears in the HeaderRedactList —
// the same "<redacted>" marker used by ScadaBridgeAuditRedactor.
var value = redactSet.Contains(header.Key)
? "<redacted>"
: header.Value.ToString();
headerDict[header.Key] = value;
}
var extra = JsonSerializer.Serialize(new
{
remoteIp = ctx.Connection.RemoteIpAddress?.ToString(),
userAgent = ctx.Request.Headers.UserAgent.ToString(),
requestHeaders = headerDict,
});
var evt = ScadaBridgeAuditEventFactory.Create(
@@ -205,6 +205,47 @@ public class RouteTarget
return response.Values;
}
/// <summary>
/// Blocks until a remote instance attribute reaches <paramref name="targetValue"/>
/// or <paramref name="timeout"/> elapses (spec §6). Value-equality ONLY across the
/// wire: the target is canonically encoded via <see cref="AttributeValueCodec"/> and
/// the site evaluates equality — there is no predicate and no quality flag in the
/// comparison.
/// </summary>
/// <param name="attributeName">Name of the attribute to wait on.</param>
/// <param name="targetValue">Target value the attribute must equal for the wait to match.</param>
/// <param name="timeout">Maximum time to wait for the attribute to reach the target value.</param>
/// <param name="cancellationToken">Optional cancellation token; defaults to the method deadline.</param>
/// <returns>A task that resolves to <c>true</c> if the attribute reached the target value, <c>false</c> if the wait timed out.</returns>
public async Task<bool> WaitForAttribute(
string attributeName,
object? targetValue,
TimeSpan timeout,
CancellationToken cancellationToken = default)
{
var token = Effective(cancellationToken);
var siteId = await ResolveSiteAsync(token);
// Audit Log #23 (ParentExecutionId): mirrors the Call path — stamp the
// spawning inbound request's ExecutionId so future site-side audit
// emission for routed waits can record this wait's parent. CorrelationId
// is the per-operation lifecycle id, freshly minted per routed wait.
var request = new RouteToWaitForAttributeRequest(
Guid.NewGuid().ToString(), _instanceCode, attributeName,
AttributeValueCodec.Encode(targetValue), timeout, DateTimeOffset.UtcNow,
_parentExecutionId);
var response = await _instanceRouter.RouteToWaitForAttributeAsync(siteId, request, token);
if (!response.Success)
{
throw new InvalidOperationException(
response.ErrorMessage ?? "Remote attribute wait failed");
}
return response.Matched;
}
/// <summary>
/// Sets a single attribute value on the remote instance.
/// </summary>
@@ -18,13 +18,17 @@ namespace ZB.MOM.WW.ScadaBridge.ManagementService;
/// <summary>
/// Minimal-API endpoints exposing the central Audit Log (#23) over HTTP for the
/// ScadaBridge CLI (M8). Two routes:
/// ScadaBridge CLI (M8). Three routes:
/// <list type="bullet">
/// <item><c>GET /api/audit/query</c> — keyset-paged JSON page, gated on the
/// <see cref="AuthorizationPolicies.OperationalAudit"/> permission.</item>
/// <item><c>GET /api/audit/export</c> — streamed bulk export (csv / jsonl;
/// parquet returns HTTP 501), gated on the
/// <see cref="AuthorizationPolicies.AuditExport"/> permission.</item>
/// <item><c>GET /api/audit/tree</c> — execution-chain tree rooted at the
/// topmost ancestor of a given <c>executionId</c>, returned as a JSON array
/// of <see cref="ExecutionTreeNode"/>; gated on
/// <see cref="AuthorizationPolicies.OperationalAudit"/>.</item>
/// </list>
///
/// <para>
@@ -85,8 +89,16 @@ public static class AuditEndpoints
Converters = { new JsonStringEnumConverter() },
};
/// <summary>Default sentinel written by the backfill endpoint when the caller omits <c>sentinel</c>.</summary>
public const string DefaultBackfillSentinel = "unknown";
/// <summary>Default batch size for the backfill endpoint when the caller omits <c>batchSize</c>.</summary>
public const int DefaultBackfillBatchSize = 5000;
/// <summary>
/// Registers the <c>/api/audit/query</c> and <c>/api/audit/export</c> minimal-API endpoints.
/// Registers the <c>/api/audit/query</c>, <c>/api/audit/export</c>,
/// <c>/api/audit/tree</c>, and <c>POST /api/audit/backfill-source-node</c>
/// minimal-API endpoints.
/// </summary>
/// <param name="endpoints">The endpoint route builder to register routes on.</param>
/// <returns>The same <paramref name="endpoints"/> builder, for chaining.</returns>
@@ -94,6 +106,8 @@ public static class AuditEndpoints
{
endpoints.MapGet("/api/audit/query", (Delegate)HandleQuery);
endpoints.MapGet("/api/audit/export", (Delegate)HandleExport);
endpoints.MapGet("/api/audit/tree", (Delegate)HandleTree);
endpoints.MapPost("/api/audit/backfill-source-node", (Delegate)HandleBackfillSourceNode);
return endpoints;
}
@@ -232,6 +246,177 @@ public static class AuditEndpoints
return Results.Empty;
}
// ─────────────────────────────────────────────────────────────────────
// GET /api/audit/tree
// ─────────────────────────────────────────────────────────────────────
/// <summary>
/// Handles <c>GET /api/audit/tree?executionId=...</c>: authenticates, checks the
/// OperationalAudit permission, and returns the full execution-chain tree rooted at
/// the topmost ancestor of the supplied <c>executionId</c>. The response is a JSON
/// array of <see cref="ExecutionTreeNode"/> objects (empty array when the id is
/// not found). Returns HTTP 400 when <c>executionId</c> is absent or not a valid
/// GUID.
/// </summary>
/// <param name="context">The HTTP context for the current request.</param>
/// <returns>A task that resolves to the HTTP result (200 JSON array, 400, 401, or 403).</returns>
internal static async Task<IResult> HandleTree(HttpContext context)
{
var auth = await AuthenticateAsync(context);
if (auth.Failure is not null)
{
return auth.Failure;
}
if (!HasAnyRole(auth.User!, AuthorizationPolicies.OperationalAuditRoles))
{
return Forbidden("OperationalAudit");
}
var raw = context.Request.Query["executionId"].ToString();
if (string.IsNullOrWhiteSpace(raw) || !Guid.TryParse(raw, out var executionId))
{
return Results.Json(
new { error = "Missing or invalid 'executionId' query parameter (expected a GUID).", code = "BAD_REQUEST" },
statusCode: 400);
}
var repo = context.RequestServices.GetRequiredService<IAuditLogRepository>();
var nodes = await repo.GetExecutionTreeAsync(executionId, context.RequestAborted);
return Results.Json(nodes, JsonOptions);
}
// ─────────────────────────────────────────────────────────────────────
// POST /api/audit/backfill-source-node
// ─────────────────────────────────────────────────────────────────────
/// <summary>
/// Handles <c>POST /api/audit/backfill-source-node</c>: authenticates (Admin role
/// required), reads the JSON body for <c>sentinel</c> / <c>before</c> /
/// <c>batchSize</c>, and calls
/// <see cref="IAuditLogRepository.BackfillSourceNodeAsync"/> on the maintenance
/// path.
///
/// <para>
/// <b>Auth.</b> Admin-only — backfilling the SourceNode column is a one-time ops
/// procedure that mutates the AuditLog table via the maintenance path (NOT the
/// append-only writer role). Restricted to <see cref="AuthorizationPolicies.AuditExportRoles"/>
/// (Administrator) so it is never accessible to Viewer-role users.
/// </para>
///
/// <para>
/// <b>Request body.</b>
/// <code>
/// {
/// "sentinel": "unknown", // optional; default "unknown"
/// "before": "2026-01-01T00:00:00Z", // required ISO-8601 UTC
/// "batchSize": 5000 // optional; default 5000
/// }
/// </code>
/// </para>
///
/// <para>
/// <b>Response (200).</b>
/// <code>{ "rowsUpdated": 12345, "sentinel": "unknown", "before": "2026-01-01T00:00:00Z" }</code>
/// </para>
/// </summary>
/// <param name="context">The HTTP context for the current request.</param>
/// <returns>A task that resolves to the HTTP result (200 JSON, 400, 401, or 403).</returns>
internal static async Task<IResult> HandleBackfillSourceNode(HttpContext context)
{
var auth = await AuthenticateAsync(context);
if (auth.Failure is not null)
{
return auth.Failure;
}
// Admin-only: backfilling is a one-time ops procedure on the maintenance path.
if (!HasAnyRole(auth.User!, AuthorizationPolicies.AuditExportRoles))
{
return Forbidden("Administrator");
}
string bodyText;
try
{
using var reader = new System.IO.StreamReader(context.Request.Body);
bodyText = await reader.ReadToEndAsync(context.RequestAborted);
}
catch (OperationCanceledException)
{
return Results.Json(new { error = "Request cancelled.", code = "CANCELLED" }, statusCode: 499);
}
string sentinel = DefaultBackfillSentinel;
DateTime? beforeUtc = null;
int batchSize = DefaultBackfillBatchSize;
if (!string.IsNullOrWhiteSpace(bodyText))
{
try
{
using var doc = System.Text.Json.JsonDocument.Parse(bodyText);
var root = doc.RootElement;
if (root.TryGetProperty("sentinel", out var sentinelEl))
{
var s = sentinelEl.GetString();
if (!string.IsNullOrWhiteSpace(s))
{
sentinel = s.Trim();
}
}
if (root.TryGetProperty("before", out var beforeEl))
{
if (DateTime.TryParse(
beforeEl.GetString(),
System.Globalization.CultureInfo.InvariantCulture,
System.Globalization.DateTimeStyles.AssumeUniversal | System.Globalization.DateTimeStyles.AdjustToUniversal,
out var parsed))
{
beforeUtc = DateTime.SpecifyKind(parsed, DateTimeKind.Utc);
}
else
{
return Results.Json(
new { error = "Invalid 'before' value; expected ISO-8601 UTC datetime.", code = "BAD_REQUEST" },
statusCode: 400);
}
}
if (root.TryGetProperty("batchSize", out var batchEl) && batchEl.TryGetInt32(out var b) && b > 0)
{
batchSize = b;
}
}
catch (System.Text.Json.JsonException)
{
return Results.Json(
new { error = "Request body must be valid JSON.", code = "BAD_REQUEST" },
statusCode: 400);
}
}
if (beforeUtc is null)
{
return Results.Json(
new { error = "Required field 'before' (ISO-8601 UTC datetime) is missing.", code = "BAD_REQUEST" },
statusCode: 400);
}
var repo = context.RequestServices.GetRequiredService<IAuditLogRepository>();
var rowsUpdated = await repo.BackfillSourceNodeAsync(sentinel, beforeUtc.Value, batchSize, context.RequestAborted);
return Results.Json(new
{
rowsUpdated,
sentinel,
before = beforeUtc.Value.ToString("O", System.Globalization.CultureInfo.InvariantCulture),
}, JsonOptions);
}
/// <summary>
/// Streams every matching row as RFC 4180 CSV, paging the repository with its
/// keyset cursor and flushing after each page so a large export starts
@@ -122,6 +122,7 @@ public class NotificationOutboxActor : ReceiveActor, IWithTimers
Receive<DiscardNotificationRequest>(HandleDiscard);
Receive<NotificationKpiRequest>(HandleKpiRequest);
Receive<PerSiteNotificationKpiRequest>(HandlePerSiteKpiRequest);
Receive<PerNodeNotificationKpiRequest>(HandlePerNodeKpiRequest);
}
/// <inheritdoc />
@@ -1081,6 +1082,38 @@ public class NotificationOutboxActor : ReceiveActor, IWithTimers
return new PerSiteNotificationKpiResponse(correlationId, Success: true, ErrorMessage: null, sites);
}
/// <summary>
/// Handles a per-node KPI request, computing the per-source-node outbox metrics with the
/// same stuck cutoff and delivered window as <see cref="HandleKpiRequest"/>. Additive
/// alongside <see cref="HandlePerSiteKpiRequest"/> — does not change per-site behaviour.
/// </summary>
private void HandlePerNodeKpiRequest(PerNodeNotificationKpiRequest request)
{
var sender = Sender;
var now = DateTimeOffset.UtcNow;
var stuckCutoff = StuckCutoff(now);
var deliveredSince = now - _options.DeliveredKpiWindow;
ComputePerNodeKpisAsync(request.CorrelationId, stuckCutoff, deliveredSince).PipeTo(
sender,
success: response => response,
failure: ex => new PerNodeNotificationKpiResponse(
request.CorrelationId,
Success: false,
ErrorMessage: ex.GetBaseException().Message,
Nodes: Array.Empty<NodeNotificationKpiSnapshot>()));
}
private async Task<PerNodeNotificationKpiResponse> ComputePerNodeKpisAsync(
string correlationId, DateTimeOffset stuckCutoff, DateTimeOffset deliveredSince)
{
using var scope = _serviceProvider.CreateScope();
var repository = scope.ServiceProvider.GetRequiredService<INotificationOutboxRepository>();
var nodes = await repository.ComputePerNodeKpisAsync(stuckCutoff, deliveredSince);
return new PerNodeNotificationKpiResponse(correlationId, Success: true, ErrorMessage: null, nodes);
}
/// <summary>
/// The instant before which a still-pending notification counts as stuck — <paramref name="now"/>
/// offset back by <see cref="NotificationOutboxOptions.StuckAgeThreshold"/>.
@@ -239,6 +239,7 @@ public class SiteCallAuditActor : ReceiveActor
Receive<SiteCallDetailRequest>(HandleDetail);
Receive<SiteCallKpiRequest>(HandleKpi);
Receive<PerSiteSiteCallKpiRequest>(HandlePerSiteKpi);
Receive<PerNodeSiteCallKpiRequest>(HandlePerNodeKpi);
// Task 5 (#22): central→site Retry/Discard relay for parked cached calls.
Receive<RegisterCentralCommunication>(msg =>
@@ -817,6 +818,47 @@ public class SiteCallAuditActor : ReceiveActor
}
}
/// <summary>
/// Handles a per-node KPI request, using the same stuck cutoff and
/// interval bound as <see cref="HandleKpi"/>. Additive alongside
/// <see cref="HandlePerSiteKpi"/> — does not change per-site behaviour.
/// </summary>
private void HandlePerNodeKpi(PerNodeSiteCallKpiRequest request)
{
var sender = Sender;
var now = DateTime.UtcNow;
var stuckCutoff = now - _options.StuckAgeThreshold;
var intervalSince = now - _options.KpiInterval;
PerNodeKpiAsync(request.CorrelationId, stuckCutoff, intervalSince).PipeTo(
sender,
success: response => response,
failure: ex => new PerNodeSiteCallKpiResponse(
request.CorrelationId,
Success: false,
ErrorMessage: ex.GetBaseException().Message,
Nodes: Array.Empty<SiteCallNodeKpiSnapshot>()));
}
private async Task<PerNodeSiteCallKpiResponse> PerNodeKpiAsync(
string correlationId, DateTime stuckCutoff, DateTime intervalSince)
{
var (scope, repository) = ResolveRepository();
try
{
var nodes = await repository
.ComputePerNodeKpisAsync(stuckCutoff, intervalSince)
.ConfigureAwait(false);
return new PerNodeSiteCallKpiResponse(
correlationId, Success: true, ErrorMessage: null, nodes);
}
finally
{
scope?.Dispose();
}
}
// ── Task 5: central→site Retry/Discard relay ──
/// <summary>
@@ -571,7 +571,20 @@ public class AlarmActor : ReceiveActor
/// Passes the firing alarm's level/priority/message so the script can
/// branch on severity via the <c>Alarm</c> global.
/// </summary>
private void SpawnAlarmExecution(AlarmLevel level, int priority, string message)
/// <param name="level">The firing alarm severity level.</param>
/// <param name="priority">The firing alarm priority.</param>
/// <param name="message">The firing alarm message.</param>
/// <param name="parentExecutionId">
/// Audit Log #23 (M5.4 — ParentExecutionId tag-cascade): the execution id of
/// the context that fired this alarm, recorded as the on-trigger script run's
/// <c>ParentExecutionId</c> so the alarm-triggered run chains under its firing
/// context in the audit tree. The alarm subsystem currently has no Guid-typed
/// firing id, so the only call sites pass <c>null</c> (the on-trigger run is a
/// root). The parameter exists so a future firing-id can flow without
/// touching the actor wiring.
/// </param>
private void SpawnAlarmExecution(
AlarmLevel level, int priority, string message, Guid? parentExecutionId = null)
{
if (_onTriggerCompiledScript == null) return;
@@ -591,7 +604,9 @@ public class AlarmActor : ReceiveActor
_options,
_logger,
// M2.5 (#9): per-script timeout from the on-trigger script (null = global).
_onTriggerExecutionTimeoutSeconds));
_onTriggerExecutionTimeoutSeconds,
// Audit Log #23 (M5.4): the firing context's execution id (null today).
parentExecutionId));
Context.ActorOf(props, executionId);
}
@@ -29,6 +29,14 @@ public class AlarmExecutionActor : ReceiveActor
/// <param name="options">Site runtime configuration options, including the execution timeout.</param>
/// <param name="logger">Logger for execution diagnostics.</param>
/// <param name="executionTimeoutSeconds">M2.5 (#9): the on-trigger script's per-script execution timeout in seconds. Null or non-positive falls back to the global <see cref="SiteRuntimeOptions.ScriptExecutionTimeoutSeconds"/>.</param>
/// <param name="parentExecutionId">
/// Audit Log #23 (M5.4 — ParentExecutionId tag-cascade): the execution id of
/// the context that fired this alarm, threaded into the on-trigger script's
/// <see cref="ScriptRuntimeContext"/> as its <c>ParentExecutionId</c> so the
/// alarm-triggered run chains under its firing context. Null today (no
/// Guid-typed firing id exists yet) — the run is a root, but the plumbing
/// is in place for a future firing id.
/// </param>
public AlarmExecutionActor(
string alarmName,
string instanceName,
@@ -42,7 +50,9 @@ public class AlarmExecutionActor : ReceiveActor
ILogger logger,
// M2.5 (#9): per-script execution timeout override (seconds) for the
// alarm on-trigger script. Null or non-positive falls back to the global.
int? executionTimeoutSeconds = null)
int? executionTimeoutSeconds = null,
// Audit Log #23 (M5.4): the firing context's execution id (null today).
Guid? parentExecutionId = null)
{
var self = Self;
var parent = Context.Parent;
@@ -51,7 +61,7 @@ public class AlarmExecutionActor : ReceiveActor
alarmName, instanceName, level, priority, message,
compiledScript, instanceActor,
sharedScriptLibrary, options, self, parent, logger,
executionTimeoutSeconds);
executionTimeoutSeconds, parentExecutionId);
}
private static void ExecuteAlarmScript(
@@ -67,7 +77,8 @@ public class AlarmExecutionActor : ReceiveActor
IActorRef self,
IActorRef parent,
ILogger logger,
int? executionTimeoutSeconds)
int? executionTimeoutSeconds,
Guid? parentExecutionId)
{
// M2.5 (#9): per-script timeout overrides the global default. A null or
// non-positive per-script value (≤ 0) falls back to the global.
@@ -95,7 +106,19 @@ public class AlarmExecutionActor : ReceiveActor
options.MaxScriptCallDepth,
timeout,
instanceName,
logger);
logger,
// Audit Log #23 (M5.4 — ParentExecutionId tag-cascade): the
// alarm on-trigger run mints its own fresh ExecutionId (the
// ctor's `?? NewGuid()` fallback) and records the firing
// context's id as its ParentExecutionId — null today, so the
// run is a root, but the plumbing exists for a future
// firing id.
parentExecutionId: parentExecutionId,
// WaitForAttribute (spec §4.4): thread the alarm on-trigger
// script's per-script execution-timeout token so a
// Attributes.WaitAsync inside an on-trigger script is bounded
// by the same script deadline.
scriptTimeoutToken: cts.Token);
var globals = new ScriptGlobals
{
@@ -149,6 +149,7 @@ public class DeploymentManagerActor : ReceiveActor, IWithTimers
Receive<RouteToCallRequest>(RouteInboundApiCall);
Receive<RouteToGetAttributesRequest>(RouteInboundApiGetAttributes);
Receive<RouteToSetAttributesRequest>(RouteInboundApiSetAttributes);
Receive<RouteToWaitForAttributeRequest>(RouteInboundApiWaitForAttribute);
// OPC UA Tag Browser — singleton-only re-forward to local /user/dcl-manager.
// BrowseNodeCommand is routed to this singleton (active node) by
@@ -1078,6 +1079,45 @@ public class DeploymentManagerActor : ReceiveActor, IWithTimers
}).PipeTo(sender);
}
/// <summary>
/// Spec §6 (WD-2b): unpacks a routed <see cref="RouteToWaitForAttributeRequest"/>
/// (inbound-API <c>Route.To().WaitForAttribute()</c>) into the deployed
/// Instance Actor's site-local <see cref="WaitForAttributeRequest"/> and relays
/// the result back. Value-equality only across the wire — the predicate is null
/// and <c>RequireGoodQuality</c> is left at its default. The Ask is bounded by the
/// wait timeout plus slack (NOT a fixed 30s), since the wait legitimately blocks
/// for up to <see cref="RouteToWaitForAttributeRequest.Timeout"/>.
/// </summary>
private void RouteInboundApiWaitForAttribute(RouteToWaitForAttributeRequest request)
{
if (!_instanceActors.TryGetValue(request.InstanceUniqueName, out var instanceActor))
{
Sender.Tell(new RouteToWaitForAttributeResponse(
request.CorrelationId, false, null, null, false,
false, $"Instance '{request.InstanceUniqueName}' not found on this site.",
DateTimeOffset.UtcNow));
return;
}
var sender = Sender;
// Routed waits are value-equality only (predicate null); RequireGoodQuality left at default.
var inner = new WaitForAttributeRequest(
request.CorrelationId, request.InstanceUniqueName, request.AttributeName,
request.TargetValueEncoded, null, request.Timeout, DateTimeOffset.UtcNow);
// Ask bounded by the WAIT timeout + slack — NOT a fixed 30s (the wait legitimately blocks up to request.Timeout).
instanceActor.Ask<WaitForAttributeResponse>(inner, request.Timeout + TimeSpan.FromSeconds(5))
.ContinueWith(t => t.IsCompletedSuccessfully
? new RouteToWaitForAttributeResponse(
request.CorrelationId, t.Result.Matched, t.Result.Value, t.Result.Quality, t.Result.TimedOut,
true, null, DateTimeOffset.UtcNow)
: new RouteToWaitForAttributeResponse(
request.CorrelationId, false, null, null, false,
false, t.Exception?.GetBaseException().Message ?? "Attribute wait timed out",
DateTimeOffset.UtcNow))
.PipeTo(sender);
}
/// <summary>
/// Writes attribute values on a deployed instance for a Route.To().SetAttribute(s)
/// call (or a central Test Run bound to the instance). Each write is Ask'd to the
@@ -68,6 +68,18 @@ public class InstanceActor : ReceiveActor
// mirroring the rest of the actor's by-name dictionaries).
private readonly Dictionary<string, ResolvedAttribute> _resolvedAttributeByName = new();
// WaitForAttribute (spec §4.2): one-shot waiter registry keyed by the
// request CorrelationId. Each entry holds the watched attribute name, the
// match test (decoded target equality OR a site-local predicate), the
// original Sender to reply to, and the scheduled-timeout handle so a match
// can cancel it. Single-threaded actor access — no locking needed.
private readonly Dictionary<string, PendingWait> _attributeWaiters = new();
// WaitForAttribute: defensive per-instance cap so a script leaking waiters
// in a loop cannot grow the registry without bound. Exceeding it refuses the
// wait with an error reply rather than registering.
private const int MaxAttributeWaiters = 100;
// DCL manager actor reference for subscribing to tag values
private readonly IActorRef? _dclManager;
// Maps each tag path to every attribute canonical name that references it.
@@ -170,6 +182,12 @@ public class InstanceActor : ReceiveActor
// WP-22/23: Handle attribute value changes from DCL (Tell pattern)
Receive<AttributeValueChanged>(HandleAttributeValueChanged);
// WaitForAttribute (spec §4.2): event-driven "wait for value" waiter
// registration + its scheduled-timeout self-message. Both flow only
// site-locally (the predicate variant carries a non-serializable delegate).
Receive<WaitForAttributeRequest>(HandleWaitForAttribute);
Receive<WaitForAttributeTimeout>(HandleWaitForAttributeTimeout);
// Handle tag value updates from DCL — convert to AttributeValueChanged
Receive<TagValueUpdate>(HandleTagValueUpdate);
Receive<SubscribeTagsResponse>(_ => { }); // Ack from DCL subscribe — no action needed
@@ -519,6 +537,114 @@ public class InstanceActor : ReceiveActor
PublishAndNotifyChildren(changed);
}
/// <summary>
/// WaitForAttribute (spec §4.2): registers a one-shot event-driven waiter for
/// an attribute to reach a value (encoded-equality), satisfy a site-local
/// predicate, or change at all. The current-value fast-path and the
/// change-handling in <see cref="HandleAttributeValueChanged"/> both run on
/// this single-threaded actor, so a value that flips between "read current"
/// and "register" cannot be missed (spec §5).
/// </summary>
private void HandleWaitForAttribute(WaitForAttributeRequest req)
{
// Capture the sender immediately — Sender is invalid once we schedule /
// return and a later message arrives.
var replyer = Sender;
// Build the match test: explicit predicate wins; else null encoded target
// means "any change"; else compare the codec-encoded current value to the
// encoded target (avoids needing the attribute's DataType to decode).
Func<object?, bool> test;
if (req.Predicate is not null)
{
test = req.Predicate;
}
else if (req.TargetValueEncoded is null)
{
test = _ => true;
}
else
{
var target = req.TargetValueEncoded;
test = v => string.Equals(
AttributeValueCodec.Encode(v), target, StringComparison.Ordinal);
}
// Fast path: the current value already satisfies the test → reply now.
// A script-supplied predicate (or the codec-equality lambda) runs on the
// actor thread; guard it so a throwing predicate cannot crash the actor or
// leak a never-resolved waiter. On throw: reply non-matched + ErrorMessage
// and return WITHOUT registering (no timeout scheduled).
if (_attributes.TryGetValue(req.AttributeName, out var current))
{
// Effective quality used for BOTH the §4.2 quality gate and the match
// reply — the same `?? "Good"` default the reply has always used.
_attributeQualities.TryGetValue(req.AttributeName, out var fastQuality);
var effectiveQuality = fastQuality ?? "Good";
bool fastMatch;
try
{
// §4.2 quality gate ANDed with the value test, both INSIDE the guard:
// in quality-gated mode a value already at target but at Bad/Uncertain
// quality is NOT a fast match — it falls through to register + schedule
// the timeout like any other pending waiter (do NOT fast-reply matched).
fastMatch =
(!req.RequireGoodQuality
|| string.Equals(effectiveQuality, "Good", StringComparison.Ordinal))
&& test(current);
}
catch (Exception ex)
{
_logger.LogWarning(ex,
"WaitForAttribute predicate threw on the fast-path for {Instance}.{Attribute}; refusing the wait",
_instanceUniqueName, req.AttributeName);
replyer.Tell(new WaitForAttributeResponse(
req.CorrelationId, Matched: false, null, null, TimedOut: false,
ErrorMessage: "Wait predicate threw: " + ex.Message));
return;
}
if (fastMatch)
{
replyer.Tell(new WaitForAttributeResponse(
req.CorrelationId, Matched: true, current, effectiveQuality, TimedOut: false));
return;
}
}
// Defensive cap: refuse rather than register if the instance already has
// too many concurrent waiters (guards against a script leaking waiters).
if (_attributeWaiters.Count >= MaxAttributeWaiters)
{
replyer.Tell(new WaitForAttributeResponse(
req.CorrelationId, Matched: false, null, null, TimedOut: false,
ErrorMessage: "Too many concurrent attribute waiters on this instance"));
return;
}
// Register and schedule the self-evicting timeout (NativeAlarmActor idiom).
var handle = Context.System.Scheduler.ScheduleTellOnceCancelable(
req.Timeout, Self, new WaitForAttributeTimeout(req.CorrelationId), Self);
_attributeWaiters[req.CorrelationId] =
new PendingWait(req.AttributeName, test, replyer, handle, req.RequireGoodQuality);
}
/// <summary>
/// WaitForAttribute (spec §4.2): the scheduled timeout fired for a waiter that
/// never matched. If still registered (a match would have removed + canceled
/// it), reply TimedOut and evict it.
/// </summary>
private void HandleWaitForAttributeTimeout(WaitForAttributeTimeout msg)
{
if (_attributeWaiters.Remove(msg.CorrelationId, out var pending))
{
pending.Replyer.Tell(new WaitForAttributeResponse(
msg.CorrelationId, Matched: false, null, null, TimedOut: true));
}
}
/// <summary>
/// Handles tag value updates from DCL. Maps the tag path back to the attribute
/// canonical name and converts to an AttributeValueChanged for unified processing.
@@ -556,9 +682,14 @@ public class InstanceActor : ReceiveActor
_attributeQualities[attrName] = "Bad";
_attributeTimestamps[attrName] = update.Timestamp;
var currentValue = _attributes.GetValueOrDefault(attrName);
// WaitForAttribute (spec §4.2): quality-only republish — the
// stored value is UNCHANGED (we publish the OLD currentValue, only
// the quality flips to Bad). Do NOT evaluate waiters, or an
// "any-change" / unchanged-value-equality waiter would fire on a
// non-change.
PublishAndNotifyChildren(new AttributeValueChanged(
_instanceUniqueName, update.TagPath, attrName,
currentValue, "Bad", update.Timestamp));
currentValue, "Bad", update.Timestamp), evaluateWaiters: false);
}
continue;
}
@@ -908,7 +1039,17 @@ public class InstanceActor : ReceiveActor
/// Publishes attribute change to stream and notifies child Script/Alarm actors.
/// WP-22: Tell for attribute notifications (fire-and-forget, never blocks).
/// </summary>
private void PublishAndNotifyChildren(AttributeValueChanged changed)
/// <param name="changed">The attribute change to publish.</param>
/// <param name="evaluateWaiters">
/// WaitForAttribute (spec §4.2): when <c>true</c> (the default), registered
/// <c>Attributes.WaitAsync</c> waiters on this attribute are re-evaluated against
/// <paramref name="changed"/>'s value. Pass <c>false</c> on republish/quality-only
/// paths that do NOT assign a new value to <c>_attributes[name]</c> (e.g. the
/// List-coerce-failure Bad-quality republish, which publishes the OLD value) —
/// otherwise an "any-change" waiter (or a waiter whose target equals the unchanged
/// value) would spuriously fire even though nothing actually changed.
/// </param>
private void PublishAndNotifyChildren(AttributeValueChanged changed, bool evaluateWaiters = true)
{
// WP-23: Publish to site-wide stream
_streamManager?.PublishAttributeValueChanged(changed);
@@ -924,6 +1065,83 @@ public class InstanceActor : ReceiveActor
{
alarmActor.Tell(changed);
}
// WaitForAttribute (spec §4.2): re-evaluate any waiters on THIS attribute —
// but ONLY when this publish reflects a real value change (evaluateWaiters).
// The genuine value-change paths (HandleAttributeValueChanged, the scalar
// DCL update path, HandleSetStaticAttributeCore) call it AFTER assigning
// _attributes[name], so changed.Value is the just-applied current value.
// Republish/quality-only paths (List-coerce-failure Bad-quality, which
// publishes the OLD value) pass evaluateWaiters:false so an "any-change" or
// unchanged-value-equality waiter does not spuriously fire (spec §4.2).
// Iterate a snapshot so satisfied waiters can be removed during the loop;
// each match cancels its scheduled timeout (so no stray WaitForAttributeTimeout
// follows) and replies Matched=true.
if (evaluateWaiters)
ResolveMatchedWaiters(changed);
}
/// <summary>
/// WaitForAttribute (spec §4.2): fires every registered waiter on
/// <paramref name="changed"/>'s attribute whose test now passes against the
/// just-applied value — cancelling its timeout, replying Matched, and removing
/// it from the registry. A no-op when there are no waiters.
///
/// <para>
/// Each waiter's match test runs inside a per-waiter try/catch: a throwing
/// script-supplied predicate (or codec lambda) must NOT abort the loop and
/// strand sibling waiters on the same attribute, nor leave the throwing waiter
/// registered with a live scheduled timeout. On throw we cancel that waiter's
/// timeout, reply non-matched + ErrorMessage, remove it, and continue.
/// </para>
/// </summary>
private void ResolveMatchedWaiters(AttributeValueChanged changed)
{
if (_attributeWaiters.Count == 0)
return;
// Snapshot the candidate waiters on THIS attribute. Iterating a snapshot
// (and NOT evaluating the test inside the LINQ filter) keeps removal mid-loop
// safe and ensures one throwing test cannot abort materialization for siblings.
var candidates = _attributeWaiters
.Where(kvp => kvp.Value.AttributeName == changed.AttributeName)
.ToList();
foreach (var (cid, pending) in candidates)
{
bool matched;
try
{
// §4.2 quality gate ANDed with the value test, both INSIDE the guard:
// in quality-gated mode a value reaching the target at Bad/Uncertain
// quality is NOT a match — the waiter stays pending until it satisfies
// the test at Good quality (or times out).
matched =
(!pending.RequireGoodQuality
|| string.Equals(changed.Quality, "Good", StringComparison.Ordinal))
&& pending.Test(changed.Value);
}
catch (Exception ex)
{
_logger.LogWarning(ex,
"WaitForAttribute predicate threw while resolving waiter {CorrelationId} on {Instance}.{Attribute}; evicting it",
cid, _instanceUniqueName, changed.AttributeName);
pending.Timeout.Cancel();
pending.Replyer.Tell(new WaitForAttributeResponse(
cid, Matched: false, null, null, TimedOut: false,
ErrorMessage: "Wait predicate threw: " + ex.Message));
_attributeWaiters.Remove(cid);
continue;
}
if (!matched)
continue;
pending.Timeout.Cancel();
pending.Replyer.Tell(new WaitForAttributeResponse(
cid, Matched: true, changed.Value, changed.Quality, TimedOut: false));
_attributeWaiters.Remove(cid);
}
}
/// <summary>
@@ -1202,4 +1420,23 @@ public class InstanceActor : ReceiveActor
/// Internal message for async override loading result.
/// </summary>
internal record LoadOverridesResult(Dictionary<string, string> Overrides, string? Error);
/// <summary>
/// WaitForAttribute (spec §4.2): one registered, not-yet-satisfied waiter.
/// </summary>
/// <param name="AttributeName">The attribute this waiter watches (scope-resolved).</param>
/// <param name="Test">The match test (decoded-target equality OR site-local predicate OR any-change).</param>
/// <param name="Replyer">The original sender to reply to on match / timeout.</param>
/// <param name="Timeout">The scheduled timeout handle, canceled on match.</param>
/// <param name="RequireGoodQuality">
/// Quality-gated ("Good"-only) mode (spec §4.2): when <c>true</c>, the resolve
/// loop additionally requires <c>changed.Quality == "Good"</c> before the test
/// can match.
/// </param>
private sealed record PendingWait(
string AttributeName,
Func<object?, bool> Test,
IActorRef Replyer,
ICancelable Timeout,
bool RequireGoodQuality);
}
@@ -221,7 +221,12 @@ public class ScriptExecutionActor : ReceiveActor
// M2.12 (#25): thread the singleton site event logger so
// recursion-limit violations at CallScript/CallShared emit a
// script Error site event in addition to ILogger.LogError.
siteEventLogger: siteEventLogger);
siteEventLogger: siteEventLogger,
// WaitForAttribute (spec §4.3/§4.4): thread the per-script
// execution-timeout token so Attributes.WaitAsync's Ask is
// bounded by the script's own ExecutionTimeoutSeconds — a
// shorter script deadline wins over the wait's own timeout.
scriptTimeoutToken: cts.Token);
var globals = new ScriptGlobals
{
@@ -73,6 +73,107 @@ public class AttributeAccessor
/// <returns>A task that represents the asynchronous operation.</returns>
public Task SetAsync(string key, object? value)
=> _ctx.SetAttribute(Resolve(key), AttributeValueCodec.Encode(value) ?? string.Empty);
/// <summary>
/// WaitForAttribute (spec §3-§5): waits event-driven until the attribute equals
/// <paramref name="targetValue"/> (value-equality, codec-normalized), bounded by
/// <paramref name="timeout"/>. Returns <c>true</c> if matched within the timeout,
/// <c>false</c> on timeout (no throw). Honors the script's execution-timeout token.
/// Scope/composition path resolution (<see cref="Resolve"/>) is applied just like
/// <see cref="GetAsync"/> / <see cref="SetAsync"/>.
///
/// <para>
/// <b>Quality-agnostic by default (spec §4.2):</b> matching tests the VALUE, not
/// the quality — a value arriving at Bad quality still satisfies the wait. Pass
/// <paramref name="requireGoodQuality"/><c>:true</c> for quality-gated ("Good"-only)
/// matching: a value reaching the target at Bad/Uncertain quality is ignored and
/// the wait holds until the target is reached at "Good" quality (or times out).
/// </para>
///
/// <para>
/// Passing a <b>null</b> <paramref name="targetValue"/> means "match on any change":
/// the wait then matches the next value the attribute receives — and matches
/// IMMEDIATELY (fast-path) if the attribute already holds any value at registration.
/// </para>
/// </summary>
/// <param name="key">The attribute key (scope-resolved before the wait is registered).</param>
/// <param name="targetValue">
/// The value to wait for (codec-encoded for comparison); <c>null</c> means
/// "match on any change" (matches immediately if the attribute already has a value).
/// </param>
/// <param name="timeout">How long to wait before returning false.</param>
/// <param name="requireGoodQuality">
/// <c>true</c> for quality-gated ("Good"-only) matching (spec §4.2); defaults to
/// <c>false</c> (quality-agnostic — Bad/Uncertain-quality transients still match).
/// </param>
/// <returns><c>true</c> on match within the timeout; <c>false</c> on timeout.</returns>
public Task<bool> WaitAsync(string key, object? targetValue, TimeSpan timeout, bool requireGoodQuality = false)
=> _ctx.WaitAttribute(Resolve(key), AttributeValueCodec.Encode(targetValue), null, timeout, requireGoodQuality);
/// <summary>
/// WaitForAttribute (spec §3-§5): predicate form — waits event-driven until
/// <paramref name="predicate"/> returns <c>true</c> for the attribute's current
/// value, bounded by <paramref name="timeout"/>. Site-local only (the predicate
/// is an in-process delegate). Returns <c>true</c> if matched within the timeout,
/// <c>false</c> on timeout (no throw). Scope/composition path resolution applies.
///
/// <para>
/// <b>Quality-agnostic by default (spec §4.2):</b> the predicate is tested against
/// the VALUE, regardless of quality — a value arriving at Bad quality still
/// satisfies the wait if the predicate passes. Pass <paramref name="requireGoodQuality"/>
/// <c>:true</c> for quality-gated ("Good"-only) matching: a value satisfying the
/// predicate at Bad/Uncertain quality is ignored until it does so at "Good" quality.
/// </para>
/// </summary>
/// <param name="key">The attribute key (scope-resolved before the wait is registered).</param>
/// <param name="predicate">The site-local predicate tested against the current value.</param>
/// <param name="timeout">How long to wait before returning false.</param>
/// <param name="requireGoodQuality">
/// <c>true</c> for quality-gated ("Good"-only) matching (spec §4.2); defaults to
/// <c>false</c> (quality-agnostic).
/// </param>
/// <returns><c>true</c> on match within the timeout; <c>false</c> on timeout.</returns>
public Task<bool> WaitAsync(string key, Func<object?, bool> predicate, TimeSpan timeout, bool requireGoodQuality = false)
=> _ctx.WaitAttribute(Resolve(key), null, predicate, timeout, requireGoodQuality);
/// <summary>
/// WaitForAttribute (spec §3): richer value-equality form — like
/// <see cref="WaitAsync(string, object?, TimeSpan, bool)"/> but returns the full
/// <see cref="WaitResult"/> (matched flag + matched value + quality + timed-out
/// flag) instead of a bare bool. Scope/composition path resolution
/// (<see cref="Resolve"/>) is applied to <paramref name="key"/> just like the
/// other accessors. Never throws on timeout — a timeout yields
/// <c>WaitResult { Matched = false, TimedOut = true }</c>.
/// </summary>
/// <param name="key">The attribute key (scope-resolved before the wait is registered).</param>
/// <param name="targetValue">
/// The value to wait for (codec-encoded for comparison); <c>null</c> means
/// "match on any change".
/// </param>
/// <param name="timeout">How long to wait before returning a timed-out result.</param>
/// <param name="requireGoodQuality">
/// <c>true</c> for quality-gated ("Good"-only) matching (spec §4.2); defaults to <c>false</c>.
/// </param>
/// <returns>The full <see cref="WaitResult"/> for the wait.</returns>
public Task<WaitResult> WaitForAsync(string key, object? targetValue, TimeSpan timeout, bool requireGoodQuality = false)
=> _ctx.WaitAttributeFull(Resolve(key), AttributeValueCodec.Encode(targetValue), null, timeout, requireGoodQuality);
/// <summary>
/// WaitForAttribute (spec §3): richer predicate form — like
/// <see cref="WaitAsync(string, Func{object?, bool}, TimeSpan, bool)"/> but returns
/// the full <see cref="WaitResult"/>. Site-local only (the predicate is an
/// in-process delegate). Scope/composition path resolution applies. Never throws
/// on timeout (<c>WaitResult { Matched = false, TimedOut = true }</c>).
/// </summary>
/// <param name="key">The attribute key (scope-resolved before the wait is registered).</param>
/// <param name="predicate">The site-local predicate tested against the current value.</param>
/// <param name="timeout">How long to wait before returning a timed-out result.</param>
/// <param name="requireGoodQuality">
/// <c>true</c> for quality-gated ("Good"-only) matching (spec §4.2); defaults to <c>false</c>.
/// </param>
/// <returns>The full <see cref="WaitResult"/> for the wait.</returns>
public Task<WaitResult> WaitForAsync(string key, Func<object?, bool> predicate, TimeSpan timeout, bool requireGoodQuality = false)
=> _ctx.WaitAttributeFull(Resolve(key), null, predicate, timeout, requireGoodQuality);
}
/// <summary>
@@ -46,6 +46,16 @@ public class ScriptRuntimeContext
private readonly ILogger _logger;
private readonly string _instanceName;
/// <summary>
/// WaitForAttribute (spec §4.3): the per-script execution-timeout token from
/// the owning <c>ScriptExecutionActor</c>/<c>AlarmExecutionActor</c>
/// (<c>cts.Token</c>). Bounds the <c>Attributes.WaitAsync</c> Ask so a script
/// that hits its own <c>ExecutionTimeoutSeconds</c> abandons the wait. Defaults
/// to <see cref="CancellationToken.None"/> for contexts that do not thread one
/// (legacy callers / tests / the alarm path when it has no CTS).
/// </summary>
private readonly CancellationToken _scriptTimeoutToken;
/// <summary>
/// WP-13: External system client for ExternalSystem.Call/CachedCall.
/// </summary>
@@ -194,6 +204,13 @@ public class ScriptRuntimeContext
/// <c>ILogger.LogError</c> + throw. When null the existing behaviour is
/// unchanged; all existing callers and tests remain source-compatible.
/// </param>
/// <param name="scriptTimeoutToken">
/// WaitForAttribute (spec §4.3): the per-script execution-timeout token
/// (<c>cts.Token</c> on the owning execution actor) used to bound
/// <c>Attributes.WaitAsync</c>. Defaults to
/// <see cref="CancellationToken.None"/> for callers / tests that do not
/// thread one — those waits are bounded only by their own timeout.
/// </param>
public ScriptRuntimeContext(
IActorRef instanceActor,
IActorRef self,
@@ -215,7 +232,8 @@ public class ScriptRuntimeContext
Guid? executionId = null,
Guid? parentExecutionId = null,
string? sourceNode = null,
ISiteEventLogger? siteEventLogger = null)
ISiteEventLogger? siteEventLogger = null,
CancellationToken scriptTimeoutToken = default)
{
_instanceActor = instanceActor;
_self = self;
@@ -245,6 +263,66 @@ public class ScriptRuntimeContext
_parentExecutionId = parentExecutionId;
// M2.12 (#25): optional — null when not wired (tests / AlarmExecutionActor).
_siteEventLogger = siteEventLogger;
// WaitForAttribute (spec §4.3): default(CancellationToken) == None when
// not threaded in — the WaitAsync Ask is then bounded only by its own timeout.
_scriptTimeoutToken = scriptTimeoutToken;
}
/// <summary>
/// Audit Log #23 (M5.4): this run's own per-execution id. Exposed so a
/// nested <c>Scripts.CallShared</c> can record it as the spawned shared
/// script's <c>ParentExecutionId</c>, forming a true execution tree.
/// </summary>
internal Guid ExecutionId => _executionId;
/// <summary>
/// Audit Log #23 (M5.4): the spawning execution's id for this run (null for
/// a root run). Exposed for test assertions on the execution tree.
/// </summary>
internal Guid? ParentExecutionId => _parentExecutionId;
/// <summary>
/// Audit Log #23 (M5.4 — ParentExecutionId tag-cascade): builds a child
/// <see cref="ScriptRuntimeContext"/> for an inline <c>Scripts.CallShared</c>
/// invocation. The shared script runs inline (no actor hop) but is modelled
/// as its OWN execution node in the audit tree: it mints a fresh
/// <see cref="_executionId"/> and records THIS run's <see cref="_executionId"/>
/// as its <c>ParentExecutionId</c>, so <c>B → CallShared(C)</c> yields
/// <c>C.ParentExecutionId == B.ExecutionId</c>. Every other dependency
/// (actors, gateways, audit writer, site id, source node, call-depth) is
/// carried over verbatim from this context.
/// </summary>
/// <param name="childCallDepth">The recursion depth of the shared-script call.</param>
internal ScriptRuntimeContext CreateChildContextForSharedScript(int childCallDepth)
{
return new ScriptRuntimeContext(
_instanceActor,
_self,
_sharedScriptLibrary,
childCallDepth,
_maxCallDepth,
_askTimeout,
_instanceName,
_logger,
_externalSystemClient,
_databaseGateway,
_storeAndForward,
_siteCommunicationActor,
_siteId,
_sourceScript,
_auditWriter,
_operationTrackingStore,
_cachedForwarder,
// Fresh execution id for the shared-script run (omit so the ctor mints one)…
executionId: null,
// …parented to THIS run's execution id (the spawner).
parentExecutionId: _executionId,
sourceNode: _sourceNode,
siteEventLogger: _siteEventLogger,
// WaitForAttribute (spec §4.3): an inline shared-script call shares the
// parent run's execution-timeout token so a WaitAsync inside the shared
// script is bounded by the SAME script deadline.
scriptTimeoutToken: _scriptTimeoutToken);
}
/// <summary>
@@ -307,6 +385,115 @@ public class ScriptRuntimeContext
return response.Value;
}
/// <summary>
/// WaitForAttribute (spec §3-§5): waits event-driven for an attribute to reach
/// a value (encoded-equality), satisfy a site-local predicate, or change at all,
/// bounded by <paramref name="timeout"/>. Returns <c>true</c> if matched within
/// the timeout, <c>false</c> on timeout — NEVER throws on timeout. The backing
/// <c>Attributes.WaitAsync</c> for the accessor.
///
/// <para>
/// The Ask is bounded by the script's own execution-timeout token (§4.3): a
/// script that hits its <c>ExecutionTimeoutSeconds</c> abandons the wait. The
/// Ask timeout is the wait timeout plus a small <see cref="_askTimeout"/> slack
/// so the InstanceActor's own scheduled timeout reply is the authoritative path
/// for the false/timed-out outcome, not the Ask deadline.
/// </para>
///
/// <para>
/// <b>Quality-agnostic by default (spec §4.2):</b> a value arriving at Bad
/// quality still satisfies the wait — the match tests the value, not the quality.
/// A quality-gated ("Good"-only) mode is a planned enhancement, deferred per spec §4.2.
/// </para>
///
/// <para>
/// <b>Never throws on timeout.</b> An <see cref="Akka.Actor.AskTimeoutException"/>
/// (the pathological case where the InstanceActor's authoritative timeout reply
/// never arrives — actor stopped/restarted) is caught and surfaced as <c>false</c>,
/// matching the timeout contract. An <see cref="OperationCanceledException"/> /
/// <see cref="TaskCanceledException"/> from the script-deadline token is NOT caught
/// — it propagates to abort the script (intended §4.3 behaviour).
/// </para>
/// </summary>
/// <param name="name">The scope-resolved attribute name to wait on.</param>
/// <param name="targetValueEncoded">
/// The codec-encoded target value; null (with null <paramref name="predicate"/>)
/// means "any change".
/// </param>
/// <param name="predicate">Site-local predicate; null when the encoded target is used.</param>
/// <param name="timeout">How long to wait before returning false.</param>
/// <param name="requireGoodQuality">
/// Quality-gated ("Good"-only) mode (spec §4.2): when <see langword="true"/>, a
/// value reaching the target / satisfying the predicate at Bad/Uncertain quality
/// is NOT a match — the wait holds until the value satisfies the test at Good
/// quality (or times out). Defaults to <see langword="false"/> (quality-agnostic).
/// </param>
/// <returns><c>true</c> on match within the timeout; <c>false</c> on timeout.</returns>
public async Task<bool> WaitAttribute(
string name, string? targetValueEncoded, Func<object?, bool>? predicate, TimeSpan timeout,
bool requireGoodQuality = false)
=> (await WaitInternal(name, targetValueEncoded, predicate, timeout, requireGoodQuality)).Matched;
/// <summary>
/// WaitForAttribute (spec §3): the richer overload backing <c>Attributes.WaitForAsync</c>
/// — identical semantics to <see cref="WaitAttribute"/> but surfaces the full
/// <see cref="WaitResult"/> (matched flag + matched value + quality + timed-out
/// flag) instead of a bare bool. Never throws on timeout (see <see cref="WaitInternal"/>).
/// </summary>
/// <param name="name">The scope-resolved attribute name to wait on.</param>
/// <param name="targetValueEncoded">The codec-encoded target value; null (with null predicate) means "any change".</param>
/// <param name="predicate">Site-local predicate; null when the encoded target is used.</param>
/// <param name="timeout">How long to wait before returning a timed-out result.</param>
/// <param name="requireGoodQuality">Quality-gated ("Good"-only) mode (spec §4.2); defaults to <see langword="false"/>.</param>
/// <returns>The full <see cref="WaitResult"/> — on timeout: <c>Matched:false, TimedOut:true</c>.</returns>
public async Task<WaitResult> WaitAttributeFull(
string name, string? targetValueEncoded, Func<object?, bool>? predicate, TimeSpan timeout,
bool requireGoodQuality = false)
{
var r = await WaitInternal(name, targetValueEncoded, predicate, timeout, requireGoodQuality);
return new WaitResult(r.Matched, r.Value, r.Quality, r.TimedOut);
}
/// <summary>
/// Shared core for <see cref="WaitAttribute"/> / <see cref="WaitAttributeFull"/>:
/// builds the <see cref="WaitForAttributeRequest"/> (incl. the §4.2
/// <paramref name="requireGoodQuality"/> flag), Asks the InstanceActor bounded by
/// the script's execution-timeout token, and returns the full response. An
/// <see cref="AskTimeoutException"/> (the pathological case where the actor's own
/// authoritative timeout reply never arrives — actor stopped/restarted) is caught
/// and surfaced as a synthetic non-matched/timed-out response, preserving the
/// "never throw on timeout" contract. An <see cref="OperationCanceledException"/> /
/// <see cref="TaskCanceledException"/> from the script-deadline token is NOT caught
/// — it propagates to abort the script (§4.3).
/// </summary>
private async Task<WaitForAttributeResponse> WaitInternal(
string name, string? targetValueEncoded, Func<object?, bool>? predicate, TimeSpan timeout,
bool requireGoodQuality)
{
var cid = Guid.NewGuid().ToString();
var req = new WaitForAttributeRequest(
cid, _instanceName, name, targetValueEncoded, predicate, timeout, DateTimeOffset.UtcNow,
requireGoodQuality);
try
{
return await _instanceActor.Ask<WaitForAttributeResponse>(
req, timeout + _askTimeout, _scriptTimeoutToken);
}
catch (AskTimeoutException)
{
// Pathological: the InstanceActor's own scheduled timeout reply never
// arrived (e.g. the actor stopped/restarted under us). The helper's
// contract is "false on timeout, never throw" — so synthesize a
// non-matched/timed-out response rather than leaking the Ask exception.
// OperationCanceledException / TaskCanceledException from the
// script-deadline token are deliberately NOT caught here: they must
// propagate to abort the script (§4.3).
return new WaitForAttributeResponse(
cid, Matched: false, null, null, TimedOut: true);
}
}
/// <summary>
/// Sets an attribute value. For data-connected attributes the Instance Actor
/// forwards the write to the DCL, which writes the physical device; the
@@ -366,7 +553,14 @@ public class ScriptRuntimeContext
scriptName,
ScriptArgs.Normalize(parameters),
nextDepth,
correlationId);
correlationId,
// Audit Log #23 (M5.4 — ParentExecutionId tag-cascade): the child
// script run is a NEW execution spawned BY this run. Its parent is
// THIS run's own ExecutionId — NOT the inherited _parentExecutionId.
// So A → CallScript(B) yields B.ParentExecutionId == A.ExecutionId,
// building a true multi-level execution tree rather than flattening
// every nested call under the original inbound spawner.
ParentExecutionId: _executionId);
// Ask the Instance Actor, which routes to the appropriate Script Actor
var result = await _instanceActor.Ask<ScriptCallResult>(request, _askTimeout);
@@ -526,8 +720,14 @@ public class ScriptRuntimeContext
throw new InvalidOperationException(msg);
}
// Audit Log #23 (M5.4 — ParentExecutionId tag-cascade): the shared
// script runs inline, but is modelled as its OWN execution node — a
// child context mints a fresh ExecutionId parented to the caller's
// ExecutionId, so its audit rows chain under the calling run.
var childContext = _context.CreateChildContextForSharedScript(nextDepth);
return await _library.ExecuteAsync(
scriptName, _context, ScriptArgs.Normalize(parameters), cancellationToken);
scriptName, childContext, ScriptArgs.Normalize(parameters), cancellationToken);
}
}