merge: integrate WaitAsync/M5-audit (parallel session) with galaxy array-write + inbound-timeout fixes
This commit is contained in:
@@ -39,10 +39,12 @@ namespace ZB.MOM.WW.ScadaBridge.AuditLog.Central;
|
||||
public sealed class AuditCentralHealthSnapshot
|
||||
: IAuditCentralHealthSnapshot,
|
||||
ICentralAuditWriteFailureCounter,
|
||||
IAuditRedactionFailureCounter
|
||||
IAuditRedactionFailureCounter,
|
||||
IAuditInboundCeilingHitsCounter
|
||||
{
|
||||
private int _centralAuditWriteFailures;
|
||||
private int _auditRedactionFailure;
|
||||
private int _auditInboundCeilingHits;
|
||||
private readonly ConcurrentDictionary<string, bool> _stalled = new();
|
||||
|
||||
/// <inheritdoc/>
|
||||
@@ -53,6 +55,10 @@ public sealed class AuditCentralHealthSnapshot
|
||||
public int AuditRedactionFailure =>
|
||||
Interlocked.CompareExchange(ref _auditRedactionFailure, 0, 0);
|
||||
|
||||
/// <inheritdoc/>
|
||||
public int AuditInboundCeilingHits =>
|
||||
Interlocked.CompareExchange(ref _auditInboundCeilingHits, 0, 0);
|
||||
|
||||
/// <inheritdoc/>
|
||||
public IReadOnlyDictionary<string, bool> SiteAuditTelemetryStalled =>
|
||||
new Dictionary<string, bool>(_stalled);
|
||||
@@ -78,4 +84,8 @@ public sealed class AuditCentralHealthSnapshot
|
||||
/// <inheritdoc/>
|
||||
void IAuditRedactionFailureCounter.Increment() =>
|
||||
Interlocked.Increment(ref _auditRedactionFailure);
|
||||
|
||||
/// <inheritdoc/>
|
||||
void IAuditInboundCeilingHitsCounter.Increment() =>
|
||||
Interlocked.Increment(ref _auditInboundCeilingHits);
|
||||
}
|
||||
|
||||
@@ -167,6 +167,9 @@ public class AuditLogPurgeActor : ReceiveActor
|
||||
|
||||
if (boundaries.Count == 0)
|
||||
{
|
||||
// No whole-month partitions are eligible, but per-channel overrides may
|
||||
// still expire rows earlier than the global window — run them below.
|
||||
await RunPerChannelOverridesAsync(repository).ConfigureAwait(false);
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -202,6 +205,80 @@ public class AuditLogPurgeActor : ReceiveActor
|
||||
sw.ElapsedMilliseconds);
|
||||
}
|
||||
}
|
||||
|
||||
// M5.5 (T3): after the channel-blind global partition switch-out, apply any
|
||||
// per-channel retention overrides that are SHORTER than the global window via
|
||||
// a bounded, batched row DELETE on the same maintenance path. The global
|
||||
// switch-out has already dropped whole months older than RetentionDays; these
|
||||
// deletes only ever expire rows EARLIER than that, so they run last and are a
|
||||
// strict tightening.
|
||||
await RunPerChannelOverridesAsync(repository).ConfigureAwait(false);
|
||||
}
|
||||
|
||||
/// <summary>
/// M5.5 (T3): applies each per-channel retention override whose window is strictly
/// shorter than the global <see cref="AuditLogOptions.RetentionDays"/>. For every
/// such channel the repository is asked to purge that channel's rows against a
/// channel-specific UTC threshold, using the configured batch size. Each channel
/// runs inside its own try/catch so one failing channel does not abandon the
/// others on the same tick.
/// </summary>
/// <remarks>
/// A non-positive window is skipped with a warning instead of being executed: it
/// would place the threshold at or after "now" and therefore purge every row of
/// the channel. Options validation is expected to reject such values, so this is
/// defence in depth for a destructive operation.
/// </remarks>
/// <param name="repository">The repository resolved for this tick's DI scope.</param>
private async Task RunPerChannelOverridesAsync(IAuditLogRepository repository)
{
    var overrides = _auditOptions.PerChannelRetentionDays;
    if (overrides is null || overrides.Count == 0)
    {
        return;
    }

    var globalDays = _auditOptions.RetentionDays;

    foreach (var (channel, days) in overrides)
    {
        // Never turn a misconfigured 0/negative window into "delete everything".
        if (days <= 0)
        {
            _logger.LogWarning(
                "Ignoring per-channel retention override for channel {Channel}: " +
                "{Days}d is not a positive retention window.",
                channel,
                days);
            continue;
        }

        // Only act when the per-channel window is strictly shorter than the global
        // one. Equal/longer windows are already covered by the global partition
        // switch-out, so a row DELETE would be redundant work (and a longer window
        // is meaningless — the partition is dropped on the global schedule).
        if (days >= globalDays)
        {
            continue;
        }

        var channelThreshold = DateTime.UtcNow - TimeSpan.FromDays(days);
        var sw = Stopwatch.StartNew();
        try
        {
            var rowsDeleted = await repository
                .PurgeChannelOlderThanAsync(channel, channelThreshold, _purgeOptions.ChannelPurgeBatchSize)
                .ConfigureAwait(false);
            sw.Stop();

            // Stay quiet on no-op passes; only report channels that actually lost rows.
            if (rowsDeleted > 0)
            {
                _logger.LogInformation(
                    "Purged {RowsDeleted} AuditLog rows for channel {Channel} older than {Threshold:o} " +
                    "(per-channel override {Days}d < global {GlobalDays}d) in {DurationMs} ms.",
                    rowsDeleted,
                    channel,
                    channelThreshold,
                    days,
                    globalDays,
                    sw.ElapsedMilliseconds);
            }
        }
        catch (Exception ex)
        {
            // Isolate the failure to this channel; the loop moves on to the next one.
            sw.Stop();
            _logger.LogError(
                ex,
                "Failed to apply per-channel retention override for channel {Channel} " +
                "({Days}d); other channels continue. Elapsed {DurationMs} ms.",
                channel,
                days,
                sw.ElapsedMilliseconds);
        }
    }
}
|
||||
|
||||
/// <summary>Self-tick triggering a purge pass across all eligible partitions.</summary>
|
||||
|
||||
@@ -28,6 +28,24 @@ public sealed class AuditLogPurgeOptions
|
||||
/// <summary>Period of the purge tick in hours (default 24).</summary>
|
||||
public int IntervalHours { get; set; } = 24;
|
||||
|
||||
/// <summary>
/// M5.5 (T3): raw, as-configured batch size for the per-channel retention-override
/// row DELETE
/// (<see cref="ZB.MOM.WW.ScadaBridge.Commons.Interfaces.Repositories.IAuditLogRepository.PurgeChannelOlderThanAsync"/>).
/// Each <c>DELETE TOP (@batch)</c> caps the transaction-log and lock footprint
/// per statement; the repository loops batches until no rows remain. The default
/// of 5000 keeps individual deletes short on a busy central DB while still
/// draining a large backlog within a tick. Consumers should read the clamped
/// <see cref="ChannelPurgeBatchSize"/> rather than this value.
/// </summary>
public int ChannelPurgeBatchSizeConfigured { get; set; } = 5000;

/// <summary>
/// Effective per-channel purge batch size: the configured value, raised to 1 when
/// it is <c>0</c> or negative so a misconfiguration cannot make the repository's
/// DELETE loop spin or throw.
/// </summary>
public int ChannelPurgeBatchSize
{
    get
    {
        var configured = ChannelPurgeBatchSizeConfigured;
        return configured >= 1 ? configured : 1;
    }
}
|
||||
|
||||
/// <summary>
|
||||
/// Test-only override for finer control over the tick cadence than
|
||||
/// whole-hour resolution allows. When non-null, takes precedence over
|
||||
|
||||
@@ -50,6 +50,17 @@ public interface IAuditCentralHealthSnapshot
|
||||
/// </summary>
|
||||
int AuditRedactionFailure { get; }
|
||||
|
||||
/// <summary>
|
||||
/// Count of inbound request/response body truncations at the
|
||||
/// <see cref="ZB.MOM.WW.ScadaBridge.AuditLog.Configuration.AuditLogOptions.InboundMaxBytes"/>
|
||||
/// ceiling since process start. Incremented by
|
||||
/// <see cref="ZB.MOM.WW.ScadaBridge.InboundAPI.Middleware.AuditWriteMiddleware"/>
|
||||
/// whenever either the request or response body exceeds the cap and is
|
||||
/// truncated in the audit copy. A sustained non-zero count can indicate
|
||||
/// callers sending unexpectedly large bodies.
|
||||
/// </summary>
|
||||
int AuditInboundCeilingHits { get; }
|
||||
|
||||
/// <summary>
|
||||
/// Per-site latched stalled state: <c>true</c> when the
|
||||
/// <see cref="SiteAuditReconciliationActor"/> has observed two
|
||||
|
||||
@@ -0,0 +1,24 @@
|
||||
namespace ZB.MOM.WW.ScadaBridge.AuditLog.Central;
|
||||
|
||||
/// <summary>
|
||||
/// Audit Log (#23) M5.3 (T7) counter sink incremented by
|
||||
/// <see cref="ZB.MOM.WW.ScadaBridge.InboundAPI.Middleware.AuditWriteMiddleware"/>
|
||||
/// whenever an inbound request or response body is truncated at the
|
||||
/// <see cref="ZB.MOM.WW.ScadaBridge.AuditLog.Configuration.AuditLogOptions.InboundMaxBytes"/>
|
||||
/// ceiling. Mirrors the <see cref="ICentralAuditWriteFailureCounter"/> shape:
|
||||
/// one-method, NoOp default, must-never-abort-the-user-facing-action invariant.
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// A ceiling hit is a normal operational event (the caller sent a large
|
||||
/// body) rather than a failure, but surfacing a cumulative count lets
|
||||
/// operators detect over-size callers early. The
|
||||
/// <see cref="AuditCentralHealthSnapshot"/> production implementation
|
||||
/// accumulates the count via an <c>Interlocked</c> field alongside
|
||||
/// <see cref="ICentralAuditWriteFailureCounter"/> and
|
||||
/// <see cref="ZB.MOM.WW.ScadaBridge.AuditLog.Payload.IAuditRedactionFailureCounter"/>.
|
||||
/// </remarks>
|
||||
public interface IAuditInboundCeilingHitsCounter
|
||||
{
|
||||
/// <summary>Increment the inbound body-ceiling hit counter by one.</summary>
|
||||
void Increment();
|
||||
}
|
||||
@@ -0,0 +1,13 @@
|
||||
namespace ZB.MOM.WW.ScadaBridge.AuditLog.Central;
|
||||
|
||||
/// <summary>
|
||||
/// Default <see cref="IAuditInboundCeilingHitsCounter"/> binding used when
|
||||
/// the central health snapshot is not wired (e.g. site composition roots,
|
||||
/// test harnesses that have no health dashboard). All increments are silently
|
||||
/// dropped — correct for environments that have no audit KPI surface.
|
||||
/// </summary>
|
||||
public sealed class NoOpAuditInboundCeilingHitsCounter : IAuditInboundCeilingHitsCounter
|
||||
{
|
||||
/// <inheritdoc/>
|
||||
public void Increment() { }
|
||||
}
|
||||
@@ -37,6 +37,33 @@ public sealed class AuditLogOptions
|
||||
/// <summary>Central retention window in days (default 365, range [30, 3650]).</summary>
|
||||
public int RetentionDays { get; set; } = 365;
|
||||
|
||||
/// <summary>
|
||||
/// M5.5 (T3) per-channel retention overrides, keyed by the canonical channel name
|
||||
/// (the <see cref="AuditChannel"/> enum name — e.g. <c>ApiOutbound</c>,
|
||||
/// <c>DbOutbound</c>, <c>Notification</c>, <c>ApiInbound</c>). The value is a
|
||||
/// retention window in days that MUST be SHORTER than or equal to the global
|
||||
/// <see cref="RetentionDays"/>.
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// <para>
|
||||
/// The global <see cref="RetentionDays"/> window is enforced by month-partition
|
||||
/// switch-out, which is channel-blind: it can only drop a whole month once every
|
||||
/// row in it is older than the global window. A per-channel override therefore
|
||||
/// can only ever expire rows EARLIER than the global purge would — never later
|
||||
/// (a longer per-channel window is meaningless because the partition switch-out
|
||||
/// would already have dropped the month). Overrides shorter than the global window
|
||||
/// are honoured by the purge actor as a bounded, batched row DELETE on the
|
||||
/// maintenance path (see <c>AuditLogPurgeActor</c>); the append-only writer/ingest
|
||||
/// role is unaffected.
|
||||
/// </para>
|
||||
/// <para>
|
||||
/// Each value is validated to be in <c>[30, RetentionDays]</c> by
|
||||
/// <c>AuditLogOptionsValidator</c>; keys that are not recognized
|
||||
/// <see cref="AuditChannel"/> names are rejected.
|
||||
/// </para>
|
||||
/// </remarks>
|
||||
public Dictionary<string, int> PerChannelRetentionDays { get; set; } = new();
|
||||
|
||||
/// <summary>
|
||||
/// Per-body byte ceiling applied to <see cref="AuditEvent.RequestSummary"/> and
|
||||
/// <see cref="AuditEvent.ResponseSummary"/> for <see cref="AuditChannel.ApiInbound"/> rows
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
using ZB.MOM.WW.Configuration;
|
||||
using ZB.MOM.WW.ScadaBridge.Commons.Types.Enums;
|
||||
|
||||
namespace ZB.MOM.WW.ScadaBridge.AuditLog.Configuration;
|
||||
|
||||
@@ -52,5 +53,27 @@ public sealed class AuditLogOptionsValidator : OptionsValidatorBase<AuditLogOpti
|
||||
!(options.InboundMaxBytes < MinInboundMaxBytes || options.InboundMaxBytes > MaxInboundMaxBytes),
|
||||
$"AuditLog:{nameof(AuditLogOptions.InboundMaxBytes)} ({options.InboundMaxBytes}) " +
|
||||
$"must be in [{MinInboundMaxBytes}, {MaxInboundMaxBytes}] bytes.");
|
||||
|
||||
// M5.5 (T3): per-channel retention overrides. Each entry must be keyed by a
// recognized AuditChannel name and carry a window in [MinRetentionDays,
// RetentionDays] — i.e. SHORTER than or equal to the global window. A longer
// per-channel window is meaningless under month-partition switch-out (governed
// by the global window), so it is rejected rather than silently ignored.
//
// Keys are compared against the AuditChannel member NAMES exactly (ordinal,
// case-sensitive). Enum.TryParse is deliberately not used here: it also accepts
// numeric strings ("7"), comma-separated lists ("ApiOutbound, DbOutbound") and
// whitespace-padded names, none of which is a channel name.
//
// A null dictionary is treated as "no overrides".
if (options.PerChannelRetentionDays is { } perChannelOverrides)
{
    var channelNames = Enum.GetNames<AuditChannel>();
    var validKeys = string.Join(", ", channelNames);

    foreach (var (channelKey, days) in perChannelOverrides)
    {
        builder.RequireThat(
            Array.IndexOf(channelNames, channelKey) >= 0,
            $"AuditLog:{nameof(AuditLogOptions.PerChannelRetentionDays)} key '{channelKey}' " +
            $"is not a recognized channel name. Valid keys: {validKeys}.");

        // Valid when days is within [MinRetentionDays, RetentionDays] inclusive.
        // The lower bound matches the global RetentionDays floor; the upper bound
        // is the configured global window (longer is meaningless — see above).
        builder.RequireThat(
            days >= MinRetentionDays && days <= options.RetentionDays,
            $"AuditLog:{nameof(AuditLogOptions.PerChannelRetentionDays)}['{channelKey}'] ({days}) " +
            $"must be in [{MinRetentionDays}, {nameof(AuditLogOptions.RetentionDays)}={options.RetentionDays}] days " +
            "— a per-channel window must be shorter than or equal to the global retention window.");
    }
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -25,4 +25,15 @@ public sealed class PerTargetRedactionOverride
|
||||
/// rows.
|
||||
/// </summary>
|
||||
public string? RedactSqlParamsMatching { get; set; }
|
||||
|
||||
/// <summary>
|
||||
/// When <c>true</c>, the inbound API audit row for this target records
|
||||
/// request/response headers and metadata (status, duration, actor, etc.)
|
||||
/// but the request and response body strings are omitted
|
||||
/// (<c>RequestSummary</c> / <c>ResponseSummary</c> are left null). The
|
||||
/// audit row itself is always emitted — only the body content is suppressed.
|
||||
/// <c>false</c> (the default for this non-nullable flag) means body capture
|
||||
/// proceeds normally up to <see cref="AuditLogOptions.InboundMaxBytes"/>.
|
||||
/// </summary>
|
||||
public bool SkipBodyCapture { get; set; }
|
||||
}
|
||||
|
||||
@@ -200,6 +200,13 @@ public static class ServiceCollectionExtensions
|
||||
// surface on the central dashboard.
|
||||
services.TryAddSingleton<ICentralAuditWriteFailureCounter, NoOpCentralAuditWriteFailureCounter>();
|
||||
|
||||
// M5.3 (T7): inbound body-ceiling hit counter — NoOp default for
|
||||
// site/test roots. AddAuditLogCentralMaintenance replaces this binding
|
||||
// with the AuditCentralHealthSnapshot implementation so ceiling-hit
|
||||
// counts surface on the central dashboard alongside write-failure and
|
||||
// redaction-failure counters.
|
||||
services.TryAddSingleton<IAuditInboundCeilingHitsCounter, NoOpAuditInboundCeilingHitsCounter>();
|
||||
|
||||
// M4 Bundle B: central direct-write audit writer used by
|
||||
// NotificationOutboxActor (Bundle B) and Inbound API (Bundle C/D) to
|
||||
// emit AuditLog rows that originate ON central, not via site telemetry.
|
||||
@@ -383,6 +390,12 @@ public static class ServiceCollectionExtensions
|
||||
// HealthMetricsAuditRedactionFailureCounter shape one-for-one.
|
||||
services.Replace(ServiceDescriptor.Singleton<IAuditRedactionFailureCounter,
|
||||
CentralAuditRedactionFailureCounter>());
|
||||
// M5.3 (T7): replace the NoOp IAuditInboundCeilingHitsCounter with the
|
||||
// AuditCentralHealthSnapshot so ceiling-hit counts surface on the
|
||||
// central dashboard. Same singleton-forward pattern as
|
||||
// ICentralAuditWriteFailureCounter above.
|
||||
services.Replace(ServiceDescriptor.Singleton<IAuditInboundCeilingHitsCounter>(
|
||||
sp => sp.GetRequiredService<AuditCentralHealthSnapshot>()));
|
||||
|
||||
return services;
|
||||
}
|
||||
|
||||
@@ -0,0 +1,113 @@
|
||||
using System.Text;
|
||||
using System.Text.Json;
|
||||
|
||||
namespace ZB.MOM.WW.ScadaBridge.CLI.Commands;
|
||||
|
||||
/// <summary>
|
||||
/// Arguments for an <c>audit backfill-source-node</c> invocation.
|
||||
/// </summary>
|
||||
public sealed class AuditBackfillSourceNodeArgs
|
||||
{
|
||||
/// <summary>
|
||||
/// Value written into <c>SourceNode</c> for NULL rows (default <c>"unknown"</c>).
|
||||
/// </summary>
|
||||
public string Sentinel { get; set; } = "unknown";
|
||||
|
||||
/// <summary>
|
||||
/// Only rows with <c>OccurredAtUtc</c> strictly before this UTC datetime are
|
||||
/// eligible. Required — must be an ISO-8601 UTC datetime.
|
||||
/// </summary>
|
||||
public string Before { get; set; } = string.Empty;
|
||||
|
||||
/// <summary>
|
||||
/// Maximum rows updated per batch (default 5000). Caps the per-transaction
|
||||
/// log footprint; the loop repeats until no rows remain.
|
||||
/// </summary>
|
||||
public int BatchSize { get; set; } = 5000;
|
||||
}
|
||||
|
||||
/// <summary>
/// Pure helpers for the <c>audit backfill-source-node</c> subcommand (Audit Log
/// #23 M5.6 T5). Builds the request body, POSTs to
/// <c>/api/audit/backfill-source-node</c>, and renders the result. Kept separate
/// from the command wiring so each piece is unit-testable without standing up the
/// command tree.
/// </summary>
public static class AuditBackfillHelpers
{
    /// <summary>
    /// Builds the JSON request body for <c>POST /api/audit/backfill-source-node</c>.
    /// </summary>
    /// <param name="args">The backfill arguments.</param>
    /// <returns>
    /// A compact JSON object with the <c>sentinel</c>, <c>before</c> and
    /// <c>batchSize</c> members.
    /// </returns>
    public static string BuildRequestBody(AuditBackfillSourceNodeArgs args)
    {
        var obj = new
        {
            sentinel = args.Sentinel,
            before = args.Before,
            batchSize = args.BatchSize,
        };
        return JsonSerializer.Serialize(obj);
    }

    /// <summary>
    /// Executes the backfill: POSTs <c>/api/audit/backfill-source-node</c> and
    /// prints the result. Returns the process exit code (0 = success,
    /// 1 = error, 2 = authorization failure).
    /// </summary>
    /// <param name="client">The management HTTP client.</param>
    /// <param name="args">The backfill arguments.</param>
    /// <param name="output">The output writer for results.</param>
    /// <returns>A task that resolves to the process exit code.</returns>
    public static async Task<int> RunBackfillAsync(
        ManagementHttpClient client,
        AuditBackfillSourceNodeArgs args,
        TextWriter output)
    {
        var body = BuildRequestBody(args);
        var response = await client.SendPostAsync(
            "api/audit/backfill-source-node", body, TimeSpan.FromMinutes(10));

        if (response.JsonData == null)
        {
            OutputFormatter.WriteError(
                response.Error ?? "Backfill request failed.", response.ErrorCode ?? "ERROR");
            return CommandHelpers.IsAuthorizationFailure(response) ? 2 : 1;
        }

        // The server reported success. If the body is not the expected JSON object,
        // show it verbatim rather than failing a backfill that already ran.
        if (!TryWriteSummary(response.JsonData, args, output))
        {
            output.WriteLine(response.JsonData);
        }

        output.Flush();
        return 0;
    }

    /// <summary>
    /// Parses the success body and writes the human-readable summary. Returns
    /// <c>false</c> without writing anything when the body is not a JSON object or
    /// <c>rowsUpdated</c> is present but not a 64-bit integer.
    /// </summary>
    private static bool TryWriteSummary(string json, AuditBackfillSourceNodeArgs args, TextWriter output)
    {
        try
        {
            using var doc = JsonDocument.Parse(json);
            var root = doc.RootElement;

            // TryGetProperty throws InvalidOperationException on non-object roots.
            if (root.ValueKind != JsonValueKind.Object)
            {
                return false;
            }

            var rowsUpdated = 0L;
            if (root.TryGetProperty("rowsUpdated", out var rows)
                && (rows.ValueKind != JsonValueKind.Number || !rows.TryGetInt64(out rowsUpdated)))
            {
                return false;
            }

            var sentinel = ReadStringOrDefault(root, "sentinel", args.Sentinel);
            var before = ReadStringOrDefault(root, "before", args.Before);

            output.WriteLine("SourceNode backfill complete.");
            output.WriteLine($"  rows updated : {rowsUpdated}");
            output.WriteLine($"  sentinel     : {sentinel}");
            output.WriteLine($"  before       : {before}");
            return true;
        }
        catch (JsonException)
        {
            // Server returned success but a non-JSON body — not expected.
            return false;
        }
    }

    /// <summary>
    /// Returns the named string property, or <paramref name="fallback"/> when the
    /// property is missing, JSON null, or not a string.
    /// </summary>
    private static string ReadStringOrDefault(JsonElement root, string name, string fallback)
    {
        if (root.TryGetProperty(name, out var value) && value.ValueKind == JsonValueKind.String)
        {
            return value.GetString() ?? fallback;
        }

        return fallback;
    }
}
|
||||
@@ -6,13 +6,15 @@ namespace ZB.MOM.WW.ScadaBridge.CLI.Commands;
|
||||
/// <summary>
|
||||
/// The <c>scadabridge audit</c> command group (Audit Log #23 M8). Provides read access to
|
||||
/// the centralized append-only Audit Log via the Bundle B REST endpoints
|
||||
/// (<c>GET /api/audit/query</c>, <c>GET /api/audit/export</c>), plus a v1 no-op
|
||||
/// <c>verify-chain</c> placeholder for the deferred hash-chain tamper-evidence feature.
|
||||
/// (<c>GET /api/audit/query</c>, <c>GET /api/audit/export</c>,
|
||||
/// <c>GET /api/audit/tree</c>), plus a v1 no-op <c>verify-chain</c> placeholder
|
||||
/// for the deferred hash-chain tamper-evidence feature.
|
||||
/// </summary>
|
||||
public static class AuditCommands
|
||||
{
|
||||
/// <summary>
|
||||
/// Builds the <c>audit</c> command group with query, export, and verify-chain sub-commands.
|
||||
/// Builds the <c>audit</c> command group with query, export, tree, and verify-chain
|
||||
/// sub-commands.
|
||||
/// </summary>
|
||||
/// <param name="urlOption">Global <c>--url</c> option for the management API endpoint.</param>
|
||||
/// <param name="formatOption">Global <c>--format</c> option for output format.</param>
|
||||
@@ -25,7 +27,9 @@ public static class AuditCommands
|
||||
|
||||
command.Add(BuildQuery(urlOption, formatOption, usernameOption, passwordOption));
|
||||
command.Add(BuildExport(urlOption, formatOption, usernameOption, passwordOption));
|
||||
command.Add(BuildTree(urlOption, formatOption, usernameOption, passwordOption));
|
||||
command.Add(BuildVerifyChain(urlOption, formatOption, usernameOption, passwordOption));
|
||||
command.Add(BuildBackfillSourceNode(urlOption, formatOption, usernameOption, passwordOption));
|
||||
|
||||
return command;
|
||||
}
|
||||
@@ -224,6 +228,44 @@ public static class AuditCommands
|
||||
return cmd;
|
||||
}
|
||||
|
||||
// Builds the `audit tree` sub-command: validates --execution-id as a GUID on the
// client, then delegates the lookup and rendering to AuditTreeHelpers.RunTreeAsync.
private static Command BuildTree(Option<string> urlOption, Option<string> formatOption, Option<string> usernameOption, Option<string> passwordOption)
{
    var executionIdOption = new Option<string>("--execution-id");
    executionIdOption.Description = "Execution ID (GUID) to look up — may be any node in the chain";
    executionIdOption.Required = true;

    var treeCommand = new Command("tree");
    treeCommand.Description = "Display the full execution-chain tree for an audit execution";
    treeCommand.Add(executionIdOption);

    treeCommand.SetAction(async (ParseResult parseResult) =>
    {
        // Connection problems are reported before any argument validation.
        var connection = AuditCommandHelpers.ResolveConnection(parseResult, urlOption, usernameOption, passwordOption);
        if (connection.Error != null)
        {
            OutputFormatter.WriteError(connection.Error, connection.ErrorCode!);
            return 1;
        }

        var executionIdText = parseResult.GetValue(executionIdOption);
        var isGuid = Guid.TryParse(executionIdText, out var executionId);
        if (!isGuid)
        {
            var message = $"Invalid execution ID '{executionIdText}'. Expected a GUID (e.g. 11111111-1111-1111-1111-111111111111).";
            OutputFormatter.WriteError(message, "INVALID_ARGUMENT");
            return 1;
        }

        var outputFormat = AuditCommandHelpers.ResolveFormat(parseResult, formatOption);

        using var client = new ManagementHttpClient(connection.Url!, connection.Username!, connection.Password!);
        var exitCode = await AuditTreeHelpers.RunTreeAsync(client, executionId, outputFormat, Console.Out);
        return exitCode;
    });

    return treeCommand;
}
|
||||
|
||||
private static Command BuildVerifyChain(Option<string> urlOption, Option<string> formatOption, Option<string> usernameOption, Option<string> passwordOption)
|
||||
{
|
||||
var monthOption = new Option<string>("--month") { Description = "Month to verify (YYYY-MM)", Required = true };
|
||||
@@ -247,4 +289,76 @@ public static class AuditCommands
|
||||
});
|
||||
return cmd;
|
||||
}
|
||||
|
||||
/// <summary>
/// Builds the <c>audit backfill-source-node</c> sub-command (Audit Log #23 M5.6 T5).
/// Sets <c>SourceNode</c> on historical pre-feature rows whose <c>SourceNode IS NULL</c>
/// and <c>OccurredAtUtc</c> is older than <c>--before</c>, in batches. Admin-only.
/// </summary>
/// <remarks>
/// <c>--sentinel</c>, <c>--before</c> and <c>--batch</c> are checked locally for
/// empty/non-positive values before any request is sent; the datetime format of
/// <c>--before</c> itself is left to the server to validate.
/// </remarks>
private static Command BuildBackfillSourceNode(Option<string> urlOption, Option<string> formatOption, Option<string> usernameOption, Option<string> passwordOption)
{
    var sentinelOption = new Option<string>("--sentinel")
    {
        Description = "Value to write for pre-feature rows whose node-of-origin is unknown (default: unknown)",
    };
    sentinelOption.DefaultValueFactory = _ => "unknown";

    var beforeOption = new Option<string>("--before")
    {
        Description = "ISO-8601 UTC datetime; only rows older than this date are eligible (required)",
        Required = true,
    };

    var batchOption = new Option<int>("--batch")
    {
        Description = "Max rows updated per batch (default: 5000)",
    };
    batchOption.DefaultValueFactory = _ => 5000;

    var cmd = new Command("backfill-source-node")
    {
        Description = "Set SourceNode to a sentinel value on pre-feature rows where it is NULL (admin-only, maintenance path)",
    };
    cmd.Add(sentinelOption);
    cmd.Add(beforeOption);
    cmd.Add(batchOption);

    cmd.SetAction(async (ParseResult result) =>
    {
        var connection = AuditCommandHelpers.ResolveConnection(result, urlOption, usernameOption, passwordOption);
        if (connection.Error != null)
        {
            OutputFormatter.WriteError(connection.Error, connection.ErrorCode!);
            return 1;
        }

        var sentinel = result.GetValue(sentinelOption) ?? "unknown";
        var before = result.GetValue(beforeOption);
        var batch = result.GetValue(batchOption);

        if (string.IsNullOrWhiteSpace(sentinel))
        {
            OutputFormatter.WriteError("--sentinel must be a non-empty string.", "INVALID_ARGUMENT");
            return 1;
        }

        // "Required" only guarantees the option was supplied; `--before ""` still
        // gets through, so reject blank values here instead of starting a
        // long-running maintenance request with them.
        if (string.IsNullOrWhiteSpace(before))
        {
            OutputFormatter.WriteError("--before must be a non-empty ISO-8601 UTC datetime.", "INVALID_ARGUMENT");
            return 1;
        }

        if (batch <= 0)
        {
            OutputFormatter.WriteError("--batch must be > 0.", "INVALID_ARGUMENT");
            return 1;
        }

        var args = new AuditBackfillSourceNodeArgs
        {
            Sentinel = sentinel,
            Before = before,
            BatchSize = batch,
        };

        using var client = new ManagementHttpClient(connection.Url!, connection.Username!, connection.Password!);
        return await AuditBackfillHelpers.RunBackfillAsync(client, args, Console.Out);
    });

    return cmd;
}
|
||||
}
|
||||
|
||||
@@ -0,0 +1,208 @@
|
||||
using System.Text;
|
||||
using System.Text.Json;
|
||||
|
||||
namespace ZB.MOM.WW.ScadaBridge.CLI.Commands;
|
||||
|
||||
/// <summary>
|
||||
/// Arguments for an <c>audit tree</c> invocation.
|
||||
/// </summary>
|
||||
public sealed class AuditTreeArgs
|
||||
{
|
||||
/// <summary>
|
||||
/// The execution ID (GUID) to look up. May be any node in the chain — the
|
||||
/// server walks to the root and returns the full tree.
|
||||
/// </summary>
|
||||
public string ExecutionId { get; set; } = string.Empty;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Represents one execution node as returned by <c>GET /api/audit/tree</c>.
|
||||
/// Property names match the server's camelCase JSON serialisation of
|
||||
/// <c>ExecutionTreeNode</c>.
|
||||
/// </summary>
|
||||
internal sealed class AuditTreeNodeDto
|
||||
{
|
||||
public Guid ExecutionId { get; init; }
|
||||
public Guid? ParentExecutionId { get; init; }
|
||||
public int RowCount { get; init; }
|
||||
public string[] Channels { get; init; } = Array.Empty<string>();
|
||||
public string[] Statuses { get; init; } = Array.Empty<string>();
|
||||
public string? SourceSiteId { get; init; }
|
||||
public string? SourceInstanceId { get; init; }
|
||||
public DateTime? FirstOccurredAtUtc { get; init; }
|
||||
public DateTime? LastOccurredAtUtc { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Pure helpers for the <c>audit tree</c> subcommand: builds the query string,
|
||||
/// calls <c>GET /api/audit/tree</c>, and renders the result as either an
|
||||
/// indented ASCII tree (table format) or raw JSON. Kept separate from the
|
||||
/// command wiring so each piece is unit-testable without standing up the
|
||||
/// command tree.
|
||||
/// </summary>
|
||||
public static class AuditTreeHelpers
|
||||
{
|
||||
private static readonly JsonSerializerOptions JsonReadOptions = new()
|
||||
{
|
||||
PropertyNameCaseInsensitive = true,
|
||||
};
|
||||
|
||||
private static readonly JsonSerializerOptions JsonWriteOptions = new()
|
||||
{
|
||||
PropertyNamingPolicy = JsonNamingPolicy.CamelCase,
|
||||
WriteIndented = true,
|
||||
};
|
||||
|
||||
/// <summary>
/// Produces the relative request path for <c>GET /api/audit/tree</c>, carrying the
/// execution ID as the <c>executionId</c> query parameter in hyphenated ("D") form.
/// </summary>
/// <param name="executionId">The execution ID GUID.</param>
/// <returns>A relative path + query string ready to append to the base URL.</returns>
public static string BuildUrl(Guid executionId)
{
    const string endpoint = "api/audit/tree?executionId=";
    return endpoint + executionId.ToString("D");
}
|
||||
|
||||
/// <summary>
/// Runs the tree lookup end to end: issues <c>GET /api/audit/tree</c> for the
/// given execution, then writes the returned nodes to <paramref name="output"/>
/// as JSON when <paramref name="format"/> is exactly <c>"json"</c> and as the
/// indented tree otherwise. Returns the process exit code (0 = success,
/// 1 = error, 2 = authorization failure).
/// </summary>
/// <param name="client">The management HTTP client.</param>
/// <param name="executionId">The execution ID to look up.</param>
/// <param name="format">"table" (default) or "json".</param>
/// <param name="output">The output writer for results.</param>
/// <returns>A task that resolves to the process exit code.</returns>
public static async Task<int> RunTreeAsync(
    ManagementHttpClient client,
    Guid executionId,
    string format,
    TextWriter output)
{
    var response = await client.SendGetAsync(BuildUrl(executionId), TimeSpan.FromSeconds(30));

    // A missing payload means the request failed; report it and pick the exit code.
    if (response.JsonData == null)
    {
        var message = response.Error ?? "Audit tree request failed.";
        var code = response.ErrorCode ?? "ERROR";
        OutputFormatter.WriteError(message, code);

        if (CommandHelpers.IsAuthorizationFailure(response))
        {
            return 2;
        }

        return 1;
    }

    var nodes = ParseNodes(response.JsonData);

    switch (format)
    {
        case "json":
            WriteJson(nodes, output);
            break;
        default:
            WriteTable(nodes, executionId, output);
            break;
    }

    output.Flush();
    return 0;
}
|
||||
|
||||
/// <summary>
/// Deserializes the server's JSON array into <see cref="AuditTreeNodeDto"/>
/// instances using case-insensitive property matching.
/// </summary>
/// <param name="json">The raw JSON response body.</param>
/// <returns>
/// The deserialized nodes; an empty array when the payload is JSON <c>null</c> or
/// cannot be parsed.
/// </returns>
internal static AuditTreeNodeDto[] ParseNodes(string json)
{
    AuditTreeNodeDto[]? parsed;
    try
    {
        parsed = JsonSerializer.Deserialize<AuditTreeNodeDto[]>(json, JsonReadOptions);
    }
    catch (JsonException)
    {
        // Malformed payloads are deliberately reported as "no nodes".
        parsed = null;
    }

    return parsed ?? Array.Empty<AuditTreeNodeDto>();
}
|
||||
|
||||
/// <summary>
/// Serializes <paramref name="nodes"/> with the indented, camelCase write options
/// and writes the result to <paramref name="output"/> followed by a newline.
/// </summary>
internal static void WriteJson(AuditTreeNodeDto[] nodes, TextWriter output)
{
    var rendered = JsonSerializer.Serialize(nodes, JsonWriteOptions);
    output.WriteLine(rendered);
}
|
||||
|
||||
/// <summary>
/// Renders the nodes as an indented ASCII tree. Roots are printed in input order,
/// each followed depth-first by its descendants; every child is indented two
/// spaces per depth level and the queried/entry-point node is marked with
/// <c> [*]</c>. A node counts as a root when its <c>ParentExecutionId</c> is null
/// or refers to an execution that is not part of <paramref name="nodes"/>.
/// </summary>
/// <param name="nodes">The nodes returned by the server.</param>
/// <param name="queriedExecutionId">The execution ID the user asked for.</param>
/// <param name="output">The writer that receives the rendered tree.</param>
internal static void WriteTable(
    AuditTreeNodeDto[] nodes,
    Guid queriedExecutionId,
    TextWriter output)
{
    if (nodes.Length == 0)
    {
        output.WriteLine("(no execution tree found)");
        return;
    }

    // Build a parent → children lookup (keyed by non-null parent Guid).
    // Nodes without a parent are roots and never appear as values here.
    var childrenOf = new Dictionary<Guid, List<AuditTreeNodeDto>>();
    foreach (var node in nodes)
    {
        if (node.ParentExecutionId is not { } parentId)
        {
            continue;
        }

        // Single lookup per child instead of ContainsKey + indexer + indexer.
        if (!childrenOf.TryGetValue(parentId, out var siblings))
        {
            siblings = new List<AuditTreeNodeDto>();
            childrenOf[parentId] = siblings;
        }

        siblings.Add(node);
    }

    // Identify roots: nodes whose ParentExecutionId is null, or whose parent
    // is not present in the node set (stub-root case).
    var nodeIds = new HashSet<Guid>(nodes.Select(n => n.ExecutionId));
    var roots = nodes
        .Where(n => n.ParentExecutionId == null || !nodeIds.Contains(n.ParentExecutionId.Value))
        .ToList();

    // Render depth-first into a buffer, then emit it in one write.
    var sb = new StringBuilder();
    foreach (var root in roots)
    {
        RenderNode(root, depth: 0, childrenOf, queriedExecutionId, sb);
    }

    output.Write(sb.ToString());
}
|
||||
|
||||
/// <summary>
/// Appends one line describing <paramref name="node"/> to <paramref name="sb"/>, then
/// recurses into the node's children at the next depth level.
/// </summary>
/// <param name="node">The node to render.</param>
/// <param name="depth">Nesting level; each level adds two spaces of indentation.</param>
/// <param name="childrenOf">Lookup from a parent execution id to its child nodes.</param>
/// <param name="queriedExecutionId">The execution id the user asked about; its line is tagged with <c> [*]</c>.</param>
/// <param name="sb">The builder that accumulates the rendered lines.</param>
private static void RenderNode(
    AuditTreeNodeDto node,
    int depth,
    Dictionary<Guid, List<AuditTreeNodeDto>> childrenOf,
    Guid queriedExecutionId,
    StringBuilder sb)
{
    var indent = new string(' ', depth * 2);
    var marker = node.ExecutionId == queriedExecutionId ? " [*]" : string.Empty;

    // Null and empty collections both render as "-". The null case is tolerated
    // defensively in case the deserializer leaves the array unset — TODO confirm
    // against the DTO's defaults.
    var channels = node.Channels is { Length: > 0 } channelList ? string.Join(",", channelList) : "-";
    var statuses = node.Statuses is { Length: > 0 } statusList ? string.Join(",", statusList) : "-";
    var site = node.SourceSiteId ?? "-";
    var instance = node.SourceInstanceId ?? "-";

    // Format with the invariant culture: under the current culture the ':' specifier
    // and the calendar can vary, which would break the fixed timestamp layout.
    var first = node.FirstOccurredAtUtc is { } firstAt
        ? firstAt.ToString("yyyy-MM-ddTHH:mm:ssZ", System.Globalization.CultureInfo.InvariantCulture)
        : "-";

    sb.AppendLine(
        $"{indent}{node.ExecutionId:D}{marker}  rows={node.RowCount}  channels=[{channels}]  statuses=[{statuses}]  site={site}  instance={instance}  first={first}");

    if (childrenOf.TryGetValue(node.ExecutionId, out var children))
    {
        foreach (var child in children)
        {
            RenderNode(child, depth + 1, childrenOf, queriedExecutionId, sb);
        }
    }
}
|
||||
}
|
||||
@@ -142,6 +142,60 @@ public class ManagementHttpClient : IDisposable
|
||||
return new ManagementResponse((int)httpResponse.StatusCode, null, error, code);
|
||||
}
|
||||
|
||||
/// <summary>
/// Issues a plain HTTP <c>POST</c> against a REST endpoint (e.g. the audit
/// maintenance endpoints) with a JSON body and returns the response. Unlike
/// <see cref="SendCommandAsync"/>, this does not wrap the call in the
/// <c>POST /management</c> command envelope — these are plain REST resources.
/// Authentication (HTTP Basic) and the base address are shared.
/// </summary>
/// <remarks>
/// Failures are reported through the returned <see cref="ManagementResponse"/>:
/// a cancellation of the timeout token yields status 504 / <c>TIMEOUT</c>, a transport
/// failure yields status 0 / <c>CONNECTION_FAILED</c>, and a non-success HTTP status
/// yields that status with the <c>error</c> / <c>code</c> fields of the JSON error body
/// (or the raw body when it cannot be parsed).
/// </remarks>
/// <param name="relativePath">Path relative to the base URL.</param>
/// <param name="body">The JSON body to send, or <c>null</c> to send an empty JSON object (<c>{}</c>).</param>
/// <param name="timeout">The request timeout; covers both sending the request and reading the response body.</param>
/// <returns>A management response containing status and data.</returns>
public async Task<ManagementResponse> SendPostAsync(string relativePath, string? body, TimeSpan timeout)
{
    using var cts = new CancellationTokenSource(timeout);
    using var content = new StringContent(body ?? "{}", Encoding.UTF8, "application/json");

    int statusCode;
    bool succeeded;
    string responseBody;
    try
    {
        // The response is disposed as soon as status and body have been captured.
        using var httpResponse = await _httpClient.PostAsync(relativePath, content, cts.Token);
        statusCode = (int)httpResponse.StatusCode;
        succeeded = httpResponse.IsSuccessStatusCode;

        // The body is read inside the try so that a cancellation or transport fault
        // surfacing here maps to the same responses as one raised while sending.
        responseBody = await httpResponse.Content.ReadAsStringAsync(cts.Token);
    }
    catch (OperationCanceledException)
    {
        // Also covers TaskCanceledException, which derives from OperationCanceledException.
        return new ManagementResponse(504, null, "Request timed out.", "TIMEOUT");
    }
    catch (HttpRequestException ex)
    {
        return new ManagementResponse(0, null, $"Connection failed: {ex.Message}", "CONNECTION_FAILED");
    }

    if (succeeded)
    {
        return new ManagementResponse(statusCode, responseBody, null, null);
    }

    string? error = null;
    string? code = null;
    try
    {
        using var doc = JsonDocument.Parse(responseBody);
        error = doc.RootElement.TryGetProperty("error", out var e) ? e.GetString() : responseBody;
        code = doc.RootElement.TryGetProperty("code", out var c) ? c.GetString() : null;
    }
    catch
    {
        // Best-effort: an error body that is not the expected JSON object is returned verbatim.
        error = responseBody;
    }

    return new ManagementResponse(statusCode, null, error, code);
}
|
||||
|
||||
/// <summary>
|
||||
/// Issues a plain HTTP <c>GET</c> and returns the raw <see cref="HttpResponseMessage"/>
|
||||
/// so the caller can stream the response body without buffering it in memory — used
|
||||
|
||||
@@ -1269,15 +1269,18 @@ script-trust-boundary action: outbound API calls (sync + cached), outbound DB
|
||||
operations (sync + cached), notifications, and inbound API calls. This is distinct
|
||||
from the configuration-change audit trail exposed by [`audit-config`](#audit-config--configuration-change-audit-log).
|
||||
|
||||
The subcommands map directly onto the `GET /api/audit/query` and
|
||||
`GET /api/audit/export` management endpoints. Filters and the result columns mirror
|
||||
the Central UI **Audit** page, so a CLI query and a UI query with the same filters
|
||||
return the same rows — CLI ↔ UI filter parity is intentional.
|
||||
The subcommands map directly onto the `GET /api/audit/query`,
|
||||
`GET /api/audit/export`, `GET /api/audit/tree`, and
|
||||
`POST /api/audit/backfill-source-node` management endpoints. Filters and the
|
||||
result columns mirror the Central UI **Audit** page, so a CLI query and a UI
|
||||
query with the same filters return the same rows — CLI ↔ UI filter parity is
|
||||
intentional.
|
||||
|
||||
**Permissions.** Querying requires the `OperationalAudit` permission (roles `Admin`,
|
||||
`Audit`, or `AuditReadOnly`). Exporting requires the stricter `AuditExport` permission
|
||||
(roles `Admin` or `Audit`) — read access does *not* imply export access. A request
|
||||
without the required role returns exit code `2`.
|
||||
**Permissions.** Querying and tree traversal require the `OperationalAudit`
|
||||
permission (roles `Admin`, `Audit`, or `AuditReadOnly`). Exporting requires the
|
||||
stricter `AuditExport` permission (roles `Admin` or `Audit`) — read access does
|
||||
*not* imply export access. The `backfill-source-node` maintenance command requires
|
||||
the `Admin` role. A request without the required role returns exit code `2`.
|
||||
|
||||
#### `audit query`
|
||||
|
||||
@@ -1342,6 +1345,46 @@ scadabridge --url <url> audit export --since <time> --until <time> --format <fmt
|
||||
> Implemented` — Parquet archival is deferred to v1.x (see `Component-AuditLog.md`).
|
||||
> Use `csv` or `jsonl`.
|
||||
|
||||
#### `audit tree` (M5.3 T8)
|
||||
|
||||
Display the full execution-chain tree for a given execution ID. The server walks
|
||||
`ParentExecutionId` to find the root, then traverses downward to collect all
|
||||
reachable executions in the chain.
|
||||
|
||||
```sh
|
||||
scadabridge --url <url> audit tree --execution-id <guid> [--format table|json]
|
||||
```
|
||||
|
||||
| Option | Required | Default | Description |
|
||||
|--------|----------|---------|-------------|
|
||||
| `--execution-id` | yes | — | Any `ExecutionId` in the chain (root or child) |
|
||||
| `--format` | no | `json` | Output format: `json` (structured tree) or `table` (indented tree) |
|
||||
|
||||
The `--execution-id` can be any node in the chain — the server resolves the root
|
||||
automatically. With `--format table` the tree is printed as an indented text
|
||||
representation. With `--format json` (the default) a structured JSON tree is
|
||||
returned, suitable for scripting. Backed by `GET /api/audit/tree?executionId=<guid>`.
|
||||
Requires `OperationalAudit` permission.
|
||||
|
||||
#### `audit backfill-source-node` (M5.6 T5)
|
||||
|
||||
Set `SourceNode` to a sentinel value on pre-feature rows where `SourceNode IS NULL`
|
||||
and `OccurredAtUtc` is older than `--before`. Admin-only maintenance command.
|
||||
|
||||
```sh
|
||||
scadabridge --url <url> audit backfill-source-node --before <ISO-8601-UTC> [--sentinel <value>] [--batch <n>]
|
||||
```
|
||||
|
||||
| Option | Required | Default | Description |
|
||||
|--------|----------|---------|-------------|
|
||||
| `--before` | yes | — | ISO-8601 UTC datetime; only rows older than this date are eligible |
|
||||
| `--sentinel` | no | `unknown` | Value to write (must be non-empty) |
|
||||
| `--batch` | no | `5000` | Max rows updated per batch; controls transaction size |
|
||||
|
||||
The command is idempotent — running it multiple times converges (only rows where
|
||||
`SourceNode IS NULL` are eligible; already-set rows are untouched). Backed by
|
||||
`POST /api/audit/backfill-source-node`. Requires `Admin` role.
|
||||
|
||||
#### `audit verify-chain`
|
||||
|
||||
Verify the audit log hash chain for a given month.
|
||||
@@ -1354,11 +1397,11 @@ scadabridge --url <url> audit verify-chain --month <YYYY-MM>
|
||||
|--------|----------|---------|-------------|
|
||||
| `--month` | yes | — | Month to verify, `YYYY-MM` (e.g. `2026-05`) |
|
||||
|
||||
> **v1 no-op.** Hash-chain tamper-evidence is not enabled in this release. The
|
||||
> subcommand validates the `--month` argument and prints a notice pointing at the
|
||||
> v1.x roadmap in `Component-AuditLog.md`; it exits `0` without contacting the server.
|
||||
> The command exists now so scripts and operator habits do not need to change when
|
||||
> tamper-evidence ships.
|
||||
> **v1 no-op.** Hash-chain tamper-evidence is not enabled in this release (T1
|
||||
> deferred to v1.x). The subcommand validates the `--month` argument and prints a
|
||||
> notice pointing at the v1.x roadmap in `Component-AuditLog.md`; it exits `0`
|
||||
> without contacting the server. The command exists now so scripts and operator
|
||||
> habits do not need to change when tamper-evidence ships.
|
||||
|
||||
---
|
||||
|
||||
|
||||
@@ -58,3 +58,31 @@
|
||||
{
|
||||
<div class="text-muted small mb-3">Site Call KPIs unavailable: @ErrorMessage</div>
|
||||
}
|
||||
@* ── Per-node stuck/parked sub-table (T6: M5.2 per-node stuck-count KPIs) ── *@
|
||||
@if (HasNodeBreakdown)
|
||||
{
|
||||
<div class="mb-3">
|
||||
<div class="d-flex justify-content-between align-items-center mb-1">
|
||||
<small class="text-muted">By node</small>
|
||||
</div>
|
||||
<table class="table table-sm table-borderless mb-0 site-call-kpi-node-table">
|
||||
<thead class="table-light">
|
||||
<tr>
|
||||
<th class="small py-1">Node</th>
|
||||
<th class="text-end small py-1">Stuck</th>
|
||||
<th class="text-end small py-1">Parked</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
@foreach (var n in PerNodeSnapshots!)
|
||||
{
|
||||
<tr @key="n.SourceNode">
|
||||
<td class="small py-1"><code>@n.SourceNode</code></td>
|
||||
<td class="text-end font-monospace small py-1 @(n.StuckCount > 0 ? "text-warning" : "")">@n.StuckCount</td>
|
||||
<td class="text-end font-monospace small py-1 @(n.ParkedCount > 0 ? "text-danger" : "")">@n.ParkedCount</td>
|
||||
</tr>
|
||||
}
|
||||
</tbody>
|
||||
</table>
|
||||
</div>
|
||||
}
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
using Microsoft.AspNetCore.Components;
|
||||
using ZB.MOM.WW.ScadaBridge.Commons.Messages.Audit;
|
||||
using ZB.MOM.WW.ScadaBridge.Commons.Types.Audit;
|
||||
|
||||
namespace ZB.MOM.WW.ScadaBridge.CentralUI.Components.Health;
|
||||
|
||||
@@ -59,6 +60,24 @@ public partial class SiteCallKpiTiles
|
||||
/// </summary>
|
||||
[Parameter] public string? ErrorMessage { get; set; }
|
||||
|
||||
/// <summary>
|
||||
/// Optional per-node KPI breakdown (T6: M5.2 per-node stuck-count KPIs).
|
||||
/// When non-null and non-empty, a compact node-level stuck/parked sub-table
|
||||
/// is rendered below the main tiles. <c>null</c> means the parent has not
|
||||
/// loaded it yet or has opted out — the sub-table is suppressed entirely.
|
||||
/// </summary>
|
||||
[Parameter] public IReadOnlyList<SiteCallNodeKpiSnapshot>? PerNodeSnapshots { get; set; }
|
||||
|
||||
/// <summary>
|
||||
/// True when <see cref="PerNodeSnapshots"/> is a successful query result.
|
||||
/// Used to suppress the sub-table on a load failure.
|
||||
/// </summary>
|
||||
[Parameter] public bool PerNodeAvailable { get; set; }
|
||||
|
||||
/// <summary>Whether the per-node sub-table has data to render.</summary>
|
||||
internal bool HasNodeBreakdown =>
|
||||
PerNodeAvailable && PerNodeSnapshots is { Count: > 0 };
|
||||
|
||||
// ── Buffered tile ───────────────────────────────────────────────────────
|
||||
|
||||
private string BufferedDisplay =>
|
||||
|
||||
@@ -9,6 +9,7 @@
|
||||
@using ZB.MOM.WW.ScadaBridge.HealthMonitoring
|
||||
@using ZB.MOM.WW.ScadaBridge.Commons.Messages.Notification
|
||||
@using ZB.MOM.WW.ScadaBridge.Commons.Messages.Audit
|
||||
@using ZB.MOM.WW.ScadaBridge.Commons.Types.Audit
|
||||
@using ZB.MOM.WW.ScadaBridge.Communication
|
||||
@implements IDisposable
|
||||
@inject ICentralHealthAggregator HealthAggregator
|
||||
@@ -65,7 +66,9 @@
|
||||
(buffered / stuck / parked). Refreshed alongside the site states. *@
|
||||
<SiteCallKpiTiles Snapshot="@_siteCallKpi"
|
||||
IsAvailable="@_siteCallKpiAvailable"
|
||||
ErrorMessage="@_siteCallKpiError" />
|
||||
ErrorMessage="@_siteCallKpiError"
|
||||
PerNodeSnapshots="@_siteCallNodeKpis"
|
||||
PerNodeAvailable="@_siteCallNodeKpiAvailable" />
|
||||
|
||||
@* Audit Log (#23) M7 Bundle E — three KPI tiles for the Audit channel
|
||||
(volume / error rate / backlog). Refreshed alongside the site states. *@
|
||||
@@ -378,6 +381,12 @@
|
||||
private bool _siteCallKpiAvailable;
|
||||
private string? _siteCallKpiError;
|
||||
|
||||
// Per-node Site Call KPI breakdown (T6: M5.2 per-node stuck-count KPIs).
|
||||
// Passed to SiteCallKpiTiles as an optional sub-table.
|
||||
private IReadOnlyList<SiteCallNodeKpiSnapshot> _siteCallNodeKpis =
|
||||
Array.Empty<SiteCallNodeKpiSnapshot>();
|
||||
private bool _siteCallNodeKpiAvailable;
|
||||
|
||||
private static bool SiteHasActiveErrors(SiteHealthState state)
|
||||
{
|
||||
var report = state.LatestReport;
|
||||
@@ -415,7 +424,7 @@
|
||||
{
|
||||
_siteStates = HealthAggregator.GetAllSiteStates();
|
||||
await LoadOutboxKpis();
|
||||
await LoadSiteCallKpis();
|
||||
await Task.WhenAll(LoadSiteCallKpis(), LoadSiteCallNodeKpis());
|
||||
await LoadAuditKpis();
|
||||
}
|
||||
|
||||
@@ -474,6 +483,30 @@
|
||||
}
|
||||
}
|
||||
|
||||
// Loads the per-node site-call KPI breakdown (T6: M5.2). Best-effort: a failed
// response or any exception only clears the availability flag, which hides the
// per-node sub-table instead of degrading the rest of the dashboard.
private async Task LoadSiteCallNodeKpis()
{
    var available = false;
    try
    {
        var request = new PerNodeSiteCallKpiRequest(Guid.NewGuid().ToString("N"));
        var reply = await CommunicationService.GetPerNodeSiteCallKpisAsync(request);
        if (reply.Success)
        {
            _siteCallNodeKpis = reply.Nodes;
            available = true;
        }
    }
    catch
    {
        // Deliberately swallowed — see the method comment.
        available = false;
    }

    _siteCallNodeKpiAvailable = available;
}
|
||||
|
||||
// Tiles show the numeric KPI when available, or an em dash when the outbox
|
||||
// KPI query failed — matching how the page renders other unavailable data.
|
||||
private string OutboxTileValue(int value) =>
|
||||
|
||||
+73
-2
@@ -69,6 +69,51 @@
|
||||
</div>
|
||||
}
|
||||
|
||||
@* ── Per-node breakdown (T6: additive) ── *@
|
||||
<h5 class="mb-2">Per-node breakdown</h5>
|
||||
@if (_perNodeError != null)
|
||||
{
|
||||
<div class="alert alert-warning py-2">Per-node KPIs unavailable: @_perNodeError</div>
|
||||
}
|
||||
else if (_perNode.Count == 0)
|
||||
{
|
||||
<div class="card mb-3">
|
||||
<div class="card-body text-center text-muted py-3">
|
||||
<div class="small">No per-node activity (rows may have a null SourceNode).</div>
|
||||
</div>
|
||||
</div>
|
||||
}
|
||||
else
|
||||
{
|
||||
<div class="table-responsive mb-3">
|
||||
<table class="table table-sm table-hover align-middle">
|
||||
<thead class="table-light">
|
||||
<tr>
|
||||
<th>Node</th>
|
||||
<th class="text-end">Queue Depth</th>
|
||||
<th class="text-end">Stuck</th>
|
||||
<th class="text-end">Parked</th>
|
||||
<th class="text-end">Delivered (last interval)</th>
|
||||
<th class="text-end">Oldest Pending Age</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
@foreach (var n in _perNode)
|
||||
{
|
||||
<tr @key="n.SourceNode" class="@(n.StuckCount > 0 ? "table-warning" : "")">
|
||||
<td><code>@n.SourceNode</code></td>
|
||||
<td class="text-end font-monospace">@n.QueueDepth</td>
|
||||
<td class="text-end font-monospace @(n.StuckCount > 0 ? "text-warning" : "")">@n.StuckCount</td>
|
||||
<td class="text-end font-monospace @(n.ParkedCount > 0 ? "text-danger" : "")">@n.ParkedCount</td>
|
||||
<td class="text-end font-monospace text-success">@n.DeliveredLastInterval</td>
|
||||
<td class="text-end font-monospace">@FormatAge(n.OldestPendingAge)</td>
|
||||
</tr>
|
||||
}
|
||||
</tbody>
|
||||
</table>
|
||||
</div>
|
||||
}
|
||||
|
||||
@* ── Per-site breakdown ── *@
|
||||
<h5 class="mb-2">Per-site breakdown</h5>
|
||||
@if (_perSiteError != null)
|
||||
@@ -124,6 +169,10 @@
|
||||
private IReadOnlyList<SiteNotificationKpiSnapshot> _perSite = Array.Empty<SiteNotificationKpiSnapshot>();
|
||||
private string? _perSiteError;
|
||||
|
||||
// ── Per-node (T6: M5.2 per-node stuck-count KPIs) ──
|
||||
private IReadOnlyList<NodeNotificationKpiSnapshot> _perNode = Array.Empty<NodeNotificationKpiSnapshot>();
|
||||
private string? _perNodeError;
|
||||
|
||||
private bool _loading;
|
||||
|
||||
protected override async Task OnInitializedAsync()
|
||||
@@ -144,9 +193,9 @@
|
||||
private async Task RefreshAll()
|
||||
{
|
||||
_loading = true;
|
||||
// Race-free despite both tasks mutating component fields: Blazor Server runs
|
||||
// Race-free despite all tasks mutating component fields: Blazor Server runs
|
||||
// every continuation on the circuit's single-threaded synchronization context.
|
||||
await Task.WhenAll(LoadGlobalKpis(), LoadPerSiteKpis());
|
||||
await Task.WhenAll(LoadGlobalKpis(), LoadPerSiteKpis(), LoadPerNodeKpis());
|
||||
_loading = false;
|
||||
}
|
||||
|
||||
@@ -194,6 +243,28 @@
|
||||
}
|
||||
}
|
||||
|
||||
// Loads the per-node notification KPI breakdown. On success the node list is replaced
// and the error cleared; on a failed response or an exception only the error message
// is set, leaving the previously loaded list untouched.
private async Task LoadPerNodeKpis()
{
    try
    {
        var request = new PerNodeNotificationKpiRequest(Guid.NewGuid().ToString("N"));
        var reply = await CommunicationService.GetPerNodeNotificationKpisAsync(request);

        if (!reply.Success)
        {
            _perNodeError = reply.ErrorMessage ?? "Per-node KPI query failed.";
            return;
        }

        _perNode = reply.Nodes;
        _perNodeError = null;
    }
    catch (Exception ex)
    {
        _perNodeError = $"Per-node KPI query failed: {ex.Message}";
    }
}
|
||||
|
||||
private string SiteName(string siteId) =>
|
||||
_sites.FirstOrDefault(s => s.SiteIdentifier == siteId)?.Name ?? siteId;
|
||||
|
||||
|
||||
@@ -87,6 +87,42 @@ public interface IAuditLogRepository
|
||||
/// <returns>A task that resolves to the approximate number of rows discarded by the partition switch.</returns>
|
||||
Task<long> SwitchOutPartitionAsync(DateTime monthBoundary, CancellationToken ct = default);
|
||||
|
||||
/// <summary>
|
||||
/// M5.5 (T3) per-channel retention override purge. Deletes <c>AuditLog</c> rows for a
|
||||
/// single <paramref name="channel"/> (matched against the canonical
|
||||
/// <c>Category</c> column — the bare channel name, e.g. <c>ApiOutbound</c>) whose
|
||||
/// <c>OccurredAtUtc</c> is strictly older than <paramref name="threshold"/>, in
|
||||
/// bounded batches of <paramref name="batchSize"/> rows, looping until no further
|
||||
/// rows match. Returns the total number of rows deleted across all batches.
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// <para>
|
||||
/// <b>Maintenance path — NOT the writer role.</b> The append-only invariant binds
|
||||
/// the <c>scadabridge_audit_writer</c> ingest role (INSERT + SELECT only). This row
|
||||
/// DELETE runs on the purge/maintenance connection, the same path that performs the
|
||||
/// global partition switch-out (also a destructive operation forbidden to the writer
|
||||
/// role). Per-channel overrides can only ever expire rows EARLIER than the global
|
||||
/// month-partition switch-out would — never later — so this is a strict tightening
|
||||
/// of the retention window, applied AFTER the global purge on the same tick.
|
||||
/// </para>
|
||||
/// <para>
|
||||
/// <b>Bounded + idempotent.</b> Each batch is a <c>DELETE TOP (@batch)</c> so the
|
||||
/// transaction log and lock footprint stay bounded regardless of backlog. Re-running
|
||||
/// the purge is a no-op once every eligible row is gone (the loop exits when a batch
|
||||
/// deletes zero rows), so a crash mid-loop is recoverable by simply running again.
|
||||
/// </para>
|
||||
/// </remarks>
|
||||
/// <param name="channel">Canonical channel name (the <c>Category</c> column value, e.g. <c>ApiOutbound</c>).</param>
|
||||
/// <param name="threshold">Rows with <c>OccurredAtUtc</c> strictly older than this UTC datetime are deleted.</param>
|
||||
/// <param name="batchSize">Maximum rows deleted per batch; must be > 0.</param>
|
||||
/// <param name="ct">Cancellation token.</param>
|
||||
/// <returns>A task that resolves to the total number of rows deleted across all batches.</returns>
|
||||
Task<long> PurgeChannelOlderThanAsync(
|
||||
string channel,
|
||||
DateTime threshold,
|
||||
int batchSize,
|
||||
CancellationToken ct = default);
|
||||
|
||||
/// <summary>
|
||||
/// Returns the set of <c>pf_AuditLog_Month</c> partition lower-bound
|
||||
/// boundaries whose partitions contain only rows with
|
||||
@@ -201,4 +237,59 @@ public interface IAuditLogRepository
|
||||
/// <param name="ct">Cancellation token.</param>
|
||||
/// <returns>A task that resolves to the distinct, non-null source node names in ascending order.</returns>
|
||||
Task<IReadOnlyList<string>> GetDistinctSourceNodesAsync(CancellationToken ct = default);
|
||||
|
||||
/// <summary>
|
||||
/// M5.6 (T5) one-time operational backfill: sets <c>SourceNode</c> to
|
||||
/// <paramref name="sentinel"/> on every row where <c>SourceNode IS NULL</c>
|
||||
/// and <c>OccurredAtUtc < <paramref name="before"/></c>, in bounded
|
||||
/// batches of <paramref name="batchSize"/> rows, looping until no further
|
||||
/// rows match. Returns the total number of rows updated across all batches.
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// <para>
|
||||
/// <b>Why a sentinel, not the real value.</b> <c>SourceNode</c> captures the
|
||||
/// physical cluster node on which an event was emitted. For pre-feature rows
|
||||
/// that were ingested before the column was stamped, the true node-of-origin
|
||||
/// is UNKNOWABLE — the original emitter is long gone and there is no
|
||||
/// retroactive way to determine it. Backfilling a configurable sentinel
|
||||
/// (default <c>"unknown"</c>) makes it explicit that these rows pre-date the
|
||||
/// feature rather than silently leaving them NULL (which the filter UI already
|
||||
/// treats as "unresolved" but which an operator might mistake for a bug).
|
||||
/// </para>
|
||||
/// <para>
|
||||
/// <b><c>ExecutionId</c> / <c>ParentExecutionId</c> cannot be backfilled.</b>
|
||||
/// These are PERSISTED COMPUTED columns derived from <c>DetailsJson</c>. The
|
||||
/// AuditLog append-only invariant forbids mutating <c>DetailsJson</c>, so
|
||||
/// the computed values for pre-feature rows remain NULL permanently. This is
|
||||
/// documented rather than coded — see the Ops Note in
|
||||
/// <c>Component-AuditLog.md § Ops Notes — Historical Null Columns</c>.
|
||||
/// </para>
|
||||
/// <para>
|
||||
/// <b>Maintenance path — NOT the writer role.</b> This UPDATE runs on the
|
||||
/// purge/maintenance connection (the same path as
|
||||
/// <see cref="SwitchOutPartitionAsync"/> and any per-channel purge), NOT the
|
||||
/// append-only <c>scadabridge_audit_writer</c> role. The CI guard
|
||||
/// (<c>AuditLogAppendOnlyGuardTests</c>) recognises the
|
||||
/// <c>// AUDIT-PURGE-ALLOWED</c> marker on the UPDATE line and forgives
|
||||
/// exactly this one sanctioned maintenance-path UPDATE; any other UPDATE
|
||||
/// against <c>AuditLog</c> still trips the guard.
|
||||
/// </para>
|
||||
/// <para>
|
||||
/// <b>Bounded + idempotent.</b> <c>UPDATE TOP (@batch)</c> caps the
|
||||
/// transaction-log and lock footprint per statement. The loop exits when a
|
||||
/// batch updates zero rows, so a crash mid-loop is recoverable by simply
|
||||
/// running again; re-running after completion is a no-op (no NULL rows
|
||||
/// remain for the given <paramref name="before"/> window).
|
||||
/// </para>
|
||||
/// </remarks>
|
||||
/// <param name="sentinel">Value to write into <c>SourceNode</c> for pre-feature rows (e.g. <c>"unknown"</c>).</param>
|
||||
/// <param name="before">Rows with <c>OccurredAtUtc</c> strictly older than this UTC datetime are eligible.</param>
|
||||
/// <param name="batchSize">Maximum rows updated per batch; must be > 0.</param>
|
||||
/// <param name="ct">Cancellation token.</param>
|
||||
/// <returns>A task that resolves to the total number of rows updated across all batches.</returns>
|
||||
Task<long> BackfillSourceNodeAsync(
|
||||
string sentinel,
|
||||
DateTime before,
|
||||
int batchSize,
|
||||
CancellationToken ct = default);
|
||||
}
|
||||
|
||||
+13
@@ -100,6 +100,19 @@ public interface INotificationOutboxRepository
|
||||
Task<IReadOnlyList<SiteNotificationKpiSnapshot>> ComputePerSiteKpisAsync(
|
||||
DateTimeOffset stuckCutoff, DateTimeOffset deliveredSince, CancellationToken cancellationToken = default);
|
||||
|
||||
/// <summary>
|
||||
/// Computes a point-in-time <see cref="NodeNotificationKpiSnapshot"/> per originating node.
|
||||
/// Nodes with no notification rows at all are omitted; rows with a <c>NULL</c>
|
||||
/// <c>SourceNode</c> are excluded. The stuck and delivered cutoffs are supplied by the
|
||||
/// caller; the current time used for <c>OldestPendingAge</c> is captured inside the method.
|
||||
/// </summary>
|
||||
/// <param name="stuckCutoff">The time threshold for marking notifications as stuck.</param>
|
||||
/// <param name="deliveredSince">The time threshold for counting delivered notifications.</param>
|
||||
/// <param name="cancellationToken">Cancellation token.</param>
|
||||
/// <returns>A list of per-node KPI snapshots, ordered by node name.</returns>
|
||||
Task<IReadOnlyList<NodeNotificationKpiSnapshot>> ComputePerNodeKpisAsync(
|
||||
DateTimeOffset stuckCutoff, DateTimeOffset deliveredSince, CancellationToken cancellationToken = default);
|
||||
|
||||
/// <summary>
|
||||
/// Persists pending changes tracked on the underlying context. Use this when staging
|
||||
/// multiple changes for a single commit; the individual mutating methods on this
|
||||
|
||||
@@ -107,4 +107,19 @@ public interface ISiteCallAuditRepository
|
||||
DateTime stuckCutoff,
|
||||
DateTime intervalSince,
|
||||
CancellationToken ct = default);
|
||||
|
||||
/// <summary>
|
||||
/// Computes a point-in-time <see cref="SiteCallNodeKpiSnapshot"/> per originating
|
||||
/// node. Nodes with no <c>SiteCalls</c> rows at all are omitted; rows with a
|
||||
/// <c>NULL</c> <c>SourceNode</c> are excluded. The stuck cutoff and interval
|
||||
/// bounds are interpreted as in <see cref="ComputeKpisAsync"/>.
|
||||
/// </summary>
|
||||
/// <param name="stuckCutoff">UTC threshold for classifying a row as stuck.</param>
|
||||
/// <param name="intervalSince">UTC start of the delivered/failed interval window.</param>
|
||||
/// <param name="ct">Cancellation token.</param>
|
||||
/// <returns>A task that resolves to a per-node KPI list; nodes with no rows are omitted.</returns>
|
||||
Task<IReadOnlyList<SiteCallNodeKpiSnapshot>> ComputePerNodeKpisAsync(
|
||||
DateTime stuckCutoff,
|
||||
DateTime intervalSince,
|
||||
CancellationToken ct = default);
|
||||
}
|
||||
|
||||
@@ -164,3 +164,24 @@ public sealed record PerSiteSiteCallKpiResponse(
|
||||
bool Success,
|
||||
string? ErrorMessage,
|
||||
IReadOnlyList<SiteCallSiteKpiSnapshot> Sites);
|
||||
|
||||
/// <summary>
|
||||
/// Site Calls UI -> Central: request for the per-node <c>SiteCalls</c>
|
||||
/// KPI breakdown. Mirrors <see cref="PerSiteSiteCallKpiRequest"/> but groups
|
||||
/// by <c>SourceNode</c> instead of <c>SourceSite</c>. Additive — does not
|
||||
/// change per-site behaviour.
|
||||
/// </summary>
|
||||
public sealed record PerNodeSiteCallKpiRequest(
|
||||
string CorrelationId);
|
||||
|
||||
/// <summary>
|
||||
/// Central -> Site Calls UI: per-node KPI breakdown for the Site Calls KPIs
|
||||
/// page. On a repository fault <see cref="Success"/> is <c>false</c>,
|
||||
/// <see cref="ErrorMessage"/> carries the cause, and <see cref="Nodes"/> is empty.
|
||||
/// Nodes with a <c>NULL</c> <c>SourceNode</c> are omitted.
|
||||
/// </summary>
|
||||
public sealed record PerNodeSiteCallKpiResponse(
|
||||
string CorrelationId,
|
||||
bool Success,
|
||||
string? ErrorMessage,
|
||||
IReadOnlyList<SiteCallNodeKpiSnapshot> Nodes);
|
||||
|
||||
@@ -83,3 +83,46 @@ public record RouteToSetAttributesResponse(
|
||||
bool Success,
|
||||
string? ErrorMessage,
|
||||
DateTimeOffset Timestamp);
|
||||
|
||||
/// <summary>
|
||||
/// Request to block until a remote instance attribute reaches a target value
|
||||
/// (spec §6 — <c>Route.To("inst").WaitForAttribute(name, targetValue, timeout)</c>).
|
||||
/// Value-equality ONLY across the wire: <see cref="TargetValueEncoded"/> carries the
|
||||
/// canonical <c>AttributeValueCodec</c>-encoded target; there is no predicate and no
|
||||
/// quality flag in the comparison. The site evaluates equality and either matches or
|
||||
/// times out.
|
||||
/// </summary>
|
||||
/// <param name="ParentExecutionId">
|
||||
/// Audit Log #23 (ParentExecutionId): mirrors <see cref="RouteToCallRequest.ParentExecutionId"/>.
|
||||
/// For an inbound-API-routed wait this is the inbound request's per-request execution id;
|
||||
/// future site-side audit emission for routed waits can stamp it as <c>ParentExecutionId</c>
|
||||
/// so the inbound→site execution-tree link survives the wait path. Additive trailing
|
||||
/// member — null for the Central UI sandbox path or for callers built before the field existed.
|
||||
/// </param>
|
||||
public record RouteToWaitForAttributeRequest(
|
||||
string CorrelationId,
|
||||
string InstanceUniqueName,
|
||||
string AttributeName,
|
||||
string? TargetValueEncoded,
|
||||
TimeSpan Timeout,
|
||||
DateTimeOffset Timestamp,
|
||||
Guid? ParentExecutionId = null);
|
||||
|
||||
/// <summary>
/// Reply to a routed attribute wait. The routing-level outcome (for example
/// instance-not-found) is reported through <see cref="Success"/> and
/// <see cref="ErrorMessage"/>; the outcome of the wait itself is reported through
/// <see cref="Matched"/>, <see cref="TimedOut"/>, <see cref="Value"/> and
/// <see cref="Quality"/>. With <see cref="Success"/> equal to <c>true</c>, exactly one of
/// <see cref="Matched"/>/<see cref="TimedOut"/> holds: <see cref="Matched"/> says the
/// attribute reached the target value (<see cref="Value"/>/<see cref="Quality"/> being
/// captured at the match), while <see cref="TimedOut"/> says the deadline elapsed first.
/// </summary>
public record RouteToWaitForAttributeResponse(
    string CorrelationId, bool Matched, object? Value, string? Quality,
    bool TimedOut, bool Success, string? ErrorMessage, DateTimeOffset Timestamp);
|
||||
|
||||
@@ -0,0 +1,82 @@
|
||||
namespace ZB.MOM.WW.ScadaBridge.Commons.Messages.Instance;
|
||||
|
||||
/// <summary>
/// Event-driven, timeout-bounded request to wait until an attribute reaches a value
/// (or any value accepted by a predicate). This is the protocol behind the
/// script-facing <c>Attributes.WaitAsync</c> helper.
///
/// <para>
/// <b>Site-local only.</b> <see cref="Predicate"/>, when supplied, is an in-process
/// delegate that cannot be serialized, so this message MUST stay inside one site node's
/// actor system (script execution → Instance Actor) and is never sent across the
/// ClusterClient / gRPC boundary. The value-equality form
/// (<see cref="TargetValueEncoded"/>) would serialize, but the routed/inbound variant is
/// deliberately out of scope here.
/// </para>
/// </summary>
/// <param name="CorrelationId">Per-wait correlation id; it keys both the waiter registry and the timeout self-message.</param>
/// <param name="InstanceName">Instance targeted by this wait.</param>
/// <param name="AttributeName">Attribute being watched — the accessor has already scope-resolved it.</param>
/// <param name="TargetValueEncoded">
/// Codec-encoded target value (<c>AttributeValueCodec.Encode(target)</c>). Matching
/// compares the codec-encoded form of the current value with this string. If this and
/// <see cref="Predicate"/> are both null, the wait matches on ANY change.
/// </param>
/// <param name="Predicate">
/// Site-local predicate evaluated against the raw (decoded) current value. Mutually
/// exclusive with <see cref="TargetValueEncoded"/> — it is null whenever the encoded
/// target is used.
/// </param>
/// <param name="Timeout">Duration to wait before the waiter self-evicts with a timeout reply.</param>
/// <param name="OccurredAtUtc">UTC instant at which the request was issued.</param>
/// <param name="RequireGoodQuality">
/// Quality-gated ("Good"-only) mode (spec §4.2). When <see langword="true"/>, a match
/// also requires the attribute quality to be exactly <c>"Good"</c>
/// (<see cref="System.StringComparison.Ordinal"/>): a value that reaches the target /
/// satisfies the predicate at Bad/Uncertain quality is NOT a match, and the waiter stays
/// pending until the test passes at Good quality (or the wait times out). The default is
/// <see langword="false"/> (quality-agnostic — only the value is tested). The member is
/// trailing and defaulted so existing positional constructions compile unchanged.
/// </param>
public record WaitForAttributeRequest(
    string CorrelationId, string InstanceName, string AttributeName,
    string? TargetValueEncoded, Func<object?, bool>? Predicate,
    TimeSpan Timeout, DateTimeOffset OccurredAtUtc,
    bool RequireGoodQuality = false);
|
||||
|
||||
/// <summary>
/// Answer to a <see cref="WaitForAttributeRequest"/>. On the happy paths exactly one of
/// <see cref="Matched"/> / <see cref="TimedOut"/> is set; on the failure paths
/// (per-instance waiter cap exceeded, or the match predicate threw)
/// <see cref="ErrorMessage"/> is populated instead.
/// </summary>
/// <param name="CorrelationId">The correlation id of the request, echoed back.</param>
/// <param name="Matched">True if the attribute reached the target/predicate before the timeout.</param>
/// <param name="Value">Value that matched (null on timeout / error).</param>
/// <param name="Quality">
/// Attribute quality at the moment of the match; <see langword="null"/> on the non-match
/// paths (timeout / error / cap-exceeded), following the nullable
/// <see cref="ErrorMessage"/> convention.
/// </param>
/// <param name="TimedOut">True if the timeout fired before any match.</param>
/// <param name="ErrorMessage">
/// Non-null only when the wait failed or was refused — the per-instance waiter cap was
/// exceeded, or the match predicate threw (<c>"Wait predicate threw: …"</c>).
/// </param>
public record WaitForAttributeResponse(
    string CorrelationId, bool Matched, object? Value, string? Quality,
    bool TimedOut, string? ErrorMessage = null);
|
||||
|
||||
/// <summary>
/// Self-message the Instance Actor schedules internally so that a waiter's timeout
/// fires. It is site-local and never crosses a cluster boundary.
/// </summary>
/// <param name="CorrelationId">Identifies the waiter whose timeout fired.</param>
public record WaitForAttributeTimeout(
    string CorrelationId);
|
||||
@@ -159,3 +159,23 @@ public record PerSiteNotificationKpiResponse(
|
||||
bool Success,
|
||||
string? ErrorMessage,
|
||||
IReadOnlyList<SiteNotificationKpiSnapshot> Sites);
|
||||
|
||||
/// <summary>
/// Outbox UI -> Central: asks for the notification outbox KPI breakdown per node.
/// It parallels <see cref="PerSiteNotificationKpiRequest"/>, the difference being that
/// grouping is by <c>SourceNode</c> rather than <c>SourceSiteId</c>. Additive — per-site
/// behaviour is left unchanged.
/// </summary>
public record PerNodeNotificationKpiRequest(string CorrelationId);
|
||||
|
||||
/// <summary>
/// Central -> Outbox UI: the per-node KPI breakdown shown on the Notification KPIs page.
/// If the repository faults, <see cref="Success"/> is <c>false</c>,
/// <see cref="ErrorMessage"/> holds the cause and <see cref="Nodes"/> is empty. Nodes
/// whose <c>SourceNode</c> is <c>NULL</c> are left out.
/// </summary>
public record PerNodeNotificationKpiResponse(
    string CorrelationId, bool Success, string? ErrorMessage,
    IReadOnlyList<NodeNotificationKpiSnapshot> Nodes);
|
||||
|
||||
@@ -0,0 +1,37 @@
|
||||
namespace ZB.MOM.WW.ScadaBridge.Commons.Types.Audit;
|
||||
|
||||
/// <summary>
/// <c>SiteCalls</c> metrics at a point in time, scoped to one originating node. This is
/// the per-node counterpart of <see cref="SiteCallSiteKpiSnapshot"/> and feeds the
/// per-node breakdown table on the Site Calls KPIs page. Mirrors
/// <see cref="ZB.MOM.WW.ScadaBridge.Commons.Types.Notifications.NodeNotificationKpiSnapshot"/>.
/// </summary>
/// <param name="SourceNode">
/// Identifier of the node the metrics are scoped to (e.g. <c>node-a</c>,
/// <c>node-b</c>). Rows whose <c>SourceNode</c> is <c>NULL</c> are omitted.
/// </param>
/// <param name="BufferedCount">Number of this node's non-terminal rows (<c>TerminalAtUtc IS NULL</c>).</param>
/// <param name="ParkedCount">Number of this node's rows whose status is <c>Parked</c>.</param>
/// <param name="FailedLastInterval">
/// Number of this node's <c>Failed</c> rows with a <c>TerminalAtUtc</c> at or after the
/// "since" timestamp.
/// </param>
/// <param name="DeliveredLastInterval">
/// Number of this node's <c>Delivered</c> rows with a <c>TerminalAtUtc</c> at or after
/// the "since" timestamp.
/// </param>
/// <param name="OldestPendingAge">
/// How old this node's oldest non-terminal row is, or <c>null</c> if it has none.
/// </param>
/// <param name="StuckCount">
/// Number of this node's non-terminal rows with a <c>CreatedAtUtc</c> older than the
/// stuck cutoff.
/// </param>
public sealed record SiteCallNodeKpiSnapshot(
    string SourceNode, int BufferedCount, int ParkedCount,
    int FailedLastInterval, int DeliveredLastInterval,
    TimeSpan? OldestPendingAge, int StuckCount);
|
||||
@@ -0,0 +1,30 @@
|
||||
namespace ZB.MOM.WW.ScadaBridge.Commons.Types.Notifications;
|
||||
|
||||
/// <summary>
/// Notification-outbox metrics at a point in time, scoped to one originating node.
/// This is the per-node counterpart of <see cref="SiteNotificationKpiSnapshot"/> and
/// feeds the per-node breakdown table on the Notification KPIs page.
/// </summary>
/// <param name="SourceNode">
/// Identifier of the node the metrics are scoped to (e.g. <c>node-a</c>,
/// <c>node-b</c>). Rows whose <c>SourceNode</c> is <c>NULL</c> are omitted.
/// </param>
/// <param name="QueueDepth">Number of this node's non-terminal rows (Pending + Retrying).</param>
/// <param name="StuckCount">
/// Number of this node's non-terminal rows with a <c>CreatedAt</c> older than the stuck cutoff.
/// </param>
/// <param name="ParkedCount">Number of this node's rows whose status is Parked.</param>
/// <param name="DeliveredLastInterval">
/// Number of this node's Delivered rows with a <c>DeliveredAt</c> at or after the
/// "delivered since" timestamp.
/// </param>
/// <param name="OldestPendingAge">
/// How old this node's oldest non-terminal row is, or <c>null</c> if it has none.
/// </param>
public record NodeNotificationKpiSnapshot(
    string SourceNode, int QueueDepth, int StuckCount,
    int ParkedCount, int DeliveredLastInterval,
    TimeSpan? OldestPendingAge);
|
||||
@@ -0,0 +1,21 @@
|
||||
namespace ZB.MOM.WW.ScadaBridge.Commons.Types;
|
||||
|
||||
/// <summary>
/// Full outcome of an <c>Attributes.WaitForAsync</c> wait (spec §3): a timeout-bounded
/// wait for an attribute to reach a value, satisfy a predicate, or change at all. The
/// <c>Attributes.WaitAsync</c> helpers expose only <see cref="Matched"/>, whereas
/// <c>WaitForAsync</c> hands back this struct so a script can additionally read the
/// matched <see cref="Value"/> and its <see cref="Quality"/>, and tell a genuine timeout
/// (<see cref="TimedOut"/>) apart from a non-match.
/// </summary>
/// <param name="Matched">
/// <see langword="true"/> if the attribute reached the target / satisfied the predicate
/// before the timeout (and, in quality-gated mode, did so at "Good" quality).
/// </param>
/// <param name="Value">Value that matched; <see langword="null"/> on timeout / error.</param>
/// <param name="Quality">
/// Attribute quality at the moment of the match; <see langword="null"/> on the non-match
/// paths (timeout / error / cap-exceeded).
/// </param>
/// <param name="TimedOut"><see langword="true"/> if the timeout fired before any match.</param>
public readonly record struct WaitResult(
    bool Matched,
    object? Value,
    string? Quality,
    bool TimedOut);
|
||||
@@ -144,6 +144,7 @@ public class SiteCommunicationActor : ReceiveActor, IWithTimers
|
||||
Receive<RouteToCallRequest>(msg => _deploymentManagerProxy.Forward(msg));
|
||||
Receive<RouteToGetAttributesRequest>(msg => _deploymentManagerProxy.Forward(msg));
|
||||
Receive<RouteToSetAttributesRequest>(msg => _deploymentManagerProxy.Forward(msg));
|
||||
Receive<RouteToWaitForAttributeRequest>(msg => _deploymentManagerProxy.Forward(msg));
|
||||
|
||||
// OPC UA Tag Browser (interactive design-time query) — forward to the
|
||||
// Deployment Manager singleton, which always lands on the active site
|
||||
|
||||
@@ -445,6 +445,25 @@ public class CommunicationService
|
||||
envelope, _options.IntegrationTimeout, cancellationToken);
|
||||
}
|
||||
|
||||
/// <summary>
/// Sends an inbound API wait-for-attribute request to the given site (spec §6).
/// </summary>
/// <param name="siteId">Identifier of the site to route to.</param>
/// <param name="request">The wait-for-attribute route request.</param>
/// <param name="cancellationToken">Cancellation token.</param>
/// <returns>The wait-for-attribute route response.</returns>
public async Task<RouteToWaitForAttributeResponse> RouteToWaitForAttributeAsync(
    string siteId,
    RouteToWaitForAttributeRequest request,
    CancellationToken cancellationToken = default)
{
    var siteEnvelope = new SiteEnvelope(siteId, request);

    // Unlike the other routes, which use the generic IntegrationTimeout, a wait may
    // legitimately hold the site for up to request.Timeout. The cluster Ask is
    // therefore bounded by the wait deadline, with the integration timeout added as
    // slack for the round trip.
    var waitDeadlineWithSlack = request.Timeout + _options.IntegrationTimeout;

    var siteActor = GetActor();
    var response = await siteActor.Ask<RouteToWaitForAttributeResponse>(
        siteEnvelope, waitDeadlineWithSlack, cancellationToken);
    return response;
}
|
||||
|
||||
// ── Notification Outbox (central-local actor — Asked directly, no SiteEnvelope) ──
|
||||
|
||||
/// <summary>
|
||||
@@ -525,6 +544,22 @@ public class CommunicationService
|
||||
request, _options.QueryTimeout, cancellationToken);
|
||||
}
|
||||
|
||||
/// <summary>
/// Retrieves notification outbox KPI metrics broken down per node. Rows are grouped
/// by <c>SourceNode</c> (e.g. <c>node-a</c>/<c>node-b</c>) and rows whose node is
/// <c>NULL</c> are omitted. Additive alongside
/// <see cref="GetPerSiteNotificationKpisAsync"/>.
/// </summary>
/// <param name="request">The per-node notification KPI request.</param>
/// <param name="cancellationToken">Cancellation token.</param>
/// <returns>The per-node notification KPI response.</returns>
public async Task<PerNodeNotificationKpiResponse> GetPerNodeNotificationKpisAsync(
    PerNodeNotificationKpiRequest request,
    CancellationToken cancellationToken = default)
{
    // Asked with the query timeout, like the neighbouring outbox KPI calls.
    var outbox = GetNotificationOutbox();
    var response = await outbox.Ask<PerNodeNotificationKpiResponse>(
        request, _options.QueryTimeout, cancellationToken);
    return response;
}
|
||||
|
||||
// ── Site Call Audit (central-local actor — Asked directly, no SiteEnvelope) ──
|
||||
|
||||
/// <summary>
|
||||
@@ -579,6 +614,21 @@ public class CommunicationService
|
||||
request, _options.QueryTimeout, cancellationToken);
|
||||
}
|
||||
|
||||
/// <summary>
/// Retrieves site call KPI metrics broken down per node. Rows are grouped by
/// <c>SourceNode</c> (e.g. <c>node-a</c>/<c>node-b</c>) and rows whose node is
/// <c>NULL</c> are omitted. Additive alongside <see cref="GetPerSiteSiteCallKpisAsync"/>.
/// </summary>
/// <param name="request">The per-node site call KPI request.</param>
/// <param name="cancellationToken">Cancellation token.</param>
/// <returns>The per-node site call KPI response.</returns>
public async Task<PerNodeSiteCallKpiResponse> GetPerNodeSiteCallKpisAsync(
    PerNodeSiteCallKpiRequest request,
    CancellationToken cancellationToken = default)
{
    // Asked with the query timeout, like the neighbouring site call KPI calls.
    var siteCallAudit = GetSiteCallAudit();
    var response = await siteCallAudit.Ask<PerNodeSiteCallKpiResponse>(
        request, _options.QueryTimeout, cancellationToken);
    return response;
}
|
||||
|
||||
/// <summary>
|
||||
/// Task 5 (#22): relays an operator Retry of a parked cached call to its
|
||||
/// owning site. The <c>SiteCallAuditActor</c> is Asked directly (it is
|
||||
|
||||
@@ -370,6 +370,99 @@ VALUES
|
||||
return rowsDeleted;
|
||||
}
|
||||
|
||||
/// <inheritdoc />
public async Task<long> PurgeChannelOlderThanAsync(
    string channel,
    DateTime threshold,
    int batchSize,
    CancellationToken ct = default)
{
    if (string.IsNullOrWhiteSpace(channel))
    {
        throw new ArgumentException("Channel must be a non-empty channel name.", nameof(channel));
    }

    if (batchSize <= 0)
    {
        throw new ArgumentOutOfRangeException(nameof(batchSize), batchSize, "Batch size must be > 0.");
    }

    var cutoffUtc = DateTime.SpecifyKind(threshold.ToUniversalTime(), DateTimeKind.Utc);

    // M5.5 (T3) per-channel retention override purge, executed on the
    // purge/maintenance path rather than under the append-only writer role (that
    // role holds INSERT + SELECT only — see the DENY UPDATE/DENY DELETE grants in
    // CollapseAuditLogToCanonical). The statement below is the only one in the
    // codebase that removes rows from the audit table.
    //
    // Category is the canonical channel-name column (e.g. 'ApiOutbound'). Action
    // stores "{channel}.{kind}", so it is NOT the column to compare a bare channel
    // name against.
    //
    // Bounded + idempotent: TOP (@batch) caps the log/lock footprint of each
    // statement, and batches repeat until one affects zero rows, so re-running
    // after a crash mid-loop simply resumes.
    //
    // The trailing AUDIT-PURGE-ALLOWED marker on the SQL line below is the single
    // narrow exemption the append-only CI guard (AuditLogAppendOnlyGuardTests)
    // recognizes; every other modifying statement aimed at the audit table still
    // trips the guard, so the marker must stay on that line.
    const string deleteBatchSql =
        "DELETE TOP (@batch) FROM dbo.AuditLog WHERE Category = @channel AND OccurredAtUtc < @threshold;"; // AUDIT-PURGE-ALLOWED: per-channel retention override (M5.5 T3), maintenance path

    var connection = _context.Database.GetDbConnection();

    // Only close the connection afterwards if this method was the one to open it.
    var ownsConnection = connection.State != System.Data.ConnectionState.Open;
    if (ownsConnection)
    {
        await connection.OpenAsync(ct).ConfigureAwait(false);
    }

    long purged = 0;
    try
    {
        int affected;
        do
        {
            ct.ThrowIfCancellationRequested();

            await using var batchCommand = connection.CreateCommand();
            batchCommand.CommandText = deleteBatchSql;
            BindParameter(batchCommand, "@batch", batchSize);
            BindParameter(batchCommand, "@channel", channel);
            BindParameter(batchCommand, "@threshold", cutoffUtc);

            affected = await batchCommand.ExecuteNonQueryAsync(ct).ConfigureAwait(false);
            if (affected > 0)
            {
                purged += affected;
            }
        }
        while (affected > 0);
    }
    finally
    {
        if (ownsConnection)
        {
            await connection.CloseAsync().ConfigureAwait(false);
        }
    }

    return purged;

    // Creates a named parameter on the command and assigns its value.
    static void BindParameter(System.Data.Common.DbCommand command, string name, object value)
    {
        var parameter = command.CreateParameter();
        parameter.ParameterName = name;
        parameter.Value = value;
        command.Parameters.Add(parameter);
    }
}
|
||||
|
||||
/// <inheritdoc />
|
||||
public async Task<IReadOnlyList<DateTime>> GetPartitionBoundariesOlderThanAsync(
|
||||
DateTime threshold,
|
||||
@@ -716,6 +809,102 @@ VALUES
|
||||
.ToListAsync(ct);
|
||||
}
|
||||
|
||||
/// <inheritdoc />
public async Task<long> BackfillSourceNodeAsync(
    string sentinel,
    DateTime before,
    int batchSize,
    CancellationToken ct = default)
{
    if (string.IsNullOrWhiteSpace(sentinel))
    {
        throw new ArgumentException("Sentinel must be a non-empty value.", nameof(sentinel));
    }

    if (batchSize <= 0)
    {
        throw new ArgumentOutOfRangeException(nameof(batchSize), batchSize, "Batch size must be > 0.");
    }

    var cutoffUtc = DateTime.SpecifyKind(before.ToUniversalTime(), DateTimeKind.Utc);

    // M5.6 (T5) SourceNode sentinel backfill — the ONE sanctioned in-place
    // modification of audit rows in the codebase. Only rows with
    // SourceNode IS NULL AND OccurredAtUtc < @before are touched: they pre-date the
    // M5.6 feature, so their node-of-origin is UNKNOWABLE, and the sentinel
    // (default "unknown") states that explicitly. ExecutionId/ParentExecutionId
    // are PERSISTED COMPUTED columns derived from DetailsJson; mutating DetailsJson
    // is forbidden under the append-only invariant, so those remain NULL on
    // pre-feature rows.
    //
    // Maintenance path (NOT the writer role): this runs on the same connection
    // used for SwitchOutPartitionAsync (partition-switch DDL), which requires a
    // role holding UPDATE — the append-only scadabridge_audit_writer role has only
    // INSERT + SELECT.
    //
    // Bounded + idempotent: TOP (@batch) caps the log/lock footprint of each
    // statement, and the loop ends once a batch changes 0 rows. Re-running after a
    // crash simply resumes where it left off.
    //
    // The trailing AUDIT-PURGE-ALLOWED marker on the SQL line below is the single
    // narrow exemption the append-only CI guard (AuditLogAppendOnlyGuardTests)
    // recognises for this kind of statement; any other one aimed at the audit
    // table still trips the guard, so the marker must stay on that line.
    const string updateBatchSql =
        "UPDATE TOP (@batch) dbo.AuditLog SET SourceNode = @sentinel WHERE SourceNode IS NULL AND OccurredAtUtc < @before;"; // AUDIT-PURGE-ALLOWED: SourceNode sentinel backfill (M5.6 T5), maintenance path

    var connection = _context.Database.GetDbConnection();

    // Only close the connection afterwards if this method was the one to open it.
    var ownsConnection = connection.State != System.Data.ConnectionState.Open;
    if (ownsConnection)
    {
        await connection.OpenAsync(ct).ConfigureAwait(false);
    }

    long backfilled = 0;
    try
    {
        int affected;
        do
        {
            ct.ThrowIfCancellationRequested();

            await using var batchCommand = connection.CreateCommand();
            batchCommand.CommandText = updateBatchSql;
            BindParameter(batchCommand, "@batch", batchSize);
            BindParameter(batchCommand, "@sentinel", sentinel);
            BindParameter(batchCommand, "@before", cutoffUtc);

            affected = await batchCommand.ExecuteNonQueryAsync(ct).ConfigureAwait(false);
            if (affected > 0)
            {
                backfilled += affected;
            }
        }
        while (affected > 0);
    }
    finally
    {
        if (ownsConnection)
        {
            await connection.CloseAsync().ConfigureAwait(false);
        }
    }

    return backfilled;

    // Creates a named parameter on the command and assigns its value.
    static void BindParameter(System.Data.Common.DbCommand command, string name, object value)
    {
        var parameter = command.CreateParameter();
        parameter.ParameterName = name;
        parameter.Value = value;
        command.Parameters.Add(parameter);
    }
}
|
||||
|
||||
/// <summary>
|
||||
/// Splits a <c>STRING_AGG</c> comma-joined value into a distinct, ordered
|
||||
/// list. A null/empty aggregate (a stub node with no rows) yields an empty
|
||||
|
||||
+73
@@ -300,6 +300,63 @@ VALUES
|
||||
: null)).ToList();
|
||||
}
|
||||
|
||||
/// <inheritdoc />
public async Task<IReadOnlyList<NodeNotificationKpiSnapshot>> ComputePerNodeKpisAsync(
    DateTimeOffset stuckCutoff, DateTimeOffset deliveredSince, CancellationToken cancellationToken = default)
{
    // Reference instant for the age calculation, taken before any query runs.
    var capturedAt = DateTimeOffset.UtcNow;

    // Every query filters out rows with a NULL SourceNode (legacy / unstamped):
    // a per-node KPI is only meaningful when the node identity is known.
    var depthByNode = await CountByNodeAsync(
        n => (n.Status == NotificationStatus.Pending || n.Status == NotificationStatus.Retrying)
             && n.SourceNode != null,
        cancellationToken);

    var stuckByNode = await CountByNodeAsync(
        n => (n.Status == NotificationStatus.Pending || n.Status == NotificationStatus.Retrying)
             && n.CreatedAt < stuckCutoff
             && n.SourceNode != null,
        cancellationToken);

    var parkedByNode = await CountByNodeAsync(
        n => n.Status == NotificationStatus.Parked && n.SourceNode != null,
        cancellationToken);

    var deliveredByNode = await CountByNodeAsync(
        n => n.Status == NotificationStatus.Delivered
             && n.DeliveredAt != null && n.DeliveredAt >= deliveredSince
             && n.SourceNode != null,
        cancellationToken);

    // Oldest non-terminal CreatedAt per node. As in ComputePerSiteKpisAsync the
    // minimum is reduced in memory, because the DateTimeOffset converter makes a
    // SQL Min awkward.
    var pendingRows = await _context.Notifications
        .Where(n => (n.Status == NotificationStatus.Pending
                     || n.Status == NotificationStatus.Retrying)
                    && n.SourceNode != null)
        .Select(n => new { n.SourceNode, n.CreatedAt })
        .ToListAsync(cancellationToken);

    var oldestByNode = pendingRows
        .GroupBy(row => row.SourceNode!)
        .ToDictionary(group => group.Key, group => group.Min(row => row.CreatedAt));

    // One snapshot per node seen by any count query, in ordinal name order.
    var orderedNodes = depthByNode.Keys
        .Concat(stuckByNode.Keys)
        .Concat(parkedByNode.Keys)
        .Concat(deliveredByNode.Keys)
        .Distinct()
        .OrderBy(name => name, StringComparer.Ordinal);

    var snapshots = new List<NodeNotificationKpiSnapshot>();
    foreach (var node in orderedNodes)
    {
        TimeSpan? oldestAge = oldestByNode.TryGetValue(node, out var createdAt)
            ? capturedAt - createdAt
            : null;

        snapshots.Add(new NodeNotificationKpiSnapshot(
            SourceNode: node,
            QueueDepth: depthByNode.GetValueOrDefault(node),
            StuckCount: stuckByNode.GetValueOrDefault(node),
            ParkedCount: parkedByNode.GetValueOrDefault(node),
            DeliveredLastInterval: deliveredByNode.GetValueOrDefault(node),
            OldestPendingAge: oldestAge));
    }

    return snapshots;
}
|
||||
|
||||
/// <summary>Counts notification rows matching <paramref name="predicate"/>, grouped by source site.</summary>
|
||||
private async Task<Dictionary<string, int>> CountBySiteAsync(
|
||||
System.Linq.Expressions.Expression<Func<Notification, bool>> predicate,
|
||||
@@ -312,6 +369,22 @@ VALUES
|
||||
.ToDictionaryAsync(x => x.Site, x => x.Count, cancellationToken);
|
||||
}
|
||||
|
||||
/// <summary>
/// Tallies the notification rows that satisfy <paramref name="predicate"/>, keyed by source node.
/// Rows with a null <c>SourceNode</c> should not be included; enforcing that guard is the
/// predicate's responsibility.
/// </summary>
private async Task<Dictionary<string, int>> CountByNodeAsync(
    System.Linq.Expressions.Expression<Func<Notification, bool>> predicate,
    CancellationToken cancellationToken)
{
    var countsPerNode = _context.Notifications
        .Where(predicate)
        .GroupBy(notification => notification.SourceNode!)
        .Select(group => new { Node = group.Key, Count = group.Count() });

    return await countsPerNode.ToDictionaryAsync(
        entry => entry.Node,
        entry => entry.Count,
        cancellationToken);
}
|
||||
|
||||
/// <inheritdoc />
public async Task<int> SaveChangesAsync(CancellationToken cancellationToken = default)
{
    // Delegates to the underlying context and returns its result unchanged.
    var result = await _context.SaveChangesAsync(cancellationToken);
    return result;
}
|
||||
|
||||
+71
@@ -324,6 +324,61 @@ ORDER BY CreatedAtUtc DESC, TrackedOperationId DESC;";
|
||||
StuckCount: stuck.GetValueOrDefault(site))).ToList();
|
||||
}
|
||||
|
||||
/// <inheritdoc />
public async Task<IReadOnlyList<SiteCallNodeKpiSnapshot>> ComputePerNodeKpisAsync(
    DateTime stuckCutoff, DateTime intervalSince, CancellationToken ct = default)
{
    // Reference instant for the age calculation, taken before any query runs.
    var capturedAt = DateTime.UtcNow;

    // Every predicate carries a SourceNode != null guard: per-node KPIs are only
    // meaningful when the node identity is known, and the guard keeps the
    // GROUP BY key non-null.
    var bufferedByNode = await CountByNodeAsync(
        s => s.TerminalAtUtc == null && s.SourceNode != null,
        ct);

    var parkedByNode = await CountByNodeAsync(
        s => s.Status == StatusParked && s.SourceNode != null,
        ct);

    var failedByNode = await CountByNodeAsync(
        s => s.Status == StatusFailed
             && s.TerminalAtUtc != null && s.TerminalAtUtc >= intervalSince
             && s.SourceNode != null,
        ct);

    var deliveredByNode = await CountByNodeAsync(
        s => s.Status == StatusDelivered
             && s.TerminalAtUtc != null && s.TerminalAtUtc >= intervalSince
             && s.SourceNode != null,
        ct);

    var stuckByNode = await CountByNodeAsync(
        s => s.TerminalAtUtc == null && s.CreatedAtUtc < stuckCutoff
             && s.SourceNode != null,
        ct);

    // Oldest non-terminal CreatedAtUtc per node, computed as a server-side GROUP BY MIN.
    var oldestRows = await _context.SiteCalls
        .Where(s => s.TerminalAtUtc == null && s.SourceNode != null)
        .GroupBy(s => s.SourceNode!)
        .Select(g => new { Node = g.Key, Oldest = g.Min(s => s.CreatedAtUtc) })
        .ToListAsync(ct);

    var oldestByNode = oldestRows.ToDictionary(row => row.Node, row => row.Oldest);

    // One snapshot per node seen by any count query, in ordinal name order.
    var orderedNodes = bufferedByNode.Keys
        .Concat(parkedByNode.Keys)
        .Concat(failedByNode.Keys)
        .Concat(deliveredByNode.Keys)
        .Concat(stuckByNode.Keys)
        .Distinct()
        .OrderBy(name => name, StringComparer.Ordinal);

    var snapshots = new List<SiteCallNodeKpiSnapshot>();
    foreach (var node in orderedNodes)
    {
        TimeSpan? oldestAge = oldestByNode.TryGetValue(node, out var createdAt)
            ? capturedAt - createdAt
            : null;

        snapshots.Add(new SiteCallNodeKpiSnapshot(
            SourceNode: node,
            BufferedCount: bufferedByNode.GetValueOrDefault(node),
            ParkedCount: parkedByNode.GetValueOrDefault(node),
            FailedLastInterval: failedByNode.GetValueOrDefault(node),
            DeliveredLastInterval: deliveredByNode.GetValueOrDefault(node),
            OldestPendingAge: oldestAge,
            StuckCount: stuckByNode.GetValueOrDefault(node)));
    }

    return snapshots;
}
|
||||
|
||||
/// <summary>Counts <c>SiteCalls</c> rows matching <paramref name="predicate"/>, grouped by source site.</summary>
|
||||
private async Task<Dictionary<string, int>> CountBySiteAsync(
|
||||
System.Linq.Expressions.Expression<Func<SiteCall, bool>> predicate,
|
||||
@@ -336,6 +391,22 @@ ORDER BY CreatedAtUtc DESC, TrackedOperationId DESC;";
|
||||
.ToDictionaryAsync(x => x.Site, x => x.Count, ct);
|
||||
}
|
||||
|
||||
/// <summary>
/// Tallies the <c>SiteCalls</c> rows that satisfy <paramref name="predicate"/>, keyed by source node.
/// Rows with a null <c>SourceNode</c> should not be included; enforcing that guard is the
/// predicate's responsibility.
/// </summary>
private async Task<Dictionary<string, int>> CountByNodeAsync(
    System.Linq.Expressions.Expression<Func<SiteCall, bool>> predicate,
    CancellationToken ct)
{
    var countsPerNode = _context.SiteCalls
        .Where(predicate)
        .GroupBy(call => call.SourceNode!)
        .Select(group => new { Node = group.Key, Count = group.Count() });

    return await countsPerNode.ToDictionaryAsync(
        entry => entry.Node,
        entry => entry.Count,
        ct);
}
|
||||
|
||||
private static int GetRankOrThrow(string status)
|
||||
{
|
||||
if (!StatusRank.TryGetValue(status, out var rank))
|
||||
|
||||
@@ -35,4 +35,9 @@ public sealed class CommunicationServiceInstanceRouter : IInstanceRouter
|
||||
public Task<RouteToSetAttributesResponse> RouteToSetAttributesAsync(
|
||||
string siteId, RouteToSetAttributesRequest request, CancellationToken cancellationToken) =>
|
||||
_communicationService.RouteToSetAttributesAsync(siteId, request, cancellationToken);
|
||||
|
||||
/// <inheritdoc />
public Task<RouteToWaitForAttributeResponse> RouteToWaitForAttributeAsync(
    string siteId,
    RouteToWaitForAttributeRequest request,
    CancellationToken cancellationToken)
{
    // Pure pass-through to the communication service; its task is returned as-is.
    return _communicationService.RouteToWaitForAttributeAsync(siteId, request, cancellationToken);
}
|
||||
}
|
||||
|
||||
@@ -34,4 +34,12 @@ public interface IInstanceRouter
|
||||
/// <returns>A task that resolves to the set-attributes response from the target site.</returns>
|
||||
Task<RouteToSetAttributesResponse> RouteToSetAttributesAsync(
|
||||
string siteId, RouteToSetAttributesRequest request, CancellationToken cancellationToken);
|
||||
|
||||
/// <summary>
/// Sends a wait-for-attribute request to the given site (spec §6).
/// </summary>
/// <param name="siteId">Identifier of the site the request is routed to.</param>
/// <param name="request">The wait-for-attribute request to deliver (value-equality only).</param>
/// <param name="cancellationToken">Token that cancels the routed call.</param>
/// <returns>A task producing the target site's wait-for-attribute response.</returns>
Task<RouteToWaitForAttributeResponse> RouteToWaitForAttributeAsync(
    string siteId,
    RouteToWaitForAttributeRequest request,
    CancellationToken cancellationToken);
|
||||
}
|
||||
|
||||
@@ -6,6 +6,7 @@ using Microsoft.AspNetCore.Http;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using Microsoft.Extensions.Options;
|
||||
using ZB.MOM.WW.Audit;
|
||||
using ZB.MOM.WW.ScadaBridge.AuditLog.Central;
|
||||
using ZB.MOM.WW.ScadaBridge.AuditLog.Configuration;
|
||||
using ZB.MOM.WW.ScadaBridge.Commons.Interfaces.Services;
|
||||
using ZB.MOM.WW.ScadaBridge.Commons.Types.Audit;
|
||||
@@ -95,6 +96,7 @@ public sealed class AuditWriteMiddleware
|
||||
private readonly ILogger<AuditWriteMiddleware> _logger;
|
||||
private readonly IOptionsMonitor<AuditLogOptions> _options;
|
||||
private readonly IAuditActorAccessor? _actorAccessor;
|
||||
private readonly IAuditInboundCeilingHitsCounter _ceilingHitsCounter;
|
||||
|
||||
/// <summary>
|
||||
/// Initializes the middleware with its required dependencies.
|
||||
@@ -110,18 +112,26 @@ public sealed class AuditWriteMiddleware
|
||||
/// construct the middleware; when absent, actor resolution falls back to the
|
||||
/// stashed API-key name only.
|
||||
/// </param>
|
||||
/// <param name="ceilingHitsCounter">
|
||||
/// M5.3 (T7, optional): incremented whenever an inbound request or response
|
||||
/// body is truncated at <see cref="AuditLogOptions.InboundMaxBytes"/>. Optional
|
||||
/// so existing tests and composition roots without the central health snapshot
|
||||
/// wired still construct without the counter; a NoOp is used when absent.
|
||||
/// </param>
|
||||
public AuditWriteMiddleware(
    RequestDelegate next,
    ICentralAuditWriter auditWriter,
    ILogger<AuditWriteMiddleware> logger,
    IOptionsMonitor<AuditLogOptions> options,
    IAuditActorAccessor? actorAccessor = null,
    IAuditInboundCeilingHitsCounter? ceilingHitsCounter = null)
{
    // Required collaborators: reject null up front so a mis-wired pipeline fails
    // at construction rather than on the first request.
    _next = next ?? throw new ArgumentNullException(nameof(next));
    _auditWriter = auditWriter ?? throw new ArgumentNullException(nameof(auditWriter));
    _logger = logger ?? throw new ArgumentNullException(nameof(logger));
    _options = options ?? throw new ArgumentNullException(nameof(options));

    // Optional collaborators: the actor accessor may stay null; the ceiling-hits
    // counter is replaced by a no-op so call sites never need a null check.
    _actorAccessor = actorAccessor;
    _ceilingHitsCounter = ceilingHitsCounter ?? new NoOpAuditInboundCeilingHitsCounter();
}
|
||||
|
||||
/// <summary>
|
||||
@@ -133,9 +143,11 @@ public sealed class AuditWriteMiddleware
|
||||
{
|
||||
var sw = Stopwatch.StartNew();
|
||||
|
||||
// Per-request hot read of the inbound cap so a live config change
|
||||
// Per-request hot read of the options snapshot so a live config change
|
||||
// picks up on the next request without re-resolving the singleton.
|
||||
var cap = _options.CurrentValue.InboundMaxBytes;
|
||||
// InboundMaxBytes is read once here and passed to the capture helpers.
|
||||
var opts = _options.CurrentValue;
|
||||
var cap = opts.InboundMaxBytes;
|
||||
|
||||
// Audit Log #23 (ParentExecutionId): mint the inbound request's per-request
|
||||
// ExecutionId ONCE, here at the start of the request, and stash it on
|
||||
@@ -163,9 +175,20 @@ public sealed class AuditWriteMiddleware
|
||||
// ReadBufferedRequestBodyAsync's own ContentLength is 0 short-circuit
|
||||
// returns (null, false) for the bodyless case anyway, so the audit row
|
||||
// is unchanged.
|
||||
//
|
||||
// M5.3 (T7): check if the matched method/target has SkipBodyCapture set.
|
||||
// The route value is resolved BEFORE the pipeline runs (route matching
|
||||
// has already bound {methodName} at this point), so we can skip the
|
||||
// EnableBuffering allocation and body read up front.
|
||||
var methodNameForOverride = ctx.Request.RouteValues.TryGetValue("methodName", out var rv)
|
||||
&& rv is string mn && !string.IsNullOrWhiteSpace(mn) ? mn : null;
|
||||
var skipBody = methodNameForOverride != null
|
||||
&& opts.PerTargetOverrides.TryGetValue(methodNameForOverride, out var perTarget)
|
||||
&& perTarget.SkipBodyCapture;
|
||||
|
||||
var requestBody = (string?)null;
|
||||
var requestTruncated = false;
|
||||
if (RequestHasBody(ctx.Request))
|
||||
if (!skipBody && RequestHasBody(ctx.Request))
|
||||
{
|
||||
ctx.Request.EnableBuffering();
|
||||
(requestBody, requestTruncated) =
|
||||
@@ -200,15 +223,25 @@ public sealed class AuditWriteMiddleware
|
||||
// The forwarding wrapper has already written every byte to the
|
||||
// original sink; this just pulls back the bounded UTF-8 string.
|
||||
ctx.Response.Body = originalResponseBody;
|
||||
var (responseBody, responseTruncated) = captureStream.GetCapturedBody();
|
||||
var (capturedResponseBody, capturedResponseTruncated) = captureStream.GetCapturedBody();
|
||||
// M5.3 (T7): if SkipBodyCapture is set, discard the captured response
|
||||
// body (the request body was never captured above). The row + headers
|
||||
// still emit with null RequestSummary / ResponseSummary.
|
||||
// Truncation flags are also cleared so ceiling-hit counter is not
|
||||
// bumped for methods that deliberately opt out of body capture.
|
||||
var responseBody = skipBody ? null : capturedResponseBody;
|
||||
var responseTruncated = skipBody ? false : capturedResponseTruncated;
|
||||
|
||||
EmitInboundAudit(
|
||||
ctx,
|
||||
opts,
|
||||
sw.ElapsedMilliseconds,
|
||||
thrown,
|
||||
requestBody,
|
||||
responseBody,
|
||||
requestTruncated || responseTruncated);
|
||||
requestTruncated || responseTruncated,
|
||||
requestTruncated,
|
||||
responseTruncated);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -219,11 +252,14 @@ public sealed class AuditWriteMiddleware
|
||||
/// </summary>
|
||||
private void EmitInboundAudit(
|
||||
HttpContext ctx,
|
||||
AuditLogOptions opts,
|
||||
long durationMs,
|
||||
Exception? thrown,
|
||||
string? requestBody,
|
||||
string? responseBody,
|
||||
bool payloadTruncated)
|
||||
bool payloadTruncated,
|
||||
bool requestTruncated = false,
|
||||
bool responseTruncated = false)
|
||||
{
|
||||
try
|
||||
{
|
||||
@@ -243,10 +279,43 @@ public sealed class AuditWriteMiddleware
|
||||
var actor = isAuthFailure ? null : ResolveActor(ctx);
|
||||
var methodName = ResolveMethodName(ctx);
|
||||
|
||||
// M5.3 (T7): increment the ceiling-hits counter once per request
|
||||
// that hit the cap on EITHER the request or response body.
|
||||
if (requestTruncated || responseTruncated)
|
||||
{
|
||||
try { _ceilingHitsCounter.Increment(); } catch { /* swallow per §7 */ }
|
||||
}
|
||||
|
||||
// M5.3 (T7): capture request headers into Extra JSON alongside the
|
||||
// existing remoteIp / userAgent provenance fields. The header
|
||||
// collection is run through the SAME header-redaction list
|
||||
// (AuditLogOptions.HeaderRedactList) that the ScadaBridgeAuditRedactor
|
||||
// applies to RequestSummary / ResponseSummary — auth/sensitive
|
||||
// headers are redacted before they land in the row. Uses the SAME
|
||||
// options snapshot captured at request start (passed in as opts) as
|
||||
// the SkipBodyCapture / PerTargetOverrides decisions, so a mid-request
|
||||
// live-reload can't split the body-capture and header-redaction
|
||||
// verdicts across two different snapshots.
|
||||
var redactSet = new HashSet<string>(
|
||||
opts.HeaderRedactList,
|
||||
StringComparer.OrdinalIgnoreCase);
|
||||
|
||||
var headerDict = new Dictionary<string, string>(StringComparer.Ordinal);
|
||||
foreach (var header in ctx.Request.Headers)
|
||||
{
|
||||
// Redact headers whose name appears in the HeaderRedactList —
|
||||
// the same "<redacted>" marker used by ScadaBridgeAuditRedactor.
|
||||
var value = redactSet.Contains(header.Key)
|
||||
? "<redacted>"
|
||||
: header.Value.ToString();
|
||||
headerDict[header.Key] = value;
|
||||
}
|
||||
|
||||
var extra = JsonSerializer.Serialize(new
|
||||
{
|
||||
remoteIp = ctx.Connection.RemoteIpAddress?.ToString(),
|
||||
userAgent = ctx.Request.Headers.UserAgent.ToString(),
|
||||
requestHeaders = headerDict,
|
||||
});
|
||||
|
||||
var evt = ScadaBridgeAuditEventFactory.Create(
|
||||
|
||||
@@ -205,6 +205,47 @@ public class RouteTarget
|
||||
return response.Values;
|
||||
}
|
||||
|
||||
/// <summary>
/// Waits for an attribute of the remote instance to equal <paramref name="targetValue"/>,
/// giving up once <paramref name="timeout"/> has elapsed (spec §6). Only value equality
/// crosses the wire: the target is encoded with <see cref="AttributeValueCodec"/> and the
/// comparison is left to the site — no predicate and no quality flag are sent.
/// </summary>
/// <param name="attributeName">Attribute to watch on the remote instance.</param>
/// <param name="targetValue">Value the attribute has to equal for the wait to match.</param>
/// <param name="timeout">Longest time the site should wait for the match.</param>
/// <param name="cancellationToken">Optional token; it is passed through <c>Effective</c> before use.</param>
/// <returns><c>true</c> when the response reports a match; otherwise <c>false</c>.</returns>
/// <exception cref="InvalidOperationException">The routed response reported failure.</exception>
public async Task<bool> WaitForAttribute(
    string attributeName,
    object? targetValue,
    TimeSpan timeout,
    CancellationToken cancellationToken = default)
{
    var effectiveToken = Effective(cancellationToken);
    var targetSite = await ResolveSiteAsync(effectiveToken);

    // Audit Log #23 (ParentExecutionId): like the Call path, the request carries the
    // spawning inbound request's ExecutionId as the parent, while the CorrelationId
    // is a fresh per-wait lifecycle id.
    var correlationId = Guid.NewGuid().ToString();
    var encodedTarget = AttributeValueCodec.Encode(targetValue);
    var waitRequest = new RouteToWaitForAttributeRequest(
        correlationId,
        _instanceCode,
        attributeName,
        encodedTarget,
        timeout,
        DateTimeOffset.UtcNow,
        _parentExecutionId);

    var waitResponse = await _instanceRouter.RouteToWaitForAttributeAsync(targetSite, waitRequest, effectiveToken);
    if (waitResponse.Success)
    {
        return waitResponse.Matched;
    }

    throw new InvalidOperationException(
        waitResponse.ErrorMessage ?? "Remote attribute wait failed");
}
|
||||
|
||||
/// <summary>
|
||||
/// Sets a single attribute value on the remote instance.
|
||||
/// </summary>
|
||||
|
||||
@@ -18,13 +18,17 @@ namespace ZB.MOM.WW.ScadaBridge.ManagementService;
|
||||
|
||||
/// <summary>
|
||||
/// Minimal-API endpoints exposing the central Audit Log (#23) over HTTP for the
|
||||
/// ScadaBridge CLI (M8). Two routes:
|
||||
/// ScadaBridge CLI (M8). Three routes:
|
||||
/// <list type="bullet">
|
||||
/// <item><c>GET /api/audit/query</c> — keyset-paged JSON page, gated on the
|
||||
/// <see cref="AuthorizationPolicies.OperationalAudit"/> permission.</item>
|
||||
/// <item><c>GET /api/audit/export</c> — streamed bulk export (csv / jsonl;
|
||||
/// parquet returns HTTP 501), gated on the
|
||||
/// <see cref="AuthorizationPolicies.AuditExport"/> permission.</item>
|
||||
/// <item><c>GET /api/audit/tree</c> — execution-chain tree rooted at the
|
||||
/// topmost ancestor of a given <c>executionId</c>, returned as a JSON array
|
||||
/// of <see cref="ExecutionTreeNode"/>; gated on
|
||||
/// <see cref="AuthorizationPolicies.OperationalAudit"/>.</item>
|
||||
/// </list>
|
||||
///
|
||||
/// <para>
|
||||
@@ -85,8 +89,16 @@ public static class AuditEndpoints
|
||||
Converters = { new JsonStringEnumConverter() },
|
||||
};
|
||||
|
||||
/// <summary>Default sentinel written by the backfill endpoint when the caller omits <c>sentinel</c>.</summary>
|
||||
public const string DefaultBackfillSentinel = "unknown";
|
||||
|
||||
/// <summary>Default batch size for the backfill endpoint when the caller omits <c>batchSize</c>.</summary>
|
||||
public const int DefaultBackfillBatchSize = 5000;
|
||||
|
||||
/// <summary>
|
||||
/// Registers the <c>/api/audit/query</c> and <c>/api/audit/export</c> minimal-API endpoints.
|
||||
/// Registers the <c>/api/audit/query</c>, <c>/api/audit/export</c>,
|
||||
/// <c>/api/audit/tree</c>, and <c>POST /api/audit/backfill-source-node</c>
|
||||
/// minimal-API endpoints.
|
||||
/// </summary>
|
||||
/// <param name="endpoints">The endpoint route builder to register routes on.</param>
|
||||
/// <returns>The same <paramref name="endpoints"/> builder, for chaining.</returns>
|
||||
@@ -94,6 +106,8 @@ public static class AuditEndpoints
|
||||
{
|
||||
endpoints.MapGet("/api/audit/query", (Delegate)HandleQuery);
|
||||
endpoints.MapGet("/api/audit/export", (Delegate)HandleExport);
|
||||
endpoints.MapGet("/api/audit/tree", (Delegate)HandleTree);
|
||||
endpoints.MapPost("/api/audit/backfill-source-node", (Delegate)HandleBackfillSourceNode);
|
||||
return endpoints;
|
||||
}
|
||||
|
||||
@@ -232,6 +246,177 @@ public static class AuditEndpoints
|
||||
return Results.Empty;
|
||||
}
|
||||
|
||||
// ─────────────────────────────────────────────────────────────────────
|
||||
// GET /api/audit/tree
|
||||
// ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
/// <summary>
/// Serves <c>GET /api/audit/tree?executionId=...</c>. The caller is authenticated and
/// must hold one of the OperationalAudit roles; the <c>executionId</c> query value must
/// parse as a GUID. On success the result of
/// <see cref="IAuditLogRepository.GetExecutionTreeAsync"/> is returned as JSON.
/// </summary>
/// <param name="context">The HTTP context of the request being handled.</param>
/// <returns>
/// The authentication failure result, a forbidden result, a 400 JSON error for a
/// missing or malformed <c>executionId</c>, or the JSON-serialized tree nodes.
/// </returns>
internal static async Task<IResult> HandleTree(HttpContext context)
{
    var authOutcome = await AuthenticateAsync(context);
    if (authOutcome.Failure is not null)
    {
        return authOutcome.Failure;
    }

    var mayReadAudit = HasAnyRole(authOutcome.User!, AuthorizationPolicies.OperationalAuditRoles);
    if (!mayReadAudit)
    {
        return Forbidden("OperationalAudit");
    }

    var executionIdText = context.Request.Query["executionId"].ToString();
    if (!string.IsNullOrWhiteSpace(executionIdText) && Guid.TryParse(executionIdText, out var executionId))
    {
        var auditRepository = context.RequestServices.GetRequiredService<IAuditLogRepository>();
        var treeNodes = await auditRepository.GetExecutionTreeAsync(executionId, context.RequestAborted);

        return Results.Json(treeNodes, JsonOptions);
    }

    // Absent, blank, or non-GUID executionId.
    return Results.Json(
        new { error = "Missing or invalid 'executionId' query parameter (expected a GUID).", code = "BAD_REQUEST" },
        statusCode: 400);
}
|
||||
|
||||
// ─────────────────────────────────────────────────────────────────────
|
||||
// POST /api/audit/backfill-source-node
|
||||
// ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
/// <summary>
/// Handles <c>POST /api/audit/backfill-source-node</c>: authenticates, requires one of
/// <see cref="AuthorizationPolicies.AuditExportRoles"/>, reads the JSON body for
/// <c>sentinel</c> / <c>before</c> / <c>batchSize</c>, and calls
/// <see cref="IAuditLogRepository.BackfillSourceNodeAsync"/>.
///
/// <para>
/// <b>Request body.</b>
/// <code>
/// {
///   "sentinel": "unknown",           // optional string; default "unknown"
///   "before": "2026-01-01T00:00:00Z", // required ISO-8601 UTC
///   "batchSize": 5000                 // optional; default 5000
/// }
/// </code>
/// </para>
///
/// <para>
/// <b>Validation.</b> The body, when present, must be a JSON object. A
/// <c>sentinel</c> that is neither a string nor <c>null</c>, or a <c>before</c>
/// that is not a parseable date-time string, yields HTTP 400. A blank or
/// <c>null</c> <c>sentinel</c> falls back to <see cref="DefaultBackfillSentinel"/>.
/// A <c>batchSize</c> that is not a positive 32-bit integer is ignored and
/// <see cref="DefaultBackfillBatchSize"/> is used.
/// </para>
///
/// <para>
/// <b>Response (200).</b>
/// <code>{ "rowsUpdated": 12345, "sentinel": "unknown", "before": "2026-01-01T00:00:00Z" }</code>
/// </para>
/// </summary>
/// <param name="context">The HTTP context for the current request.</param>
/// <returns>
/// A task that resolves to the HTTP result: the authentication failure, a forbidden
/// result, a 400 JSON error, a 499 JSON error when reading the body is cancelled, or
/// the 200 JSON summary.
/// </returns>
internal static async Task<IResult> HandleBackfillSourceNode(HttpContext context)
{
    var auth = await AuthenticateAsync(context);
    if (auth.Failure is not null)
    {
        return auth.Failure;
    }

    // Admin-only: backfilling is a one-time ops procedure on the maintenance path.
    if (!HasAnyRole(auth.User!, AuthorizationPolicies.AuditExportRoles))
    {
        return Forbidden("Administrator");
    }

    string bodyText;
    try
    {
        using var reader = new System.IO.StreamReader(context.Request.Body);
        bodyText = await reader.ReadToEndAsync(context.RequestAborted);
    }
    catch (OperationCanceledException)
    {
        return Results.Json(new { error = "Request cancelled.", code = "CANCELLED" }, statusCode: 499);
    }

    string sentinel = DefaultBackfillSentinel;
    DateTime? beforeUtc = null;
    int batchSize = DefaultBackfillBatchSize;

    if (!string.IsNullOrWhiteSpace(bodyText))
    {
        try
        {
            using var doc = System.Text.Json.JsonDocument.Parse(bodyText);
            var root = doc.RootElement;

            // TryGetProperty throws InvalidOperationException on a non-object root
            // (e.g. "[]" or "42"), so reject that shape before probing properties.
            if (root.ValueKind != System.Text.Json.JsonValueKind.Object)
            {
                return BadRequest("Request body must be a JSON object.");
            }

            if (root.TryGetProperty("sentinel", out var sentinelEl))
            {
                // GetString throws for anything other than String/Null, so branch on the kind.
                switch (sentinelEl.ValueKind)
                {
                    case System.Text.Json.JsonValueKind.String:
                        var s = sentinelEl.GetString();
                        if (!string.IsNullOrWhiteSpace(s))
                        {
                            sentinel = s.Trim();
                        }

                        break;

                    case System.Text.Json.JsonValueKind.Null:
                        // Explicit null behaves like an omitted sentinel: keep the default.
                        break;

                    default:
                        return BadRequest("Invalid 'sentinel' value; expected a string.");
                }
            }

            if (root.TryGetProperty("before", out var beforeEl))
            {
                if (beforeEl.ValueKind == System.Text.Json.JsonValueKind.String
                    && DateTime.TryParse(
                        beforeEl.GetString(),
                        System.Globalization.CultureInfo.InvariantCulture,
                        System.Globalization.DateTimeStyles.AssumeUniversal | System.Globalization.DateTimeStyles.AdjustToUniversal,
                        out var parsed))
                {
                    beforeUtc = DateTime.SpecifyKind(parsed, DateTimeKind.Utc);
                }
                else
                {
                    return BadRequest("Invalid 'before' value; expected ISO-8601 UTC datetime.");
                }
            }

            // TryGetInt32 throws on a non-number token; such values are ignored like
            // any other unusable batchSize and the default stays in effect.
            if (root.TryGetProperty("batchSize", out var batchEl)
                && batchEl.ValueKind == System.Text.Json.JsonValueKind.Number
                && batchEl.TryGetInt32(out var b)
                && b > 0)
            {
                batchSize = b;
            }
        }
        catch (System.Text.Json.JsonException)
        {
            return BadRequest("Request body must be valid JSON.");
        }
    }

    if (beforeUtc is null)
    {
        return BadRequest("Required field 'before' (ISO-8601 UTC datetime) is missing.");
    }

    var repo = context.RequestServices.GetRequiredService<IAuditLogRepository>();
    var rowsUpdated = await repo.BackfillSourceNodeAsync(sentinel, beforeUtc.Value, batchSize, context.RequestAborted);

    return Results.Json(new
    {
        rowsUpdated,
        sentinel,
        before = beforeUtc.Value.ToString("O", System.Globalization.CultureInfo.InvariantCulture),
    }, JsonOptions);

    // Single place that shapes every 400 response of this endpoint.
    static IResult BadRequest(string message) =>
        Results.Json(new { error = message, code = "BAD_REQUEST" }, statusCode: 400);
}
|
||||
|
||||
/// <summary>
|
||||
/// Streams every matching row as RFC 4180 CSV, paging the repository with its
|
||||
/// keyset cursor and flushing after each page so a large export starts
|
||||
|
||||
@@ -122,6 +122,7 @@ public class NotificationOutboxActor : ReceiveActor, IWithTimers
|
||||
Receive<DiscardNotificationRequest>(HandleDiscard);
|
||||
Receive<NotificationKpiRequest>(HandleKpiRequest);
|
||||
Receive<PerSiteNotificationKpiRequest>(HandlePerSiteKpiRequest);
|
||||
Receive<PerNodeNotificationKpiRequest>(HandlePerNodeKpiRequest);
|
||||
}
|
||||
|
||||
/// <inheritdoc />
|
||||
@@ -1081,6 +1082,38 @@ public class NotificationOutboxActor : ReceiveActor, IWithTimers
|
||||
return new PerSiteNotificationKpiResponse(correlationId, Success: true, ErrorMessage: null, sites);
|
||||
}
|
||||
|
||||
/// <summary>
/// Answers a per-node KPI request. The stuck cutoff comes from <c>StuckCutoff</c> and the
/// delivered window from <c>_options.DeliveredKpiWindow</c>, both measured from the current
/// UTC time; the computed response (or a failure response) is piped back to the requester.
/// </summary>
/// <param name="request">The per-node KPI request; its correlation id is echoed in the reply.</param>
private void HandlePerNodeKpiRequest(PerNodeNotificationKpiRequest request)
{
    // Taken before the asynchronous work starts and used as the PipeTo reply target.
    var replyTo = Sender;
    var now = DateTimeOffset.UtcNow;

    var kpiTask = ComputePerNodeKpisAsync(
        request.CorrelationId,
        StuckCutoff(now),
        now - _options.DeliveredKpiWindow);

    kpiTask.PipeTo(
        replyTo,
        success: computed => computed,
        // A faulted computation becomes an unsuccessful response with no node snapshots.
        failure: error => new PerNodeNotificationKpiResponse(
            request.CorrelationId,
            Success: false,
            ErrorMessage: error.GetBaseException().Message,
            Nodes: Array.Empty<NodeNotificationKpiSnapshot>()));
}
|
||||
|
||||
private async Task<PerNodeNotificationKpiResponse> ComputePerNodeKpisAsync(
    string correlationId, DateTimeOffset stuckCutoff, DateTimeOffset deliveredSince)
{
    // The repository is resolved from a scope that lives only for this computation.
    using (var kpiScope = _serviceProvider.CreateScope())
    {
        var outboxRepository = kpiScope.ServiceProvider.GetRequiredService<INotificationOutboxRepository>();
        var perNodeSnapshots = await outboxRepository.ComputePerNodeKpisAsync(stuckCutoff, deliveredSince);

        return new PerNodeNotificationKpiResponse(
            correlationId,
            Success: true,
            ErrorMessage: null,
            perNodeSnapshots);
    }
}
|
||||
|
||||
/// <summary>
|
||||
/// The instant before which a still-pending notification counts as stuck — <paramref name="now"/>
|
||||
/// offset back by <see cref="NotificationOutboxOptions.StuckAgeThreshold"/>.
|
||||
|
||||
@@ -239,6 +239,7 @@ public class SiteCallAuditActor : ReceiveActor
|
||||
Receive<SiteCallDetailRequest>(HandleDetail);
|
||||
Receive<SiteCallKpiRequest>(HandleKpi);
|
||||
Receive<PerSiteSiteCallKpiRequest>(HandlePerSiteKpi);
|
||||
Receive<PerNodeSiteCallKpiRequest>(HandlePerNodeKpi);
|
||||
|
||||
// Task 5 (#22): central→site Retry/Discard relay for parked cached calls.
|
||||
Receive<RegisterCentralCommunication>(msg =>
|
||||
@@ -817,6 +818,47 @@ public class SiteCallAuditActor : ReceiveActor
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
/// Answers a per-node KPI request. The stuck cutoff is the current UTC time minus
/// <c>_options.StuckAgeThreshold</c> and the interval bound is the current UTC time minus
/// <c>_options.KpiInterval</c>; the computed response (or a failure response) is piped
/// back to the requester.
/// </summary>
/// <param name="request">The per-node KPI request; its correlation id is echoed in the reply.</param>
private void HandlePerNodeKpi(PerNodeSiteCallKpiRequest request)
{
    // Taken before the asynchronous work starts and used as the PipeTo reply target.
    var replyTo = Sender;
    var utcNow = DateTime.UtcNow;

    var kpiTask = PerNodeKpiAsync(
        request.CorrelationId,
        utcNow - _options.StuckAgeThreshold,
        utcNow - _options.KpiInterval);

    kpiTask.PipeTo(
        replyTo,
        success: computed => computed,
        // A faulted computation becomes an unsuccessful response with no node snapshots.
        failure: error => new PerNodeSiteCallKpiResponse(
            request.CorrelationId,
            Success: false,
            ErrorMessage: error.GetBaseException().Message,
            Nodes: Array.Empty<SiteCallNodeKpiSnapshot>()));
}
|
||||
|
||||
private async Task<PerNodeSiteCallKpiResponse> PerNodeKpiAsync(
    string correlationId, DateTime stuckCutoff, DateTime intervalSince)
{
    var (scope, repository) = ResolveRepository();
    try
    {
        var perNodeSnapshots = await repository
            .ComputePerNodeKpisAsync(stuckCutoff, intervalSince)
            .ConfigureAwait(false);

        return new PerNodeSiteCallKpiResponse(
            correlationId,
            Success: true,
            ErrorMessage: null,
            perNodeSnapshots);
    }
    finally
    {
        // The scope may be null, hence the conditional dispose; it runs on both
        // the success and the failure path.
        scope?.Dispose();
    }
}
|
||||
|
||||
// ── Task 5: central→site Retry/Discard relay ──
|
||||
|
||||
/// <summary>
|
||||
|
||||
@@ -571,7 +571,20 @@ public class AlarmActor : ReceiveActor
|
||||
/// Passes the firing alarm's level/priority/message so the script can
|
||||
/// branch on severity via the <c>Alarm</c> global.
|
||||
/// </summary>
|
||||
private void SpawnAlarmExecution(AlarmLevel level, int priority, string message)
|
||||
/// <param name="level">The firing alarm severity level.</param>
|
||||
/// <param name="priority">The firing alarm priority.</param>
|
||||
/// <param name="message">The firing alarm message.</param>
|
||||
/// <param name="parentExecutionId">
|
||||
/// Audit Log #23 (M5.4 — ParentExecutionId tag-cascade): the execution id of
|
||||
/// the context that fired this alarm, recorded as the on-trigger script run's
|
||||
/// <c>ParentExecutionId</c> so the alarm-triggered run chains under its firing
|
||||
/// context in the audit tree. The alarm subsystem currently has no Guid-typed
|
||||
/// firing id, so the only call sites pass <c>null</c> (the on-trigger run is a
|
||||
/// root). The parameter exists so a future firing-id can flow without
|
||||
/// touching the actor wiring.
|
||||
/// </param>
|
||||
private void SpawnAlarmExecution(
|
||||
AlarmLevel level, int priority, string message, Guid? parentExecutionId = null)
|
||||
{
|
||||
if (_onTriggerCompiledScript == null) return;
|
||||
|
||||
@@ -591,7 +604,9 @@ public class AlarmActor : ReceiveActor
|
||||
_options,
|
||||
_logger,
|
||||
// M2.5 (#9): per-script timeout from the on-trigger script (null = global).
|
||||
_onTriggerExecutionTimeoutSeconds));
|
||||
_onTriggerExecutionTimeoutSeconds,
|
||||
// Audit Log #23 (M5.4): the firing context's execution id (null today).
|
||||
parentExecutionId));
|
||||
|
||||
Context.ActorOf(props, executionId);
|
||||
}
|
||||
|
||||
@@ -29,6 +29,14 @@ public class AlarmExecutionActor : ReceiveActor
|
||||
/// <param name="options">Site runtime configuration options, including the execution timeout.</param>
|
||||
/// <param name="logger">Logger for execution diagnostics.</param>
|
||||
/// <param name="executionTimeoutSeconds">M2.5 (#9): the on-trigger script's per-script execution timeout in seconds. Null or non-positive falls back to the global <see cref="SiteRuntimeOptions.ScriptExecutionTimeoutSeconds"/>.</param>
|
||||
/// <param name="parentExecutionId">
|
||||
/// Audit Log #23 (M5.4 — ParentExecutionId tag-cascade): the execution id of
|
||||
/// the context that fired this alarm, threaded into the on-trigger script's
|
||||
/// <see cref="ScriptRuntimeContext"/> as its <c>ParentExecutionId</c> so the
|
||||
/// alarm-triggered run chains under its firing context. Null today (no
|
||||
/// Guid-typed firing id exists yet) — the run is a root, but the plumbing
|
||||
/// is in place for a future firing id.
|
||||
/// </param>
|
||||
public AlarmExecutionActor(
|
||||
string alarmName,
|
||||
string instanceName,
|
||||
@@ -42,7 +50,9 @@ public class AlarmExecutionActor : ReceiveActor
|
||||
ILogger logger,
|
||||
// M2.5 (#9): per-script execution timeout override (seconds) for the
|
||||
// alarm on-trigger script. Null or non-positive falls back to the global.
|
||||
int? executionTimeoutSeconds = null)
|
||||
int? executionTimeoutSeconds = null,
|
||||
// Audit Log #23 (M5.4): the firing context's execution id (null today).
|
||||
Guid? parentExecutionId = null)
|
||||
{
|
||||
var self = Self;
|
||||
var parent = Context.Parent;
|
||||
@@ -51,7 +61,7 @@ public class AlarmExecutionActor : ReceiveActor
|
||||
alarmName, instanceName, level, priority, message,
|
||||
compiledScript, instanceActor,
|
||||
sharedScriptLibrary, options, self, parent, logger,
|
||||
executionTimeoutSeconds);
|
||||
executionTimeoutSeconds, parentExecutionId);
|
||||
}
|
||||
|
||||
private static void ExecuteAlarmScript(
|
||||
@@ -67,7 +77,8 @@ public class AlarmExecutionActor : ReceiveActor
|
||||
IActorRef self,
|
||||
IActorRef parent,
|
||||
ILogger logger,
|
||||
int? executionTimeoutSeconds)
|
||||
int? executionTimeoutSeconds,
|
||||
Guid? parentExecutionId)
|
||||
{
|
||||
// M2.5 (#9): per-script timeout overrides the global default. A null or
|
||||
// non-positive per-script value (≤ 0) falls back to the global.
|
||||
@@ -95,7 +106,19 @@ public class AlarmExecutionActor : ReceiveActor
|
||||
options.MaxScriptCallDepth,
|
||||
timeout,
|
||||
instanceName,
|
||||
logger);
|
||||
logger,
|
||||
// Audit Log #23 (M5.4 — ParentExecutionId tag-cascade): the
|
||||
// alarm on-trigger run mints its own fresh ExecutionId (the
|
||||
// ctor's `?? NewGuid()` fallback) and records the firing
|
||||
// context's id as its ParentExecutionId — null today, so the
|
||||
// run is a root, but the plumbing exists for a future
|
||||
// firing id.
|
||||
parentExecutionId: parentExecutionId,
|
||||
// WaitForAttribute (spec §4.4): thread the alarm on-trigger
|
||||
// script's per-script execution-timeout token so a
|
||||
// Attributes.WaitAsync inside an on-trigger script is bounded
|
||||
// by the same script deadline.
|
||||
scriptTimeoutToken: cts.Token);
|
||||
|
||||
var globals = new ScriptGlobals
|
||||
{
|
||||
|
||||
@@ -149,6 +149,7 @@ public class DeploymentManagerActor : ReceiveActor, IWithTimers
|
||||
Receive<RouteToCallRequest>(RouteInboundApiCall);
|
||||
Receive<RouteToGetAttributesRequest>(RouteInboundApiGetAttributes);
|
||||
Receive<RouteToSetAttributesRequest>(RouteInboundApiSetAttributes);
|
||||
Receive<RouteToWaitForAttributeRequest>(RouteInboundApiWaitForAttribute);
|
||||
|
||||
// OPC UA Tag Browser — singleton-only re-forward to local /user/dcl-manager.
|
||||
// BrowseNodeCommand is routed to this singleton (active node) by
|
||||
@@ -1078,6 +1079,45 @@ public class DeploymentManagerActor : ReceiveActor, IWithTimers
|
||||
}).PipeTo(sender);
|
||||
}
|
||||
|
||||
/// <summary>
/// Spec §6 (WD-2b): translates a routed <see cref="RouteToWaitForAttributeRequest"/>
/// (inbound-API <c>Route.To().WaitForAttribute()</c>) into a site-local
/// <see cref="WaitForAttributeRequest"/> for the deployed Instance Actor and pipes the
/// outcome back to the requester. The routed wait is value-equality only: the predicate
/// argument is null. The Ask timeout is the wait's own
/// <see cref="RouteToWaitForAttributeRequest.Timeout"/> plus five seconds of slack,
/// because the wait may legitimately block for the full timeout.
/// </summary>
/// <param name="request">The routed wait request received from the inbound API path.</param>
private void RouteInboundApiWaitForAttribute(RouteToWaitForAttributeRequest request)
{
    // Every unsuccessful reply has the same shape; only the error text differs.
    RouteToWaitForAttributeResponse Failed(string errorMessage) =>
        new RouteToWaitForAttributeResponse(
            request.CorrelationId, false, null, null, false,
            false, errorMessage,
            DateTimeOffset.UtcNow);

    if (_instanceActors.TryGetValue(request.InstanceUniqueName, out var instanceActor))
    {
        var replyTo = Sender;

        // Routed waits are value-equality only (predicate null).
        var localWait = new WaitForAttributeRequest(
            request.CorrelationId, request.InstanceUniqueName, request.AttributeName,
            request.TargetValueEncoded, null, request.Timeout, DateTimeOffset.UtcNow);

        // Bound the Ask by the wait timeout plus slack rather than a fixed value.
        var askTimeout = request.Timeout + TimeSpan.FromSeconds(5);

        instanceActor.Ask<WaitForAttributeResponse>(localWait, askTimeout)
            .ContinueWith(askTask =>
            {
                if (askTask.IsCompletedSuccessfully)
                {
                    var waitResult = askTask.Result;
                    return new RouteToWaitForAttributeResponse(
                        request.CorrelationId, waitResult.Matched, waitResult.Value, waitResult.Quality, waitResult.TimedOut,
                        true, null, DateTimeOffset.UtcNow);
                }

                return Failed(askTask.Exception?.GetBaseException().Message ?? "Attribute wait timed out");
            })
            .PipeTo(replyTo);
        return;
    }

    // No deployed instance actor under that name on this site.
    Sender.Tell(Failed($"Instance '{request.InstanceUniqueName}' not found on this site."));
}
|
||||
|
||||
/// <summary>
|
||||
/// Writes attribute values on a deployed instance for a Route.To().SetAttribute(s)
|
||||
/// call (or a central Test Run bound to the instance). Each write is Ask'd to the
|
||||
|
||||
@@ -68,6 +68,18 @@ public class InstanceActor : ReceiveActor
|
||||
// mirroring the rest of the actor's by-name dictionaries).
|
||||
private readonly Dictionary<string, ResolvedAttribute> _resolvedAttributeByName = new();
|
||||
|
||||
// WaitForAttribute (spec §4.2): one-shot waiter registry keyed by the
|
||||
// request CorrelationId. Each entry holds the watched attribute name, the
|
||||
// match test (encoded-target equality, a site-local predicate, or any-change), the
|
||||
// original Sender to reply to, and the scheduled-timeout handle so a match
|
||||
// can cancel it. Single-threaded actor access — no locking needed.
|
||||
private readonly Dictionary<string, PendingWait> _attributeWaiters = new();
|
||||
|
||||
// WaitForAttribute: defensive per-instance cap so a script leaking waiters
|
||||
// in a loop cannot grow the registry without bound. Exceeding it refuses the
|
||||
// wait with an error reply rather than registering.
|
||||
private const int MaxAttributeWaiters = 100;
|
||||
|
||||
// DCL manager actor reference for subscribing to tag values
|
||||
private readonly IActorRef? _dclManager;
|
||||
// Maps each tag path to every attribute canonical name that references it.
|
||||
@@ -170,6 +182,12 @@ public class InstanceActor : ReceiveActor
|
||||
// WP-22/23: Handle attribute value changes from DCL (Tell pattern)
|
||||
Receive<AttributeValueChanged>(HandleAttributeValueChanged);
|
||||
|
||||
// WaitForAttribute (spec §4.2): event-driven "wait for value" waiter
|
||||
// registration + its scheduled-timeout self-message. Both flow only
|
||||
// site-locally (the predicate variant carries a non-serializable delegate).
|
||||
Receive<WaitForAttributeRequest>(HandleWaitForAttribute);
|
||||
Receive<WaitForAttributeTimeout>(HandleWaitForAttributeTimeout);
|
||||
|
||||
// Handle tag value updates from DCL — convert to AttributeValueChanged
|
||||
Receive<TagValueUpdate>(HandleTagValueUpdate);
|
||||
Receive<SubscribeTagsResponse>(_ => { }); // Ack from DCL subscribe — no action needed
|
||||
@@ -519,6 +537,114 @@ public class InstanceActor : ReceiveActor
|
||||
PublishAndNotifyChildren(changed);
|
||||
}
|
||||
|
||||
/// <summary>
/// WaitForAttribute (spec §4.2): registers a one-shot event-driven waiter for
/// an attribute to reach a value (encoded-equality), satisfy a site-local
/// predicate, or change at all. The current-value fast-path and the
/// change-handling in <see cref="HandleAttributeValueChanged"/> both run on
/// this single-threaded actor, so a value that flips between "read current"
/// and "register" cannot be missed (spec §5).
///
/// <para>
/// Outcomes: an immediate Matched reply when the current value already satisfies
/// the test (and the quality gate, if requested); an immediate error reply when the
/// test throws or the per-instance waiter cap is reached; otherwise the waiter is
/// registered together with a scheduled <see cref="WaitForAttributeTimeout"/>
/// self-message. A request that reuses the <c>CorrelationId</c> of a still-pending
/// waiter supersedes it: the earlier waiter's timeout is canceled and its sender
/// receives an error reply.
/// </para>
/// </summary>
/// <param name="req">The wait request (attribute name, encoded target or predicate, timeout, quality gate).</param>
private void HandleWaitForAttribute(WaitForAttributeRequest req)
{
    // Capture the sender immediately — Sender is invalid once we schedule /
    // return and a later message arrives.
    var replyer = Sender;

    // Build the match test: explicit predicate wins; else null encoded target
    // means "any change"; else compare the codec-encoded current value to the
    // encoded target (avoids needing the attribute's DataType to decode).
    Func<object?, bool> test;
    if (req.Predicate is not null)
    {
        test = req.Predicate;
    }
    else if (req.TargetValueEncoded is null)
    {
        test = _ => true;
    }
    else
    {
        var target = req.TargetValueEncoded;
        test = v => string.Equals(
            AttributeValueCodec.Encode(v), target, StringComparison.Ordinal);
    }

    // Fast path: the current value already satisfies the test → reply now.
    // A script-supplied predicate (or the codec-equality lambda) runs on the
    // actor thread; guard it so a throwing predicate cannot crash the actor or
    // leak a never-resolved waiter. On throw: reply non-matched + ErrorMessage
    // and return WITHOUT registering (no timeout scheduled).
    if (_attributes.TryGetValue(req.AttributeName, out var current))
    {
        // Effective quality used for BOTH the §4.2 quality gate and the match
        // reply: a missing quality entry is treated as "Good".
        _attributeQualities.TryGetValue(req.AttributeName, out var fastQuality);
        var effectiveQuality = fastQuality ?? "Good";

        bool fastMatch;
        try
        {
            // §4.2 quality gate ANDed with the value test, both INSIDE the guard:
            // in quality-gated mode a value already at target but at Bad/Uncertain
            // quality is NOT a fast match — it falls through to register + schedule
            // the timeout like any other pending waiter (do NOT fast-reply matched).
            fastMatch =
                (!req.RequireGoodQuality
                 || string.Equals(effectiveQuality, "Good", StringComparison.Ordinal))
                && test(current);
        }
        catch (Exception ex)
        {
            _logger.LogWarning(ex,
                "WaitForAttribute predicate threw on the fast-path for {Instance}.{Attribute}; refusing the wait",
                _instanceUniqueName, req.AttributeName);
            replyer.Tell(new WaitForAttributeResponse(
                req.CorrelationId, Matched: false, null, null, TimedOut: false,
                ErrorMessage: "Wait predicate threw: " + ex.Message));
            return;
        }

        if (fastMatch)
        {
            replyer.Tell(new WaitForAttributeResponse(
                req.CorrelationId, Matched: true, current, effectiveQuality, TimedOut: false));
            return;
        }
    }

    // A reused CorrelationId must not silently overwrite the earlier entry: its
    // scheduled timeout would stay live and later evict THIS waiter early, and the
    // earlier sender would never hear back. Retire the earlier waiter explicitly.
    if (_attributeWaiters.Remove(req.CorrelationId, out var superseded))
    {
        superseded.Timeout.Cancel();
        superseded.Replyer.Tell(new WaitForAttributeResponse(
            req.CorrelationId, Matched: false, null, null, TimedOut: false,
            ErrorMessage: "Superseded by a newer wait with the same correlation id"));
    }

    // Defensive cap: refuse rather than register if the instance already has
    // too many concurrent waiters (guards against a script leaking waiters).
    if (_attributeWaiters.Count >= MaxAttributeWaiters)
    {
        replyer.Tell(new WaitForAttributeResponse(
            req.CorrelationId, Matched: false, null, null, TimedOut: false,
            ErrorMessage: "Too many concurrent attribute waiters on this instance"));
        return;
    }

    // The timeout is script-supplied. Clamp a negative value to zero so it is
    // handled as an immediate timeout instead of reaching the scheduler.
    // NOTE(review): the scheduler is expected to throw on a negative delay —
    // confirm against the Akka.NET version in use.
    var delay = req.Timeout < TimeSpan.Zero ? TimeSpan.Zero : req.Timeout;

    // Register and schedule the self-evicting timeout (NativeAlarmActor idiom).
    var handle = Context.System.Scheduler.ScheduleTellOnceCancelable(
        delay, Self, new WaitForAttributeTimeout(req.CorrelationId), Self);

    _attributeWaiters[req.CorrelationId] =
        new PendingWait(req.AttributeName, test, replyer, handle, req.RequireGoodQuality);
}
|
||||
|
||||
/// <summary>
/// WaitForAttribute (spec §4.2): handles the scheduled timeout self-message of a
/// waiter. When the waiter is still in the registry it is removed and its sender
/// is told the wait timed out; when it is no longer registered (it was already
/// resolved and removed) the message is ignored.
/// </summary>
/// <param name="msg">The timeout message carrying the waiter's correlation id.</param>
private void HandleWaitForAttributeTimeout(WaitForAttributeTimeout msg)
{
    // Remove-and-fetch in one step; nothing to do if the waiter is already gone.
    if (!_attributeWaiters.Remove(msg.CorrelationId, out var expired))
    {
        return;
    }

    var timedOutReply = new WaitForAttributeResponse(
        msg.CorrelationId, Matched: false, null, null, TimedOut: true);
    expired.Replyer.Tell(timedOutReply);
}
|
||||
|
||||
/// <summary>
|
||||
/// Handles tag value updates from DCL. Maps the tag path back to the attribute
|
||||
/// canonical name and converts to an AttributeValueChanged for unified processing.
|
||||
@@ -556,9 +682,14 @@ public class InstanceActor : ReceiveActor
|
||||
_attributeQualities[attrName] = "Bad";
|
||||
_attributeTimestamps[attrName] = update.Timestamp;
|
||||
var currentValue = _attributes.GetValueOrDefault(attrName);
|
||||
// WaitForAttribute (spec §4.2): quality-only republish — the
|
||||
// stored value is UNCHANGED (we publish the OLD currentValue, only
|
||||
// the quality flips to Bad). Do NOT evaluate waiters, or an
|
||||
// "any-change" / unchanged-value-equality waiter would fire on a
|
||||
// non-change.
|
||||
PublishAndNotifyChildren(new AttributeValueChanged(
|
||||
_instanceUniqueName, update.TagPath, attrName,
|
||||
currentValue, "Bad", update.Timestamp));
|
||||
currentValue, "Bad", update.Timestamp), evaluateWaiters: false);
|
||||
}
|
||||
continue;
|
||||
}
|
||||
@@ -908,7 +1039,17 @@ public class InstanceActor : ReceiveActor
|
||||
/// Publishes attribute change to stream and notifies child Script/Alarm actors.
|
||||
/// WP-22: Tell for attribute notifications (fire-and-forget, never blocks).
|
||||
/// </summary>
|
||||
private void PublishAndNotifyChildren(AttributeValueChanged changed)
|
||||
/// <param name="changed">The attribute change to publish.</param>
|
||||
/// <param name="evaluateWaiters">
|
||||
/// WaitForAttribute (spec §4.2): when <c>true</c> (the default), registered
|
||||
/// <c>Attributes.WaitAsync</c> waiters on this attribute are re-evaluated against
|
||||
/// <paramref name="changed"/>'s value. Pass <c>false</c> on republish/quality-only
|
||||
/// paths that do NOT assign a new value to <c>_attributes[name]</c> (e.g. the
|
||||
/// List-coerce-failure Bad-quality republish, which publishes the OLD value) —
|
||||
/// otherwise an "any-change" waiter (or a waiter whose target equals the unchanged
|
||||
/// value) would spuriously fire even though nothing actually changed.
|
||||
/// </param>
|
||||
private void PublishAndNotifyChildren(AttributeValueChanged changed, bool evaluateWaiters = true)
|
||||
{
|
||||
// WP-23: Publish to site-wide stream
|
||||
_streamManager?.PublishAttributeValueChanged(changed);
|
||||
@@ -924,6 +1065,83 @@ public class InstanceActor : ReceiveActor
|
||||
{
|
||||
alarmActor.Tell(changed);
|
||||
}
|
||||
|
||||
// WaitForAttribute (spec §4.2): re-evaluate any waiters on THIS attribute —
|
||||
// but ONLY when this publish reflects a real value change (evaluateWaiters).
|
||||
// The genuine value-change paths (HandleAttributeValueChanged, the scalar
|
||||
// DCL update path, HandleSetStaticAttributeCore) call it AFTER assigning
|
||||
// _attributes[name], so changed.Value is the just-applied current value.
|
||||
// Republish/quality-only paths (List-coerce-failure Bad-quality, which
|
||||
// publishes the OLD value) pass evaluateWaiters:false so an "any-change" or
|
||||
// unchanged-value-equality waiter does not spuriously fire (spec §4.2).
|
||||
// Iterate a snapshot so satisfied waiters can be removed during the loop;
|
||||
// each match cancels its scheduled timeout (so no stray WaitForAttributeTimeout
|
||||
// follows) and replies Matched=true.
|
||||
if (evaluateWaiters)
|
||||
ResolveMatchedWaiters(changed);
|
||||
}
|
||||
|
||||
/// <summary>
/// WaitForAttribute (spec §4.2): resolves every registered waiter on
/// <paramref name="changed"/>'s attribute whose test passes against the published
/// value. A resolved waiter has its timeout canceled, receives a Matched reply and
/// is removed from the registry. Does nothing when no waiters are registered.
///
/// <para>
/// Each test is evaluated in its own try/catch so one throwing script predicate
/// (or codec lambda) cannot abort the loop and strand the other waiters on the same
/// attribute. A waiter whose test throws is evicted too: timeout canceled,
/// non-matched reply with an ErrorMessage, removed from the registry.
/// </para>
/// </summary>
/// <param name="changed">The attribute change whose value and quality the waiters are tested against.</param>
private void ResolveMatchedWaiters(AttributeValueChanged changed)
{
    if (_attributeWaiters.Count == 0)
    {
        return;
    }

    // Copy the waiters on THIS attribute first; the registry is mutated while the
    // copy is walked, and no test runs during the copy.
    var snapshot = new List<KeyValuePair<string, PendingWait>>();
    foreach (var entry in _attributeWaiters)
    {
        if (entry.Value.AttributeName == changed.AttributeName)
        {
            snapshot.Add(entry);
        }
    }

    // The quality of this change is the same for every waiter.
    var qualityIsGood = string.Equals(changed.Quality, "Good", StringComparison.Ordinal);

    foreach (var entry in snapshot)
    {
        var correlationId = entry.Key;
        var waiter = entry.Value;

        bool satisfied;
        string? failure = null;
        try
        {
            // §4.2 quality gate first, then the value test: a quality-gated waiter
            // is not tested at all while the quality is not "Good" and stays pending.
            satisfied = (qualityIsGood || !waiter.RequireGoodQuality) && waiter.Test(changed.Value);
        }
        catch (Exception ex)
        {
            _logger.LogWarning(ex,
                "WaitForAttribute predicate threw while resolving waiter {CorrelationId} on {Instance}.{Attribute}; evicting it",
                correlationId, _instanceUniqueName, changed.AttributeName);
            satisfied = false;
            failure = "Wait predicate threw: " + ex.Message;
        }

        // Neither matched nor failed: leave the waiter registered.
        if (!satisfied && failure is null)
        {
            continue;
        }

        var reply = failure is null
            ? new WaitForAttributeResponse(
                correlationId, Matched: true, changed.Value, changed.Quality, TimedOut: false)
            : new WaitForAttributeResponse(
                correlationId, Matched: false, null, null, TimedOut: false,
                ErrorMessage: failure);

        // Same eviction sequence for a match and for a throwing test.
        waiter.Timeout.Cancel();
        waiter.Replyer.Tell(reply);
        _attributeWaiters.Remove(correlationId);
    }
}
|
||||
|
||||
/// <summary>
|
||||
@@ -1202,4 +1420,23 @@ public class InstanceActor : ReceiveActor
|
||||
/// Internal message for async override loading result.
|
||||
/// </summary>
|
||||
internal record LoadOverridesResult(Dictionary<string, string> Overrides, string? Error);
|
||||
|
||||
/// <summary>
|
||||
/// WaitForAttribute (spec §4.2): one registered, not-yet-satisfied waiter, stored in the waiter registry under its correlation id.
|
||||
/// </summary>
|
||||
/// <param name="AttributeName">The name of the attribute this waiter watches; waiters are selected by comparing it to the changed attribute's name.</param>
|
||||
/// <param name="Test">The match test applied to a candidate value (encoded-target equality OR site-local predicate OR any-change).</param>
|
||||
/// <param name="Replyer">The original sender, told the outcome on match, error or timeout.</param>
|
||||
/// <param name="Timeout">The handle of the scheduled timeout message, canceled when the waiter is resolved.</param>
|
||||
/// <param name="RequireGoodQuality">
|
||||
/// Quality-gated ("Good"-only) mode (spec §4.2): when <c>true</c>, the resolve
|
||||
/// loop additionally requires <c>changed.Quality == "Good"</c> before the test
|
||||
/// can match.
|
||||
/// </param>
|
||||
private sealed record PendingWait(
|
||||
string AttributeName,
|
||||
Func<object?, bool> Test,
|
||||
IActorRef Replyer,
|
||||
ICancelable Timeout,
|
||||
bool RequireGoodQuality);
|
||||
}
|
||||
|
||||
@@ -221,7 +221,12 @@ public class ScriptExecutionActor : ReceiveActor
|
||||
// M2.12 (#25): thread the singleton site event logger so
|
||||
// recursion-limit violations at CallScript/CallShared emit a
|
||||
// script Error site event in addition to ILogger.LogError.
|
||||
siteEventLogger: siteEventLogger);
|
||||
siteEventLogger: siteEventLogger,
|
||||
// WaitForAttribute (spec §4.3/§4.4): thread the per-script
|
||||
// execution-timeout token so Attributes.WaitAsync's Ask is
|
||||
// bounded by the script's own ExecutionTimeoutSeconds — a
|
||||
// shorter script deadline wins over the wait's own timeout.
|
||||
scriptTimeoutToken: cts.Token);
|
||||
|
||||
var globals = new ScriptGlobals
|
||||
{
|
||||
|
||||
@@ -73,6 +73,107 @@ public class AttributeAccessor
|
||||
/// <returns>A task that represents the asynchronous operation.</returns>
|
||||
public Task SetAsync(string key, object? value)
|
||||
=> _ctx.SetAttribute(Resolve(key), AttributeValueCodec.Encode(value) ?? string.Empty);
|
||||
|
||||
/// <summary>
/// WaitForAttribute (spec §3-§5), value form: waits event-driven until the attribute
/// equals <paramref name="targetValue"/> (compared in codec-encoded form), for at
/// most <paramref name="timeout"/>. The key goes through <see cref="Resolve"/>
/// (scope/composition path resolution) exactly like <see cref="GetAsync"/> /
/// <see cref="SetAsync"/>. A timeout yields <c>false</c>; it does not throw.
///
/// <para>
/// <b>Quality (spec §4.2):</b> by default only the VALUE is tested, so a value
/// arriving at Bad quality still satisfies the wait. With
/// <paramref name="requireGoodQuality"/><c>:true</c> a value that reaches the target
/// at Bad/Uncertain quality is ignored and the wait holds until the target is
/// reached at "Good" quality (or the timeout elapses).
/// </para>
///
/// <para>
/// <b>Null target:</b> a <c>null</c> <paramref name="targetValue"/> means "match on
/// any change" — the next value the attribute receives matches, and the wait matches
/// IMMEDIATELY if the attribute already holds a value when the wait is registered.
/// </para>
/// </summary>
/// <param name="key">The attribute key; scope-resolved before the wait is registered.</param>
/// <param name="targetValue">The value to wait for, or <c>null</c> for "match on any change".</param>
/// <param name="timeout">Maximum time to wait before the result is <c>false</c>.</param>
/// <param name="requireGoodQuality">
/// <c>true</c> for quality-gated ("Good"-only) matching (spec §4.2); the default
/// <c>false</c> is quality-agnostic.
/// </param>
/// <returns><c>true</c> when the wait matched within the timeout; otherwise <c>false</c>.</returns>
public Task<bool> WaitAsync(string key, object? targetValue, TimeSpan timeout, bool requireGoodQuality = false)
{
    var attributeName = Resolve(key);
    var encodedTarget = AttributeValueCodec.Encode(targetValue);

    // Value form: no predicate is supplied.
    return _ctx.WaitAttribute(attributeName, encodedTarget, null, timeout, requireGoodQuality);
}
|
||||
|
||||
/// <summary>
/// WaitForAttribute (spec §3-§5), predicate form: waits event-driven until
/// <paramref name="predicate"/> returns <c>true</c> for the attribute's value, for
/// at most <paramref name="timeout"/>. Site-local only, because the predicate is an
/// in-process delegate. The key goes through <see cref="Resolve"/>. A timeout
/// yields <c>false</c>; it does not throw.
///
/// <para>
/// <b>Quality (spec §4.2):</b> by default the predicate is tested against the VALUE
/// regardless of quality. With <paramref name="requireGoodQuality"/><c>:true</c> a
/// value that satisfies the predicate at Bad/Uncertain quality is ignored until it
/// does so at "Good" quality.
/// </para>
/// </summary>
/// <param name="key">The attribute key; scope-resolved before the wait is registered.</param>
/// <param name="predicate">The site-local predicate tested against the attribute value.</param>
/// <param name="timeout">Maximum time to wait before the result is <c>false</c>.</param>
/// <param name="requireGoodQuality">
/// <c>true</c> for quality-gated ("Good"-only) matching (spec §4.2); the default
/// <c>false</c> is quality-agnostic.
/// </param>
/// <returns><c>true</c> when the wait matched within the timeout; otherwise <c>false</c>.</returns>
public Task<bool> WaitAsync(string key, Func<object?, bool> predicate, TimeSpan timeout, bool requireGoodQuality = false)
{
    var attributeName = Resolve(key);

    // Predicate form: no encoded target is supplied.
    return _ctx.WaitAttribute(attributeName, null, predicate, timeout, requireGoodQuality);
}
|
||||
|
||||
/// <summary>
/// WaitForAttribute (spec §3), richer value form: same wait as
/// <see cref="WaitAsync(string, object?, TimeSpan, bool)"/>, but the caller gets the
/// full <see cref="WaitResult"/> (matched flag, matched value, quality, timed-out
/// flag) rather than a bare bool. The key goes through <see cref="Resolve"/> like
/// the other accessors. A timeout does not throw; it yields
/// <c>WaitResult { Matched = false, TimedOut = true }</c>.
/// </summary>
/// <param name="key">The attribute key; scope-resolved before the wait is registered.</param>
/// <param name="targetValue">
/// The value to wait for (compared in codec-encoded form); <c>null</c> means
/// "match on any change".
/// </param>
/// <param name="timeout">Maximum time to wait before a timed-out result is returned.</param>
/// <param name="requireGoodQuality">
/// <c>true</c> for quality-gated ("Good"-only) matching (spec §4.2); defaults to <c>false</c>.
/// </param>
/// <returns>The full <see cref="WaitResult"/> of the wait.</returns>
public Task<WaitResult> WaitForAsync(string key, object? targetValue, TimeSpan timeout, bool requireGoodQuality = false)
{
    var attributeName = Resolve(key);
    var encodedTarget = AttributeValueCodec.Encode(targetValue);

    // Value form: no predicate is supplied.
    return _ctx.WaitAttributeFull(attributeName, encodedTarget, null, timeout, requireGoodQuality);
}
|
||||
|
||||
/// <summary>
/// WaitForAttribute (spec §3), richer predicate form: same wait as
/// <see cref="WaitAsync(string, Func{object?, bool}, TimeSpan, bool)"/>, but the
/// caller gets the full <see cref="WaitResult"/>. Site-local only, because the
/// predicate is an in-process delegate. The key goes through <see cref="Resolve"/>.
/// A timeout does not throw; it yields
/// <c>WaitResult { Matched = false, TimedOut = true }</c>.
/// </summary>
/// <param name="key">The attribute key; scope-resolved before the wait is registered.</param>
/// <param name="predicate">The site-local predicate tested against the attribute value.</param>
/// <param name="timeout">Maximum time to wait before a timed-out result is returned.</param>
/// <param name="requireGoodQuality">
/// <c>true</c> for quality-gated ("Good"-only) matching (spec §4.2); defaults to <c>false</c>.
/// </param>
/// <returns>The full <see cref="WaitResult"/> of the wait.</returns>
public Task<WaitResult> WaitForAsync(string key, Func<object?, bool> predicate, TimeSpan timeout, bool requireGoodQuality = false)
{
    var attributeName = Resolve(key);

    // Predicate form: no encoded target is supplied.
    return _ctx.WaitAttributeFull(attributeName, null, predicate, timeout, requireGoodQuality);
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
|
||||
@@ -46,6 +46,16 @@ public class ScriptRuntimeContext
|
||||
private readonly ILogger _logger;
|
||||
private readonly string _instanceName;
|
||||
|
||||
/// <summary>
|
||||
/// WaitForAttribute (spec §4.3): the per-script execution-timeout token from
|
||||
/// the owning <c>ScriptExecutionActor</c>/<c>AlarmExecutionActor</c>
|
||||
/// (<c>cts.Token</c>). Bounds the <c>Attributes.WaitAsync</c> Ask so a script
|
||||
/// that hits its own <c>ExecutionTimeoutSeconds</c> abandons the wait. Defaults
|
||||
/// to <see cref="CancellationToken.None"/> for contexts that do not thread one
|
||||
/// (legacy callers / tests / the alarm path when it has no CTS).
|
||||
/// </summary>
|
||||
private readonly CancellationToken _scriptTimeoutToken;
|
||||
|
||||
/// <summary>
|
||||
/// WP-13: External system client for ExternalSystem.Call/CachedCall.
|
||||
/// </summary>
|
||||
@@ -194,6 +204,13 @@ public class ScriptRuntimeContext
|
||||
/// <c>ILogger.LogError</c> + throw. When null the existing behaviour is
|
||||
/// unchanged; all existing callers and tests remain source-compatible.
|
||||
/// </param>
|
||||
/// <param name="scriptTimeoutToken">
|
||||
/// WaitForAttribute (spec §4.3): the per-script execution-timeout token
|
||||
/// (<c>cts.Token</c> on the owning execution actor) used to bound
|
||||
/// <c>Attributes.WaitAsync</c>. Defaults to
|
||||
/// <see cref="CancellationToken.None"/> for callers / tests that do not
|
||||
/// thread one — those waits are bounded only by their own timeout.
|
||||
/// </param>
|
||||
public ScriptRuntimeContext(
|
||||
IActorRef instanceActor,
|
||||
IActorRef self,
|
||||
@@ -215,7 +232,8 @@ public class ScriptRuntimeContext
|
||||
Guid? executionId = null,
|
||||
Guid? parentExecutionId = null,
|
||||
string? sourceNode = null,
|
||||
ISiteEventLogger? siteEventLogger = null)
|
||||
ISiteEventLogger? siteEventLogger = null,
|
||||
CancellationToken scriptTimeoutToken = default)
|
||||
{
|
||||
_instanceActor = instanceActor;
|
||||
_self = self;
|
||||
@@ -245,6 +263,66 @@ public class ScriptRuntimeContext
|
||||
_parentExecutionId = parentExecutionId;
|
||||
// M2.12 (#25): optional — null when not wired (tests / AlarmExecutionActor).
|
||||
_siteEventLogger = siteEventLogger;
|
||||
// WaitForAttribute (spec §4.3): default(CancellationToken) == None when
|
||||
// not threaded in — the WaitAsync Ask is then bounded only by its own timeout.
|
||||
_scriptTimeoutToken = scriptTimeoutToken;
|
||||
}
|
||||
|
||||
/// <summary>
/// Audit Log #23 (M5.4): the per-execution id of this run. A nested
/// <c>Scripts.CallShared</c> reads it to record it as the spawned shared script's
/// <c>ParentExecutionId</c>, which is what links the runs into an execution tree.
/// </summary>
internal Guid ExecutionId
{
    get { return _executionId; }
}
|
||||
|
||||
/// <summary>
/// Audit Log #23 (M5.4): the id of the execution that spawned this run, or
/// <c>null</c> for a root run. Exposed so tests can assert on the execution tree.
/// </summary>
internal Guid? ParentExecutionId
{
    get { return _parentExecutionId; }
}
|
||||
|
||||
/// <summary>
/// Audit Log #23 (M5.4 — ParentExecutionId tag-cascade): creates the child
/// <see cref="ScriptRuntimeContext"/> used by an inline <c>Scripts.CallShared</c>
/// invocation. The shared script runs inline (no actor hop) yet is modelled as its
/// OWN node in the audit tree: no execution id is passed, so the constructor mints a
/// fresh one, and THIS run's <see cref="_executionId"/> becomes the child's
/// <c>ParentExecutionId</c>. For <c>B → CallShared(C)</c> that gives
/// <c>C.ParentExecutionId == B.ExecutionId</c>. All remaining dependencies (actors,
/// gateways, audit writer, site id, source node, site event logger and the
/// script-timeout token) are handed over from this context unchanged.
/// </summary>
/// <param name="childCallDepth">The recursion depth of the shared-script call.</param>
/// <returns>The context the shared script executes with.</returns>
internal ScriptRuntimeContext CreateChildContextForSharedScript(int childCallDepth)
    => new ScriptRuntimeContext(
        _instanceActor,
        _self,
        _sharedScriptLibrary,
        childCallDepth,
        _maxCallDepth,
        _askTimeout,
        _instanceName,
        _logger,
        _externalSystemClient,
        _databaseGateway,
        _storeAndForward,
        _siteCommunicationActor,
        _siteId,
        _sourceScript,
        _auditWriter,
        _operationTrackingStore,
        _cachedForwarder,
        // No id supplied: the constructor mints a fresh one for the shared-script run.
        executionId: null,
        // The spawner (THIS run) is the parent of the shared-script run.
        parentExecutionId: _executionId,
        sourceNode: _sourceNode,
        siteEventLogger: _siteEventLogger,
        // WaitForAttribute (spec §4.3): the inline shared script reuses the parent
        // run's execution-timeout token, so a WaitAsync inside it is bounded by the
        // SAME script deadline.
        scriptTimeoutToken: _scriptTimeoutToken);
|
||||
|
||||
/// <summary>
|
||||
@@ -307,6 +385,115 @@ public class ScriptRuntimeContext
|
||||
return response.Value;
|
||||
}
|
||||
|
||||
/// <summary>
/// WaitForAttribute (spec §3-§5): waits event-driven for an attribute to reach
/// a value (encoded-equality), satisfy a site-local predicate, or change at all,
/// bounded by <paramref name="timeout"/>. Returns <c>true</c> if matched within
/// the timeout, <c>false</c> on timeout — NEVER throws on timeout. The backing
/// method for the accessor's <c>Attributes.WaitAsync</c>; it returns only the
/// <c>Matched</c> flag of the result produced by <see cref="WaitInternal"/>.
///
/// <para>
/// The Ask is bounded by the script's own execution-timeout token (§4.3): a
/// script that hits its <c>ExecutionTimeoutSeconds</c> abandons the wait. The
/// Ask timeout is the wait timeout plus a small <see cref="_askTimeout"/> slack
/// so the InstanceActor's own scheduled timeout reply is the authoritative path
/// for the false/timed-out outcome, not the Ask deadline.
/// </para>
///
/// <para>
/// <b>Quality-agnostic by default (spec §4.2):</b> a value arriving at Bad
/// quality still satisfies the wait — the match tests the value, not the quality.
/// Pass <paramref name="requireGoodQuality"/><c>:true</c> for the quality-gated
/// ("Good"-only) mode, in which a value at Bad/Uncertain quality is not a match.
/// </para>
///
/// <para>
/// <b>Never throws on timeout.</b> An <see cref="Akka.Actor.AskTimeoutException"/>
/// (the pathological case where the InstanceActor's authoritative timeout reply
/// never arrives — actor stopped/restarted) is caught and surfaced as <c>false</c>,
/// matching the timeout contract. An <see cref="OperationCanceledException"/> /
/// <see cref="TaskCanceledException"/> from the script-deadline token is NOT caught
/// — it propagates to abort the script (intended §4.3 behaviour).
/// </para>
/// </summary>
/// <param name="name">The scope-resolved attribute name to wait on.</param>
/// <param name="targetValueEncoded">
/// The codec-encoded target value; null (with null <paramref name="predicate"/>)
/// means "any change".
/// </param>
/// <param name="predicate">Site-local predicate; null when the encoded target is used.</param>
/// <param name="timeout">How long to wait before returning false.</param>
/// <param name="requireGoodQuality">
/// Quality-gated ("Good"-only) mode (spec §4.2): when <see langword="true"/>, a
/// value reaching the target / satisfying the predicate at Bad/Uncertain quality
/// is NOT a match — the wait holds until the value satisfies the test at Good
/// quality (or times out). Defaults to <see langword="false"/> (quality-agnostic).
/// </param>
/// <returns><c>true</c> on match within the timeout; <c>false</c> on timeout.</returns>
public async Task<bool> WaitAttribute(
    string name, string? targetValueEncoded, Func<object?, bool>? predicate, TimeSpan timeout,
    bool requireGoodQuality = false)
    => (await WaitInternal(name, targetValueEncoded, predicate, timeout, requireGoodQuality)).Matched;
|
||||
|
||||
/// <summary>
/// WaitForAttribute (spec §3): the overload behind <c>Attributes.WaitForAsync</c>.
/// Runs the same wait as <see cref="WaitAttribute"/> through
/// <see cref="WaitInternal"/>, but hands back the whole outcome as a
/// <see cref="WaitResult"/> (matched flag, matched value, quality, timed-out flag)
/// instead of collapsing it to a bool. A timeout does not throw (see
/// <see cref="WaitInternal"/>).
/// </summary>
/// <param name="name">The scope-resolved attribute name to wait on.</param>
/// <param name="targetValueEncoded">The codec-encoded target value; null (with a null predicate) means "any change".</param>
/// <param name="predicate">Site-local predicate; null when the encoded target is used.</param>
/// <param name="timeout">Maximum time to wait before a timed-out result is returned.</param>
/// <param name="requireGoodQuality">Quality-gated ("Good"-only) mode (spec §4.2); defaults to <see langword="false"/>.</param>
/// <returns>The full <see cref="WaitResult"/>; on timeout <c>Matched:false, TimedOut:true</c>.</returns>
public async Task<WaitResult> WaitAttributeFull(
    string name, string? targetValueEncoded, Func<object?, bool>? predicate, TimeSpan timeout,
    bool requireGoodQuality = false)
{
    var response = await WaitInternal(name, targetValueEncoded, predicate, timeout, requireGoodQuality);

    // Copy the four outcome fields of the actor response into the script-facing result.
    var outcome = new WaitResult(response.Matched, response.Value, response.Quality, response.TimedOut);
    return outcome;
}
|
||||
|
||||
/// <summary>
/// Common implementation behind <see cref="WaitAttribute"/> and
/// <see cref="WaitAttributeFull"/>. It creates a <see cref="WaitForAttributeRequest"/>
/// (carrying the §4.2 <paramref name="requireGoodQuality"/> flag) under a fresh
/// correlation id, Asks the InstanceActor with the script's execution-timeout token
/// attached, and hands back the actor's response unchanged.
/// <para>
/// If the Ask itself times out with an <see cref="AskTimeoutException"/> — the
/// pathological case in which the actor's own authoritative timeout reply never
/// arrives (actor stopped/restarted) — the exception is swallowed and a synthetic
/// non-matched/timed-out response is returned, so the "never throw on timeout"
/// contract holds.
/// </para>
/// <para>
/// An <see cref="OperationCanceledException"/> / <see cref="TaskCanceledException"/>
/// raised by the script-deadline token is NOT caught — it propagates to abort the
/// script (§4.3).
/// </para>
/// </summary>
private async Task<WaitForAttributeResponse> WaitInternal(
    string name, string? targetValueEncoded, Func<object?, bool>? predicate, TimeSpan timeout,
    bool requireGoodQuality)
{
    var correlationId = Guid.NewGuid().ToString();
    var request = new WaitForAttributeRequest(
        correlationId,
        _instanceName,
        name,
        targetValueEncoded,
        predicate,
        timeout,
        DateTimeOffset.UtcNow,
        requireGoodQuality);

    try
    {
        // The Ask window is the requested wait plus the regular ask timeout, so
        // the InstanceActor's own timeout reply normally arrives before the Ask
        // gives up.
        var askWindow = timeout + _askTimeout;
        var response = await _instanceActor.Ask<WaitForAttributeResponse>(
            request, askWindow, _scriptTimeoutToken);
        return response;
    }
    catch (AskTimeoutException)
    {
        // Pathological path: the InstanceActor's scheduled timeout reply never
        // showed up (e.g. the actor stopped/restarted under us). The contract is
        // "false on timeout, never throw", so a non-matched/timed-out response
        // is fabricated instead of leaking the Ask exception.
        // Only AskTimeoutException is handled: OperationCanceledException /
        // TaskCanceledException from the script-deadline token must keep
        // propagating so the script is aborted (§4.3).
        return new WaitForAttributeResponse(
            correlationId, Matched: false, null, null, TimedOut: true);
    }
}
|
||||
|
||||
/// <summary>
|
||||
/// Sets an attribute value. For data-connected attributes the Instance Actor
|
||||
/// forwards the write to the DCL, which writes the physical device; the
|
||||
@@ -366,7 +553,14 @@ public class ScriptRuntimeContext
|
||||
scriptName,
|
||||
ScriptArgs.Normalize(parameters),
|
||||
nextDepth,
|
||||
correlationId);
|
||||
correlationId,
|
||||
// Audit Log #23 (M5.4 — ParentExecutionId tag-cascade): the child
|
||||
// script run is a NEW execution spawned BY this run. Its parent is
|
||||
// THIS run's own ExecutionId — NOT the inherited _parentExecutionId.
|
||||
// So A → CallScript(B) yields B.ParentExecutionId == A.ExecutionId,
|
||||
// building a true multi-level execution tree rather than flattening
|
||||
// every nested call under the original inbound spawner.
|
||||
ParentExecutionId: _executionId);
|
||||
|
||||
// Ask the Instance Actor, which routes to the appropriate Script Actor
|
||||
var result = await _instanceActor.Ask<ScriptCallResult>(request, _askTimeout);
|
||||
@@ -526,8 +720,14 @@ public class ScriptRuntimeContext
|
||||
throw new InvalidOperationException(msg);
|
||||
}
|
||||
|
||||
// Audit Log #23 (M5.4 — ParentExecutionId tag-cascade): the shared
|
||||
// script runs inline, but is modelled as its OWN execution node — a
|
||||
// child context mints a fresh ExecutionId parented to the caller's
|
||||
// ExecutionId, so its audit rows chain under the calling run.
|
||||
var childContext = _context.CreateChildContextForSharedScript(nextDepth);
|
||||
|
||||
return await _library.ExecuteAsync(
|
||||
scriptName, _context, ScriptArgs.Normalize(parameters), cancellationToken);
|
||||
scriptName, childContext, ScriptArgs.Normalize(parameters), cancellationToken);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user