docs+code: close Theme 1 — 24 design-doc / XML-doc drift findings
Doc/XML-comment drift + small adherence fixes across 17 modules. Highlights: - Host-017: site CoordinatedShutdown ordering — SiteStreamGrpcServer gains CancelAllStreams() (refuse new streams, cancel active), wired into Program.cs site branch via ApplicationStopping. - InboundAPI-021: ParentExecutionId now travels on RouteToGet/SetAttributes symmetric with RouteToCallRequest; RouteHelper stamps from _parentExecutionId. - ClusterInfra-012: ClusterOptionsValidator now requires both seed nodes. - Comm-018: SiteCommunicationActor.HeartbeatMessage.IsActive derived from cluster leader check (was hardcoded true). - DM-020: reconciliation audit row attributes the current user, not prior deployer. - SEL-019: EventLogPurgeService early-exits on standby via active-node check. - Plus comment/XML-doc accuracy fixes across AuditLog, ConfigurationDatabase, NotificationOutbox, SiteRuntime, SiteCallAudit; doc refreshes for Component- Commons / -ManagementService / -CLI / -ExternalSystemGateway / -HealthMonitoring / -Transport / -ConfigurationDatabase; CD-023 index-name doc alignment. 11 new regression tests (RouteHelper x4, SiteStreamGrpcServer x2, ClusterOptionsValidator x1, SiteCommunicationActor x1, DeploymentService x1, EventLogPurgeService x3). Build clean (0 warnings); InboundAPI/Communication/ Host suites all green. README regenerated: 112 open (was 136).
This commit is contained in:
@@ -26,10 +26,12 @@ namespace ScadaLink.AuditLog.Central;
|
||||
/// <para>
|
||||
/// Per Bundle D's brief, audit-write failures must NEVER abort the user-facing
|
||||
/// action. The actor wraps each repository call in its own try/catch so a
|
||||
/// single bad row cannot cause the rest of the batch to be lost; the actor's
|
||||
/// <see cref="SupervisorStrategy"/> uses <c>Resume</c> so a thrown exception
|
||||
/// inside <c>ReceiveAsync</c> does not restart the actor (which would also
|
||||
/// reset any in-flight state).
|
||||
/// single bad row cannot cause the rest of the batch to be lost — that
|
||||
/// per-row catch is what keeps this actor alive across handler throws, not
|
||||
/// the supervisor strategy. The <see cref="SupervisorStrategy"/> override
|
||||
/// returns the Akka default decider (Restart for most exceptions) and
|
||||
/// governs children only; this actor has no children today, so the override
|
||||
/// is a forward-compat placeholder.
|
||||
/// </para>
|
||||
/// <para>
|
||||
/// Two constructors exist for a deliberate reason: Bundle D's tests inject a
|
||||
|
||||
@@ -35,9 +35,11 @@ namespace ScadaLink.AuditLog.Central;
|
||||
/// <b>Continue-on-error.</b> A single boundary that throws (transient SQL
|
||||
/// failure, contention with backup, missing object) must NOT prevent the
|
||||
/// other eligible boundaries from being purged on the same tick. Per-boundary
|
||||
/// work runs inside its own try/catch; the actor's
|
||||
/// <see cref="SupervisorStrategy"/> uses Resume so any leaked exception keeps
|
||||
/// the singleton alive for the next tick.
|
||||
/// work runs inside its own try/catch — that per-boundary catch is what
|
||||
/// keeps the singleton alive across handler throws. The
|
||||
/// <see cref="SupervisorStrategy"/> override returns the Akka default
|
||||
/// decider (Restart) and governs children only; this actor has no children
|
||||
/// today, so the override is a forward-compat placeholder.
|
||||
/// </para>
|
||||
/// <para>
|
||||
/// <b>DI scopes.</b> <see cref="IAuditLogRepository"/> is a scoped EF Core
|
||||
|
||||
@@ -48,10 +48,13 @@ namespace ScadaLink.AuditLog.Central;
|
||||
/// <para>
|
||||
/// <b>Failure isolation.</b> A single site that throws (DNS, transport,
|
||||
/// repository write) must NOT prevent other sites from being polled on the
|
||||
/// same tick. The per-site work runs inside its own try/catch; the actor's
|
||||
/// supervisor strategy keeps it alive across any leaked exception with
|
||||
/// <see cref="Akka.Actor.SupervisorStrategy.DefaultDecider"/>'s Restart
|
||||
/// semantics — restart resets the in-memory cursors, but as noted above that's
|
||||
/// same tick. The per-site work runs inside its own try/catch — that
|
||||
/// per-site catch is what keeps the actor running across handler throws.
|
||||
/// The <see cref="SupervisorStrategy"/> override returns
|
||||
/// <see cref="Akka.Actor.SupervisorStrategy.DefaultDecider"/> (Restart
|
||||
/// semantics) and governs children only; this actor has no children today,
|
||||
/// so the override is a forward-compat placeholder. If it ever did fire,
|
||||
/// restart would reset the in-memory cursors — but as noted above that's
|
||||
/// a safe (over-pull, idempotent) recovery.
|
||||
/// </para>
|
||||
/// <para>
|
||||
|
||||
@@ -706,9 +706,13 @@ public class SqliteAuditWriter : IAuditWriter, ISiteAuditQueue, IAsyncDisposable
|
||||
lock (_writeLock)
|
||||
{
|
||||
if (_disposed) return;
|
||||
// Stop accepting new events. Setting _disposed first ensures any
|
||||
// FlushBatch entered after we mark disposed will fault its pending
|
||||
// events rather than touching the about-to-close connection.
|
||||
// Stop accepting new events. Completing the channel writer is the
|
||||
// shutdown signal: WriteAsync calls observe the completion and
|
||||
// fault, and the writer loop drains any already-buffered items
|
||||
// before exiting. _disposed is intentionally NOT set here — it
|
||||
// flips only after the loop has fully drained (second lock block
|
||||
// below), so FlushBatch's existing _disposed check guards the
|
||||
// post-drain window when the connection is about to close.
|
||||
_writeQueue.Writer.TryComplete();
|
||||
writerLoop = _writerLoop;
|
||||
}
|
||||
|
||||
@@ -27,9 +27,18 @@ public sealed class ClusterOptionsValidator : IValidateOptions<ClusterOptions>
|
||||
{
|
||||
var failures = new List<string>();
|
||||
|
||||
if (options.SeedNodes is null || options.SeedNodes.Count == 0)
|
||||
if (options.SeedNodes is null || options.SeedNodes.Count < 2)
|
||||
{
|
||||
failures.Add("ClusterOptions.SeedNodes must contain at least one seed node.");
|
||||
// CI-012: design doc states "both nodes are seed nodes — each node lists
|
||||
// both itself and its partner" so a properly-configured deployment lists
|
||||
// two. Accepting a single-seed configuration silently defeats the
|
||||
// "no startup ordering dependency" guarantee called out by
|
||||
// Component-ClusterInfrastructure.md (Node Configuration).
|
||||
failures.Add(
|
||||
"ClusterOptions.SeedNodes must contain at least 2 seed nodes "
|
||||
+ "(Component-ClusterInfrastructure.md → Node Configuration: "
|
||||
+ "both nodes are seed nodes); a single-seed configuration defeats "
|
||||
+ "the no-startup-ordering-dependency guarantee.");
|
||||
}
|
||||
|
||||
if (string.IsNullOrWhiteSpace(options.SplitBrainResolverStrategy)
|
||||
|
||||
@@ -8,12 +8,13 @@ namespace ScadaLink.Commons.Interfaces.Transport;
|
||||
/// Re-entrancy / thread-safety: mutating <see cref="BundleImportId"/> is NOT
|
||||
/// thread-safe. The service is registered scoped, and the assumed usage is a
|
||||
/// single Blazor Server circuit (or single API request) at a time — within that
|
||||
/// scope <see cref="BundleImporter.ApplyAsync"/> is the sole writer, and the
|
||||
/// audit service is the sole reader, in a strictly sequential await chain.
|
||||
/// Callers that perform concurrent imports within a shared scope (e.g. two
|
||||
/// <c>ApplyAsync</c> calls awaited via <c>Task.WhenAll</c> on the same circuit)
|
||||
/// MUST serialize access externally — there is no internal lock and the last
|
||||
/// writer wins, which would cross-contaminate audit rows between imports.
|
||||
/// scope <c>BundleImporter.ApplyAsync</c> (in the Transport component) is the
|
||||
/// sole writer, and the audit service is the sole reader, in a strictly
|
||||
/// sequential await chain. Callers that perform concurrent imports within a
|
||||
/// shared scope (e.g. two <c>ApplyAsync</c> calls awaited via
|
||||
/// <c>Task.WhenAll</c> on the same circuit) MUST serialize access externally —
|
||||
/// there is no internal lock and the last writer wins, which would
|
||||
/// cross-contaminate audit rows between imports.
|
||||
/// </para>
|
||||
/// </summary>
|
||||
public interface IAuditCorrelationContext
|
||||
|
||||
@@ -33,11 +33,19 @@ public record RouteToCallResponse(
|
||||
/// <summary>
|
||||
/// Request to read attribute(s) from a remote instance.
|
||||
/// </summary>
|
||||
/// <param name="ParentExecutionId">
|
||||
/// Audit Log #23 (ParentExecutionId): mirrors <see cref="RouteToCallRequest.ParentExecutionId"/>.
|
||||
/// For an inbound-API-routed read this is the inbound request's per-request execution id;
|
||||
/// future site-side audit emission for routed reads can stamp it as <c>ParentExecutionId</c>
|
||||
/// so the inbound→site execution-tree link survives the read path. Additive trailing
|
||||
/// member — null for the Central UI sandbox path or for callers built before the field existed.
|
||||
/// </param>
|
||||
public record RouteToGetAttributesRequest(
|
||||
string CorrelationId,
|
||||
string InstanceUniqueName,
|
||||
IReadOnlyList<string> AttributeNames,
|
||||
DateTimeOffset Timestamp);
|
||||
DateTimeOffset Timestamp,
|
||||
Guid? ParentExecutionId = null);
|
||||
|
||||
/// <summary>
|
||||
/// Response containing attribute values from a remote instance.
|
||||
@@ -52,11 +60,20 @@ public record RouteToGetAttributesResponse(
|
||||
/// <summary>
|
||||
/// Request to write attribute(s) on a remote instance.
|
||||
/// </summary>
|
||||
/// <param name="ParentExecutionId">
|
||||
/// Audit Log #23 (ParentExecutionId): mirrors <see cref="RouteToCallRequest.ParentExecutionId"/>.
|
||||
/// For an inbound-API-routed write this is the inbound request's per-request execution id;
|
||||
/// site-side audit emission for the underlying device / static-attribute write can stamp
|
||||
/// it as <c>ParentExecutionId</c> so the inbound→site execution-tree link survives the
|
||||
/// write path. Additive trailing member — null for the Central UI sandbox path or for
|
||||
/// callers built before the field existed.
|
||||
/// </param>
|
||||
public record RouteToSetAttributesRequest(
|
||||
string CorrelationId,
|
||||
string InstanceUniqueName,
|
||||
IReadOnlyDictionary<string, string> AttributeValues,
|
||||
DateTimeOffset Timestamp);
|
||||
DateTimeOffset Timestamp,
|
||||
Guid? ParentExecutionId = null);
|
||||
|
||||
/// <summary>
|
||||
/// Response confirming attribute writes on a remote instance.
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
using Akka.Actor;
|
||||
using Akka.Cluster;
|
||||
using Akka.Cluster.Tools.Client;
|
||||
using Akka.Event;
|
||||
using ScadaLink.Commons.Messages.Artifacts;
|
||||
@@ -27,6 +28,15 @@ public class SiteCommunicationActor : ReceiveActor, IWithTimers
|
||||
private readonly string _siteId;
|
||||
private readonly CommunicationOptions _options;
|
||||
|
||||
/// <summary>
|
||||
/// Communication-018: predicate that returns <c>true</c> when this node is
|
||||
/// the active member of the local site cluster (used to stamp
|
||||
/// <see cref="HeartbeatMessage.IsActive"/>). Production builds default to
|
||||
/// the Akka <see cref="Cluster"/> leader check; tests inject a stub so they
|
||||
/// do not need a real cluster.
|
||||
/// </summary>
|
||||
private readonly Func<bool> _isActiveCheck;
|
||||
|
||||
/// <summary>
|
||||
/// Reference to the local Deployment Manager singleton proxy.
|
||||
/// </summary>
|
||||
@@ -54,14 +64,23 @@ public class SiteCommunicationActor : ReceiveActor, IWithTimers
|
||||
/// <param name="siteId">The site identifier included in outbound messages.</param>
|
||||
/// <param name="options">Communication options including heartbeat interval and transport settings.</param>
|
||||
/// <param name="deploymentManagerProxy">Local reference to the Deployment Manager singleton proxy.</param>
|
||||
/// <param name="isActiveCheck">
|
||||
/// Communication-018: optional override returning <c>true</c> when this node
|
||||
/// is the active member of the site cluster. <c>null</c> uses the real
|
||||
/// Akka <see cref="Cluster"/> leader check (the default for production
|
||||
/// wiring); tests pass a stub so they do not need to load Akka.Cluster
|
||||
/// into the <c>TestKit</c> ActorSystem.
|
||||
/// </param>
|
||||
public SiteCommunicationActor(
|
||||
string siteId,
|
||||
CommunicationOptions options,
|
||||
IActorRef deploymentManagerProxy)
|
||||
IActorRef deploymentManagerProxy,
|
||||
Func<bool>? isActiveCheck = null)
|
||||
{
|
||||
_siteId = siteId;
|
||||
_options = options;
|
||||
_deploymentManagerProxy = deploymentManagerProxy;
|
||||
_isActiveCheck = isActiveCheck ?? DefaultIsActiveCheck;
|
||||
|
||||
// Registration
|
||||
Receive<RegisterCentralClient>(msg =>
|
||||
@@ -360,16 +379,60 @@ public class SiteCommunicationActor : ReceiveActor, IWithTimers
|
||||
return;
|
||||
|
||||
var hostname = Environment.MachineName;
|
||||
|
||||
// Communication-018: stamp HeartbeatMessage.IsActive with this node's
|
||||
// true active/standby role rather than hard-coding `true`. The field is
|
||||
// part of the wire contract (additive-only-evolution) so a future
|
||||
// central health dashboard can distinguish "active node down, standby
|
||||
// up" from "site fully offline" without a new message type.
|
||||
bool isActive;
|
||||
try
|
||||
{
|
||||
isActive = _isActiveCheck();
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
// Defensive: never let a cluster-state read failure abort the
|
||||
// heartbeat itself (heartbeats are health signal — their absence is
|
||||
// already meaningful). Fall back to the safest non-claiming value:
|
||||
// standby. Logged at Debug because this path normally only fires
|
||||
// during ActorSystem warm-up.
|
||||
_log.Debug(ex,
|
||||
"Active-node check threw while sending heartbeat for site {0}; reporting IsActive=false",
|
||||
_siteId);
|
||||
isActive = false;
|
||||
}
|
||||
|
||||
var heartbeat = new HeartbeatMessage(
|
||||
_siteId,
|
||||
hostname,
|
||||
IsActive: true,
|
||||
IsActive: isActive,
|
||||
DateTimeOffset.UtcNow);
|
||||
|
||||
_centralClient.Tell(
|
||||
new ClusterClient.Send("/user/central-communication", heartbeat), Self);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Communication-018: default active-node check used when no override is
|
||||
/// supplied. Mirrors <c>ActiveNodeGate</c> in the Host (and
|
||||
/// <c>ActiveNodeHealthCheck</c>): the node is the active member of the
|
||||
/// site cluster when it is the current cluster leader AND its own
|
||||
/// <see cref="MemberStatus"/> is <see cref="MemberStatus.Up"/>. Any other
|
||||
/// state (still joining, leaving, no leader yet) reports standby —
|
||||
/// safe-by-default, matching the standby case.
|
||||
/// </summary>
|
||||
private bool DefaultIsActiveCheck()
|
||||
{
|
||||
var cluster = Cluster.Get(Context.System);
|
||||
var self = cluster.SelfMember;
|
||||
if (self.Status != MemberStatus.Up)
|
||||
return false;
|
||||
|
||||
var leader = cluster.State.Leader;
|
||||
return leader != null && leader == self.Address;
|
||||
}
|
||||
|
||||
// ── Internal messages ──
|
||||
|
||||
internal record SendHeartbeat;
|
||||
|
||||
@@ -25,6 +25,11 @@ public class SiteStreamGrpcServer : SiteStreamService.SiteStreamServiceBase
|
||||
private readonly int _maxConcurrentStreams;
|
||||
private readonly TimeSpan _maxStreamLifetime;
|
||||
private volatile bool _ready;
|
||||
// Host-017 / REQ-HOST-7: flipped by CancelAllStreams() when the host enters
|
||||
// CoordinatedShutdown so SubscribeInstance refuses new streams with
|
||||
// Unavailable before the actor system tears down. Strictly monotonic — once
|
||||
// true, never reset (the server is single-lifetime per host).
|
||||
private volatile bool _shuttingDown;
|
||||
private long _actorCounter;
|
||||
// Audit Log (#23 M2): central-side ingest actor proxy. Set by the host
|
||||
// after the cluster singleton starts (see Bundle E wiring). When null the
|
||||
@@ -131,6 +136,40 @@ public class SiteStreamGrpcServer : SiteStreamService.SiteStreamServiceBase
|
||||
_siteAuditQueue = queue;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Host-017 / REQ-HOST-7: signals the gRPC server to begin its part of the
|
||||
/// site shutdown sequence — refuse new <see cref="SubscribeInstance"/>
|
||||
/// streams with <see cref="StatusCode.Unavailable"/> and cancel every
|
||||
/// active stream so its <c>await foreach</c> observes
|
||||
/// <see cref="OperationCanceledException"/> and the response stream
|
||||
/// completes with <c>Cancelled</c> on the client. Idempotent — safe to call
|
||||
/// more than once. Invoked from the site host's
|
||||
/// <c>IHostApplicationLifetime.ApplicationStopping</c> callback BEFORE
|
||||
/// Akka's <c>CoordinatedShutdown</c> runs, so in-flight clients get a
|
||||
/// clean cancellation they can reconnect on rather than a silent stream
|
||||
/// that only times out via gRPC keepalive.
|
||||
/// </summary>
|
||||
public void CancelAllStreams()
|
||||
{
|
||||
_shuttingDown = true;
|
||||
foreach (var entry in _activeStreams.Values)
|
||||
{
|
||||
try
|
||||
{
|
||||
entry.Cts.Cancel();
|
||||
}
|
||||
catch (ObjectDisposedException)
|
||||
{
|
||||
// Already cleaned up by its own finally — nothing to do.
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Host-017: exposed for test assertions on the shutdown state.
|
||||
/// </summary>
|
||||
internal bool IsShuttingDown => _shuttingDown;
|
||||
|
||||
/// <summary>
|
||||
/// Number of currently active streaming subscriptions. Exposed for diagnostics.
|
||||
/// </summary>
|
||||
@@ -151,6 +190,11 @@ public class SiteStreamGrpcServer : SiteStreamService.SiteStreamServiceBase
|
||||
if (!_ready)
|
||||
throw new RpcException(new GrpcStatus(StatusCode.Unavailable, "Server not ready"));
|
||||
|
||||
// Host-017 / REQ-HOST-7: refuse new subscriptions during shutdown so
|
||||
// CoordinatedShutdown can quiesce without racing fresh streams.
|
||||
if (_shuttingDown)
|
||||
throw new RpcException(new GrpcStatus(StatusCode.Unavailable, "Server shutting down"));
|
||||
|
||||
// Communication-014: correlation_id arrives off the wire on a public gRPC
|
||||
// endpoint and is used (below) to compose an Akka actor name. Akka actor names
|
||||
// have a restricted character set — a id containing '/', whitespace, or other
|
||||
|
||||
@@ -6,11 +6,12 @@ using ScadaLink.Commons.Interfaces.Repositories;
|
||||
namespace ScadaLink.ConfigurationDatabase.Repositories;
|
||||
|
||||
/// <summary>
|
||||
/// EF Core implementation of IDeploymentManagerRepository.
|
||||
/// Provides storage/query of deployed configuration snapshots per instance,
|
||||
/// current deployment status, and optimistic concurrency on deployment status records.
|
||||
///
|
||||
/// WP-24: Stub level sufficient for diff/staleness support.
|
||||
/// EF Core implementation of <see cref="IDeploymentManagerRepository"/> covering
|
||||
/// the deployment pipeline's persistence surface: <c>DeploymentRecord</c> CRUD
|
||||
/// (with optimistic concurrency via <c>DeploymentRecord.RowVersion</c>),
|
||||
/// <c>SystemArtifactDeploymentRecord</c> CRUD, <c>DeployedConfigSnapshot</c> CRUD,
|
||||
/// and a Restrict-FK-aware <see cref="DeleteInstanceAsync"/> that explicitly
|
||||
/// clears dependent deployment-record rows before removing an instance.
|
||||
/// </summary>
|
||||
public class DeploymentManagerRepository : IDeploymentManagerRepository
|
||||
{
|
||||
|
||||
@@ -170,7 +170,7 @@ public class DeploymentService
|
||||
// Idempotency"). A clean prior Success or a fresh first-time deploy
|
||||
// skips this extra round-trip.
|
||||
var reconciled = await TryReconcileWithSiteAsync(
|
||||
instance, revisionHash, configJson, cancellationToken);
|
||||
instance, revisionHash, configJson, user, cancellationToken);
|
||||
if (reconciled != null)
|
||||
return Result<DeploymentRecord>.Success(reconciled);
|
||||
|
||||
@@ -617,6 +617,7 @@ public class DeploymentService
|
||||
Instance instance,
|
||||
string targetRevisionHash,
|
||||
string configJson,
|
||||
string currentUser,
|
||||
CancellationToken cancellationToken)
|
||||
{
|
||||
var prior = await _repository.GetCurrentDeploymentStatusAsync(instance.Id, cancellationToken);
|
||||
@@ -695,9 +696,19 @@ public class DeploymentService
|
||||
instance, prior.DeploymentId, targetRevisionHash, configJson,
|
||||
forceEnabledState: false, cancellationToken);
|
||||
|
||||
await _auditService.LogAsync(prior.DeployedBy, "DeployReconciled", "Instance",
|
||||
// DeploymentManager-020: attribute the audit row to the user driving
|
||||
// THIS redeploy (the caller of DeployInstanceAsync), not the user
|
||||
// who issued the original timed-out / stuck deployment. The original
|
||||
// deployer is preserved in the detail object so forensics can still
|
||||
// see who launched the run that reconciliation rescued.
|
||||
await _auditService.LogAsync(currentUser, "DeployReconciled", "Instance",
|
||||
instance.Id.ToString(), instance.UniqueName,
|
||||
new { DeploymentId = prior.DeploymentId, RevisionHash = targetRevisionHash },
|
||||
new
|
||||
{
|
||||
DeploymentId = prior.DeploymentId,
|
||||
RevisionHash = targetRevisionHash,
|
||||
OriginalDeployer = prior.DeployedBy
|
||||
},
|
||||
cancellationToken);
|
||||
|
||||
return prior;
|
||||
|
||||
@@ -271,6 +271,18 @@ try
|
||||
// Map gRPC service — resolves the singleton SiteStreamGrpcServer from DI
|
||||
app.MapGrpcService<ScadaLink.Communication.Grpc.SiteStreamGrpcServer>();
|
||||
|
||||
// Host-017 / REQ-HOST-7: site-shutdown ordering. ApplicationStopping
|
||||
// fires BEFORE IHostedService.StopAsync runs, so the gRPC server
|
||||
// refuses new streams (Unavailable) and cancels every active stream
|
||||
// here — clients observe a clean Cancelled and reconnect — and only
|
||||
// THEN does AkkaHostedService run CoordinatedShutdown and tear down
|
||||
// actors. Without this hand-off, in-flight streams go silent and only
|
||||
// time out via gRPC keepalive (~25 s), violating the documented
|
||||
// four-step sequence.
|
||||
var siteLifetime = app.Services.GetRequiredService<Microsoft.Extensions.Hosting.IHostApplicationLifetime>();
|
||||
var siteGrpcServer = app.Services.GetRequiredService<ScadaLink.Communication.Grpc.SiteStreamGrpcServer>();
|
||||
siteLifetime.ApplicationStopping.Register(() => siteGrpcServer.CancelAllStreams());
|
||||
|
||||
await app.RunAsync();
|
||||
}
|
||||
else
|
||||
|
||||
@@ -179,8 +179,14 @@ public class RouteTarget
|
||||
var siteId = await ResolveSiteAsync(token);
|
||||
var correlationId = Guid.NewGuid().ToString();
|
||||
|
||||
// Audit Log #23 (ParentExecutionId): mirrors the Call path — stamp the
|
||||
// spawning inbound request's ExecutionId so future site-side audit
|
||||
// emission for routed reads can record this read's parent. Symmetric
|
||||
// with RouteToCallRequest so script authors get the same correlation
|
||||
// across Call / GetAttributes / SetAttributes.
|
||||
var request = new RouteToGetAttributesRequest(
|
||||
correlationId, _instanceCode, attributeNames.ToList(), DateTimeOffset.UtcNow);
|
||||
correlationId, _instanceCode, attributeNames.ToList(), DateTimeOffset.UtcNow,
|
||||
_parentExecutionId);
|
||||
|
||||
var response = await _instanceRouter.RouteToGetAttributesAsync(siteId, request, token);
|
||||
|
||||
@@ -222,8 +228,14 @@ public class RouteTarget
|
||||
var siteId = await ResolveSiteAsync(token);
|
||||
var correlationId = Guid.NewGuid().ToString();
|
||||
|
||||
// Audit Log #23 (ParentExecutionId): mirrors the Call path — stamp the
|
||||
// spawning inbound request's ExecutionId so future site-side audit
|
||||
// emission for routed writes can record this write's parent. Symmetric
|
||||
// with RouteToCallRequest so script authors get the same correlation
|
||||
// across Call / GetAttributes / SetAttributes.
|
||||
var request = new RouteToSetAttributesRequest(
|
||||
correlationId, _instanceCode, attributeValues, DateTimeOffset.UtcNow);
|
||||
correlationId, _instanceCode, attributeValues, DateTimeOffset.UtcNow,
|
||||
_parentExecutionId);
|
||||
|
||||
var response = await _instanceRouter.RouteToSetAttributesAsync(siteId, request, token);
|
||||
|
||||
|
||||
@@ -6,21 +6,46 @@ namespace ScadaLink.NotificationOutbox;
|
||||
/// </summary>
|
||||
public class NotificationOutboxOptions
|
||||
{
|
||||
/// <summary>Interval between dispatch sweeps that pick up pending notifications for delivery.</summary>
|
||||
/// <summary>
|
||||
/// Interval between dispatch sweeps that pick up pending notifications for delivery.
|
||||
/// Default: 10 seconds.
|
||||
/// </summary>
|
||||
public TimeSpan DispatchInterval { get; set; } = TimeSpan.FromSeconds(10);
|
||||
|
||||
/// <summary>Maximum number of notifications claimed for delivery in a single dispatch sweep.</summary>
|
||||
/// <summary>
|
||||
/// Maximum number of notifications claimed for delivery in a single dispatch sweep.
|
||||
/// Caps per-sweep batch size to bound memory and per-iteration latency.
|
||||
/// Default: 100.
|
||||
/// </summary>
|
||||
public int DispatchBatchSize { get; set; } = 100;
|
||||
|
||||
/// <summary>Age past which an in-progress notification is considered stuck and re-claimed.</summary>
|
||||
/// <summary>
|
||||
/// Age past which a still-<c>Pending</c>/<c>Retrying</c> notification is counted as
|
||||
/// "stuck" on the KPI tile and the per-row badge in the Central UI.
|
||||
/// Display-only: rows older than this threshold are flagged in KPIs/UI; there is no
|
||||
/// automatic re-claim, requeue, or escalation — the dispatcher behaviour is unaffected.
|
||||
/// Default: 10 minutes.
|
||||
/// </summary>
|
||||
public TimeSpan StuckAgeThreshold { get; set; } = TimeSpan.FromMinutes(10);
|
||||
|
||||
/// <summary>Retention period for notifications in a terminal state before they are purged.</summary>
|
||||
/// <summary>
|
||||
/// Retention period for notifications in a terminal state (<c>Delivered</c>,
|
||||
/// <c>Parked</c>, <c>Discarded</c>) before they are purged from the Notifications table.
|
||||
/// Default: 365 days.
|
||||
/// </summary>
|
||||
public TimeSpan TerminalRetention { get; set; } = TimeSpan.FromDays(365);
|
||||
|
||||
/// <summary>Interval between background purge sweeps of terminal notifications.</summary>
|
||||
/// <summary>
|
||||
/// Interval between background purge sweeps of terminal notifications older than
|
||||
/// <see cref="TerminalRetention"/>.
|
||||
/// Default: 1 day.
|
||||
/// </summary>
|
||||
public TimeSpan PurgeInterval { get; set; } = TimeSpan.FromDays(1);
|
||||
|
||||
/// <summary>Trailing window used to compute the delivered-notifications throughput KPI.</summary>
|
||||
/// <summary>
|
||||
/// Trailing window used to compute the "delivered (last interval)" throughput KPI
|
||||
/// surfaced on the Health dashboard and the Notification Outbox page.
|
||||
/// Default: 1 minute.
|
||||
/// </summary>
|
||||
public TimeSpan DeliveredKpiWindow { get; set; } = TimeSpan.FromMinutes(1);
|
||||
}
|
||||
|
||||
@@ -23,10 +23,14 @@ namespace ScadaLink.SiteCallAudit;
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// <para>
|
||||
/// Query, detail and KPIs (Task 4) and the central→site Retry/Discard relay
|
||||
/// (Task 5 — the relay handlers live in this actor) are implemented; only
|
||||
/// reconciliation remains deferred (per CLAUDE.md scope discipline — it lands
|
||||
/// in a later follow-up).
|
||||
/// Implemented: direct <see cref="UpsertSiteCallCommand"/> telemetry ingest,
|
||||
/// query, detail and KPI handlers (Task 4), and the central→site Retry/Discard
|
||||
/// relay (Task 5 — the relay handlers live in this actor). Deferred (per
|
||||
/// CLAUDE.md scope discipline — both land in a later follow-up): the periodic
|
||||
/// per-site reconciliation puller that backfills lost telemetry, and the daily
|
||||
/// terminal-row purge scheduler (the repository exposes
|
||||
/// <c>PurgeTerminalAsync</c> but nothing in this module currently invokes it
|
||||
/// on a schedule).
|
||||
/// </para>
|
||||
/// <para>
|
||||
/// Per CLAUDE.md "audit-write failure NEVER aborts the user-facing action" —
|
||||
@@ -556,6 +560,13 @@ public class SiteCallAuditActor : ReceiveActor
|
||||
// SiteUnreachable is never produced from a ParkedOperationActionAck —
|
||||
// unreachable responses are built by UnreachableRetry/UnreachableDiscard
|
||||
// before any ack is classified, so this arm is unreachable by construction.
|
||||
// We deliberately return ack.ErrorMessage (rather than throwing) to keep
|
||||
// AckErrorMessage total and side-effect-free: site-unreachable is classified
|
||||
// as transient by the upstream relay path (which has already constructed the
|
||||
// SiteUnreachable response and detail text via SiteUnreachableMessage), so a
|
||||
// defensive fall-through here just surfaces whatever error text the ack
|
||||
// carries and lets the caller schedule a retry. Throwing would turn a benign
|
||||
// refactor invariant violation into a relay-path crash.
|
||||
SiteCallRelayOutcome.SiteUnreachable => ack.ErrorMessage,
|
||||
_ => throw new ArgumentOutOfRangeException(
|
||||
nameof(outcome), outcome, "unknown SiteCallRelayOutcome"),
|
||||
|
||||
@@ -4,6 +4,21 @@ using Microsoft.Extensions.Options;
|
||||
|
||||
namespace ScadaLink.SiteEventLogging;
|
||||
|
||||
/// <summary>
|
||||
/// SiteEventLogging-019: predicate the <see cref="EventLogPurgeService"/>
|
||||
/// consults at the top of every purge tick to decide whether THIS node should
|
||||
/// run the daily purge. The design states "a daily background job runs on the
|
||||
/// active node and deletes all events older than 30 days"; the standby's local
|
||||
/// SQLite receives no writes, so purging there is harmless but unnecessary —
|
||||
/// and silently doing it anyway diverges from the design.
|
||||
///
|
||||
/// Registration is the Host's responsibility (it knows the cluster topology);
|
||||
/// when no implementation is registered the purge service defaults to "always
|
||||
/// active" so non-clustered hosts and unit tests are unaffected — backward
|
||||
/// compatible with the prior "run on every host" behaviour.
|
||||
/// </summary>
|
||||
public delegate bool SiteEventLogActiveNodeCheck();
|
||||
|
||||
/// <summary>
|
||||
/// Background service that periodically purges old events from the SQLite event log.
|
||||
/// Enforces both time-based retention (default 30 days) and storage cap (default 1GB).
|
||||
@@ -17,15 +32,24 @@ public class EventLogPurgeService : BackgroundService
|
||||
private readonly SiteEventLogger _eventLogger;
|
||||
private readonly SiteEventLogOptions _options;
|
||||
private readonly ILogger<EventLogPurgeService> _logger;
|
||||
private readonly SiteEventLogActiveNodeCheck _isActiveNode;
|
||||
|
||||
/// <summary>Initializes a new instance of <see cref="EventLogPurgeService"/>.</summary>
|
||||
/// <param name="eventLogger">The concrete event logger providing lock-guarded database access.</param>
|
||||
/// <param name="options">Site event log options (retention days, storage cap, purge interval).</param>
|
||||
/// <param name="logger">Logger instance.</param>
|
||||
/// <param name="isActiveNode">
|
||||
/// SiteEventLogging-019: optional active-node check. When <c>null</c>, the
|
||||
/// service runs the purge on every tick (preserves the pre-fix behaviour
|
||||
/// for non-clustered hosts and existing tests). When supplied — e.g. by
|
||||
/// the Host on a site node — each tick early-exits on the standby so the
|
||||
/// daily purge runs only on the active node, matching the design.
|
||||
/// </param>
|
||||
public EventLogPurgeService(
|
||||
SiteEventLogger eventLogger,
|
||||
IOptions<SiteEventLogOptions> options,
|
||||
ILogger<EventLogPurgeService> logger)
|
||||
ILogger<EventLogPurgeService> logger,
|
||||
SiteEventLogActiveNodeCheck? isActiveNode = null)
|
||||
{
|
||||
// Depend on the concrete recorder directly: purge must funnel database access
|
||||
// through its lock-guarded WithConnection. Taking ISiteEventLogger and
|
||||
@@ -33,6 +57,7 @@ public class EventLogPurgeService : BackgroundService
|
||||
_eventLogger = eventLogger;
|
||||
_options = options.Value;
|
||||
_logger = logger;
|
||||
_isActiveNode = isActiveNode ?? (static () => true);
|
||||
}
|
||||
|
||||
/// <inheritdoc />
|
||||
@@ -58,6 +83,31 @@ public class EventLogPurgeService : BackgroundService
|
||||
{
|
||||
try
|
||||
{
|
||||
// SiteEventLogging-019: gate every tick on the active-node check.
|
||||
// The standby's local SQLite receives no writes, so purging there
|
||||
// is harmless but unnecessary; the design (Component-SiteEventLogging
|
||||
// → Storage) explicitly states the purge runs on the active node.
|
||||
// Defensive try/catch: a transient cluster-state read failure must
|
||||
// not stop the purge loop — fall back to running the purge (the
|
||||
// pre-fix behaviour was "always run", which is harmless on standby).
|
||||
bool isActive;
|
||||
try
|
||||
{
|
||||
isActive = _isActiveNode();
|
||||
}
|
||||
catch (Exception checkEx)
|
||||
{
|
||||
_logger.LogDebug(checkEx,
|
||||
"Active-node check threw during purge tick; running purge to be safe");
|
||||
isActive = true;
|
||||
}
|
||||
|
||||
if (!isActive)
|
||||
{
|
||||
_logger.LogDebug("Skipping event log purge tick — this node is not the active site member");
|
||||
return;
|
||||
}
|
||||
|
||||
PurgeByRetention();
|
||||
PurgeByStorageCap();
|
||||
}
|
||||
|
||||
@@ -1,4 +1,7 @@
|
||||
using Microsoft.Extensions.DependencyInjection;
|
||||
using Microsoft.Extensions.Hosting;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using Microsoft.Extensions.Options;
|
||||
|
||||
namespace ScadaLink.SiteEventLogging;
|
||||
|
||||
@@ -18,7 +21,19 @@ public static class ServiceCollectionExtensions
|
||||
services.AddSingleton<SiteEventLogger>();
|
||||
services.AddSingleton<ISiteEventLogger>(sp => sp.GetRequiredService<SiteEventLogger>());
|
||||
services.AddSingleton<IEventLogQueryService, EventLogQueryService>();
|
||||
services.AddHostedService<EventLogPurgeService>();
|
||||
|
||||
// SiteEventLogging-019: the purge service still registers on every host
|
||||
// node, but it consults an optional SiteEventLogActiveNodeCheck on each
|
||||
// tick and early-exits on the standby. The Host registers the real
|
||||
// active-node check on site nodes; tests and non-clustered hosts leave
|
||||
// it unregistered, and the purge defaults to "always run" (the
|
||||
// pre-fix behaviour). Building the service via a factory so the
|
||||
// optional delegate flows from DI rather than the constructor default.
|
||||
services.AddHostedService(sp => new EventLogPurgeService(
|
||||
sp.GetRequiredService<SiteEventLogger>(),
|
||||
sp.GetRequiredService<IOptions<SiteEventLogOptions>>(),
|
||||
sp.GetRequiredService<ILogger<EventLogPurgeService>>(),
|
||||
sp.GetService<SiteEventLogActiveNodeCheck>()));
|
||||
return services;
|
||||
}
|
||||
|
||||
|
||||
@@ -3,32 +3,40 @@ using ScadaLink.StoreAndForward;
|
||||
|
||||
namespace ScadaLink.SiteRuntime.Messages;
|
||||
|
||||
/// <summary>
|
||||
/// Outbound messages — sent by local DeploymentManagerActor/S&F service
|
||||
/// to the local SiteReplicationActor for forwarding to the peer node.
|
||||
/// </summary>
|
||||
// Outbound messages — sent by local DeploymentManagerActor/S&F service
|
||||
// to the local SiteReplicationActor for forwarding to the peer node.
|
||||
|
||||
/// <summary>Outbound: replicate a deployed instance config (create or update) to the peer node.</summary>
|
||||
public record ReplicateConfigDeploy(
|
||||
string InstanceName, string ConfigJson, string DeploymentId, string RevisionHash, bool IsEnabled);
|
||||
|
||||
/// <summary>Outbound: replicate removal of a deployed instance config to the peer node.</summary>
|
||||
public record ReplicateConfigRemove(string InstanceName);
|
||||
|
||||
/// <summary>Outbound: replicate an instance enabled/disabled flag change to the peer node.</summary>
|
||||
public record ReplicateConfigSetEnabled(string InstanceName, bool IsEnabled);
|
||||
|
||||
/// <summary>Outbound: replicate a system-wide artifact deployment (shared scripts, external systems, etc.) to the peer node.</summary>
|
||||
public record ReplicateArtifacts(DeployArtifactsCommand Command);
|
||||
|
||||
/// <summary>Outbound: replicate a store-and-forward buffer mutation (enqueue/dequeue/park/etc.) to the peer node.</summary>
|
||||
public record ReplicateStoreAndForward(ReplicationOperation Operation);
|
||||
|
||||
/// <summary>
|
||||
/// Inbound messages — received from the peer's SiteReplicationActor
|
||||
/// and applied to local SQLite storage.
|
||||
/// </summary>
|
||||
// Inbound messages — received from the peer's SiteReplicationActor
|
||||
// and applied to local SQLite storage.
|
||||
|
||||
/// <summary>Inbound: apply a peer-replicated instance config (create or update) to local SQLite.</summary>
|
||||
public record ApplyConfigDeploy(
|
||||
string InstanceName, string ConfigJson, string DeploymentId, string RevisionHash, bool IsEnabled);
|
||||
|
||||
/// <summary>Inbound: apply peer-replicated removal of a deployed instance config to local SQLite.</summary>
|
||||
public record ApplyConfigRemove(string InstanceName);
|
||||
|
||||
/// <summary>Inbound: apply a peer-replicated instance enabled/disabled flag change to local SQLite.</summary>
|
||||
public record ApplyConfigSetEnabled(string InstanceName, bool IsEnabled);
|
||||
|
||||
/// <summary>Inbound: apply a peer-replicated system-wide artifact deployment to local SQLite.</summary>
|
||||
public record ApplyArtifacts(DeployArtifactsCommand Command);
|
||||
|
||||
/// <summary>Inbound: apply a peer-replicated store-and-forward buffer mutation to the local buffer.</summary>
|
||||
public record ApplyStoreAndForward(ReplicationOperation Operation);
|
||||
|
||||
Reference in New Issue
Block a user