docs+code: close Theme 1 — 24 design-doc / XML-doc drift findings

Doc/XML-comment drift + small adherence fixes across 17 modules. Highlights:
- Host-017: site CoordinatedShutdown ordering — SiteStreamGrpcServer gains
  CancelAllStreams() (refuse new streams, cancel active), wired into
  Program.cs site branch via ApplicationStopping.
- InboundAPI-021: ParentExecutionId now travels on RouteToGet/SetAttributes
  symmetric with RouteToCallRequest; RouteHelper stamps from _parentExecutionId.
- ClusterInfra-012: ClusterOptionsValidator now requires both seed nodes.
- Comm-018: SiteCommunicationActor.HeartbeatMessage.IsActive derived from
  cluster leader check (was hardcoded true).
- DM-020: reconciliation audit row attributes the current user, not prior deployer.
- SEL-019: EventLogPurgeService early-exits on standby via active-node check.
- Plus comment/XML-doc accuracy fixes across AuditLog, ConfigurationDatabase,
  NotificationOutbox, SiteRuntime, SiteCallAudit; doc refreshes for Component-
  Commons / -ManagementService / -CLI / -ExternalSystemGateway / -HealthMonitoring
  / -Transport / -ConfigurationDatabase; CD-023 index-name doc alignment.

12 new regression tests (RouteHelper x4, SiteStreamGrpcServer x2,
ClusterOptionsValidator x1, SiteCommunicationActor x1, DeploymentService x1,
EventLogPurgeService x3). Build clean (0 warnings); InboundAPI/Communication/
Host suites all green. README regenerated: 112 open (was 136).
This commit is contained in:
Joseph Doherty
2026-05-28 06:28:31 -04:00
parent e3ca9af1be
commit 487859bff0
51 changed files with 940 additions and 188 deletions
@@ -69,6 +69,23 @@ public class ClusterOptionsValidatorTests
Assert.Contains("SeedNodes", result.FailureMessage);
}
[Fact]
public void SingleSeedNode_FailsValidation()
{
    // CI-012: the design doc states that "both nodes are seed nodes". A
    // configuration with only one seed breaks the guarantee that nodes can
    // start in any order, so the validator that owns the options contract
    // has to reject it itself rather than relying on the Host's startup
    // validator alone.
    var singleSeedOptions = ValidOptions();
    singleSeedOptions.SeedNodes = new List<string> { "akka.tcp://scadalink@node1:8081" };

    var validator = new ClusterOptionsValidator();
    var outcome = validator.Validate(null, singleSeedOptions);

    // The failure must name the offending setting and mention the count "2".
    Assert.True(outcome.Failed);
    Assert.Contains("SeedNodes", outcome.FailureMessage);
    Assert.Contains("2", outcome.FailureMessage);
}
[Fact]
public void HeartbeatNotBelowFailureThreshold_FailsValidation()
{
@@ -138,6 +138,63 @@ public class SiteStreamGrpcServerTests : TestKit
Assert.Equal(0, server.ActiveStreamCount);
}
// --- Host-017 / REQ-HOST-7: site-shutdown ordering ---
[Fact]
public async Task Host017_CancelAllStreams_CancelsActiveStreamsAndRefusesNewOnes()
{
    // REQ-HOST-7 step (1)+(2): on CoordinatedShutdown the gRPC server must
    // stop accepting new streams AND cancel every active stream so the
    // client observes a clean Cancelled (not a silent stream that only
    // times out via keepalive). Program.cs registers
    // ApplicationStopping → CancelAllStreams(); this test exercises the
    // server-side guarantee in isolation.
    var server = CreateServer();
    server.SetReady(Sys);

    // The token source is owned by this test, so dispose it when the test
    // ends (pass or fail) instead of leaking it. The test never cancels it
    // itself: the stream must end because of CancelAllStreams().
    using var cts1 = new CancellationTokenSource();
    var context1 = CreateMockContext(cts1.Token);
    var writer1 = Substitute.For<IServerStreamWriter<SiteStreamEvent>>();

    // Run the subscription in the background so the test can drive the
    // shutdown while the stream is still active.
    var stream1Task = Task.Run(() => server.SubscribeInstance(
        MakeRequest("corr-shutdown-1"), writer1, context1));
    await WaitForConditionAsync(() => server.ActiveStreamCount == 1);

    // Begin shutdown — flip the flag AND cancel the active stream.
    server.CancelAllStreams();
    Assert.True(server.IsShuttingDown);

    // Active stream's await foreach observes OCE and falls through finally
    // → entry is removed from _activeStreams.
    await stream1Task;
    Assert.Equal(0, server.ActiveStreamCount);

    // A second SubscribeInstance after shutdown is refused immediately
    // with Unavailable rather than allowed to register a new stream.
    var writer2 = Substitute.For<IServerStreamWriter<SiteStreamEvent>>();
    var context2 = CreateMockContext();
    var ex = await Assert.ThrowsAsync<RpcException>(
        () => server.SubscribeInstance(MakeRequest("corr-shutdown-2"), writer2, context2));
    Assert.Equal(StatusCode.Unavailable, ex.StatusCode);
    Assert.Contains("shutting", ex.Status.Detail, StringComparison.OrdinalIgnoreCase);
}
[Fact]
public void Host017_CancelAllStreams_IsIdempotent()
{
    // A shutdown sequence can fire more than once; calling CancelAllStreams
    // again must be harmless and must not throw.
    var grpcServer = CreateServer();
    grpcServer.SetReady(Sys);

    for (var attempt = 0; attempt < 2; attempt++)
    {
        grpcServer.CancelAllStreams();
    }

    Assert.True(grpcServer.IsShuttingDown);
    Assert.Equal(0, grpcServer.ActiveStreamCount);
}
[Fact]
public async Task SubscribesAndRemovesFromStreamManager()
{
@@ -2,6 +2,7 @@ using Akka.Actor;
using Akka.Cluster.Tools.Client;
using Akka.TestKit.Xunit2;
using ScadaLink.Commons.Messages.Deployment;
using ScadaLink.Commons.Messages.Health;
using ScadaLink.Commons.Messages.Lifecycle;
using ScadaLink.Commons.Messages.Integration;
using ScadaLink.Commons.Messages.Notification;
@@ -282,4 +283,35 @@ public class SiteCommunicationActorTests : TestKit
Assert.False(ack.Applied);
Assert.NotNull(ack.ErrorMessage);
}
// ── Communication-018: heartbeat IsActive reflects this node's cluster role ──
[Theory]
[InlineData(true)]
[InlineData(false)]
public void Heartbeat_StampsIsActive_FromInjectedCheck(bool isActive)
{
    // Communication-018: the IsActive flag on HeartbeatMessage has to mirror
    // whether this node is currently active or standby instead of always
    // being `true`. The actor accepts a Func<bool> override for that check
    // (production falls back to a real Akka.Cluster leader check), so the
    // test supplies a stub and avoids standing up a cluster inside the
    // TestKit ActorSystem.
    var deploymentManagerProbe = CreateTestProbe();
    var centralProbe = CreateTestProbe();

    // A short heartbeat interval keeps the wait for the first heartbeat brief.
    var options = new CommunicationOptions
    {
        TransportHeartbeatInterval = TimeSpan.FromMilliseconds(50)
    };

    var actorProps = Props.Create(() =>
        new SiteCommunicationActor("site1", options, deploymentManagerProbe.Ref, () => isActive));
    var actorUnderTest = Sys.ActorOf(actorProps);
    actorUnderTest.Tell(new RegisterCentralClient(centralProbe.Ref));

    // Skip any other traffic and pick out the first heartbeat sent to central.
    var envelope = centralProbe.FishForMessage<ClusterClient.Send>(
        candidate => candidate.Message is HeartbeatMessage,
        TimeSpan.FromSeconds(3));

    var sentHeartbeat = Assert.IsType<HeartbeatMessage>(envelope.Message);
    Assert.Equal(isActive, sentHeartbeat.IsActive);
    Assert.Equal("site1", sentHeartbeat.SiteId);
}
}
@@ -909,6 +909,50 @@ public class DeploymentServiceTests : TestKit
Assert.Equal("sha256:target", prior.RevisionHash);
}
// ── DeploymentManager-020: reconciliation audit attributes to the CURRENT user, not the prior deployer ──
[Fact]
public async Task DeployInstanceAsync_Reconciled_AuditAttributesCurrentUserNotPriorDeployer()
{
    // DeploymentManager-020: when a redeploy reconciles a prior record that
    // timed out, the audit entry belongs to the user performing THIS
    // redeploy, not to whoever issued the deployment being reconciled. The
    // original deployer is meant to survive in the detail object so that
    // forensics can still tell who started the rescued run.
    var targetInstance = new Instance("ReconcileAuditUser")
    {
        Id = 73,
        SiteId = 1,
        State = InstanceState.NotDeployed
    };
    _repo.GetInstanceByIdAsync(73, Arg.Any<CancellationToken>()).Returns(targetInstance);
    SetupValidPipeline(73, "ReconcileAuditUser", "sha256:target");

    // The earlier deployment is still InProgress and was issued by a different user.
    var stalePrior = new DeploymentRecord("dep-prior-73", "originalUser")
    {
        InstanceId = 73,
        Status = DeploymentStatus.InProgress,
        RevisionHash = "sha256:target"
    };
    _repo.GetCurrentDeploymentStatusAsync(73, Arg.Any<CancellationToken>()).Returns(stalePrior);
    _repo.GetDeployedSnapshotByInstanceIdAsync(73, Arg.Any<CancellationToken>())
        .Returns((DeployedConfigSnapshot?)null);

    // The probe actor is configured with the same hash as the target revision.
    var probeProps = Props.Create(() =>
        new ReconcileProbeActor(siteHash: "sha256:target", failQuery: false));
    var service = CreateServiceWithCommActor(Sys.ActorOf(probeProps));

    var outcome = await service.DeployInstanceAsync(73, "currentUser");

    Assert.True(outcome.IsSuccess);

    // DeploymentManager-020: the audit row names the current user as actor...
    await _audit.Received().LogAsync(
        "currentUser", "DeployReconciled", "Instance", "73", "ReconcileAuditUser",
        Arg.Any<object>(), Arg.Any<CancellationToken>());

    // ...and no row names the prior deployer as actor.
    await _audit.DidNotReceive().LogAsync(
        "originalUser", "DeployReconciled", "Instance", "73", "ReconcileAuditUser",
        Arg.Any<object>(), Arg.Any<CancellationToken>());
}
// ── DeploymentManager-012: LifecycleCommandTimeout must actually bound lifecycle commands ──
[Fact]
@@ -115,7 +115,7 @@ public class SiteHealthCollectorTests
}
[Fact]
public void StoreAndForwardBufferDepths_IsEmptyPlaceholder()
public void StoreAndForwardBufferDepths_DefaultsToEmpty_WhenSetterNotCalled()
{
var report = _collector.CollectReport("site-1");
Assert.Empty(report.StoreAndForwardBufferDepths);
@@ -330,4 +330,75 @@ public class RouteHelperTests
Assert.Equal(deadline.Token, seen);
}
// --- InboundAPI-021: ParentExecutionId flows through Get/SetAttributes too ---
[Fact]
public async Task GetAttributes_WithoutParentExecutionId_LeavesParentExecutionIdNull()
{
    // A helper that was never bound to an inbound execution id must route
    // the GetAttributes request with ParentExecutionId left unset.
    SiteResolves("inst-1", "SiteA");

    RouteToGetAttributesRequest? routedRequest = null;
    _router
        .RouteToGetAttributesAsync(
            "SiteA",
            Arg.Do<RouteToGetAttributesRequest>(request => routedRequest = request),
            Arg.Any<CancellationToken>())
        .Returns(call => new RouteToGetAttributesResponse(
            call.ArgAt<RouteToGetAttributesRequest>(1).CorrelationId,
            new Dictionary<string, object?>(),
            true,
            null,
            DateTimeOffset.UtcNow));

    var unboundHelper = CreateHelper();
    await unboundHelper.To("inst-1").GetAttributes(new[] { "a" });

    Assert.NotNull(routedRequest);
    Assert.Null(routedRequest!.ParentExecutionId);
}
[Fact]
public async Task GetAttributes_WithParentExecutionId_CarriesItOnRouteToGetAttributesRequest()
{
    // Mirrors the Call path: once a RouteHelper is bound to the inbound
    // request's ExecutionId, that id is stamped onto the routed
    // GetAttributes request, so that future site-side audit can record the
    // inbound→site link.
    SiteResolves("inst-1", "SiteA");
    var parentId = Guid.NewGuid();

    RouteToGetAttributesRequest? routedRequest = null;
    _router
        .RouteToGetAttributesAsync(
            "SiteA",
            Arg.Do<RouteToGetAttributesRequest>(request => routedRequest = request),
            Arg.Any<CancellationToken>())
        .Returns(call => new RouteToGetAttributesResponse(
            call.ArgAt<RouteToGetAttributesRequest>(1).CorrelationId,
            new Dictionary<string, object?>(),
            true,
            null,
            DateTimeOffset.UtcNow));

    var boundHelper = CreateHelper().WithParentExecutionId(parentId);
    await boundHelper.To("inst-1").GetAttributes(new[] { "a" });

    Assert.NotNull(routedRequest);
    Assert.Equal(parentId, routedRequest!.ParentExecutionId);
}
[Fact]
public async Task SetAttributes_WithoutParentExecutionId_LeavesParentExecutionIdNull()
{
    // A helper that was never bound to an inbound execution id must route
    // the SetAttributes request with ParentExecutionId left unset.
    SiteResolves("inst-1", "SiteA");

    RouteToSetAttributesRequest? routedRequest = null;
    _router
        .RouteToSetAttributesAsync(
            "SiteA",
            Arg.Do<RouteToSetAttributesRequest>(request => routedRequest = request),
            Arg.Any<CancellationToken>())
        .Returns(call => new RouteToSetAttributesResponse(
            call.ArgAt<RouteToSetAttributesRequest>(1).CorrelationId,
            true,
            null,
            DateTimeOffset.UtcNow));

    var attributeWrites = new Dictionary<string, string> { ["x"] = "1" };
    await CreateHelper().To("inst-1").SetAttributes(attributeWrites);

    Assert.NotNull(routedRequest);
    Assert.Null(routedRequest!.ParentExecutionId);
}
[Fact]
public async Task SetAttributes_WithParentExecutionId_CarriesItOnRouteToSetAttributesRequest()
{
    // A helper bound to an inbound execution id must stamp that id onto the
    // routed SetAttributes request.
    SiteResolves("inst-1", "SiteA");
    var parentId = Guid.NewGuid();

    RouteToSetAttributesRequest? routedRequest = null;
    _router
        .RouteToSetAttributesAsync(
            "SiteA",
            Arg.Do<RouteToSetAttributesRequest>(request => routedRequest = request),
            Arg.Any<CancellationToken>())
        .Returns(call => new RouteToSetAttributesResponse(
            call.ArgAt<RouteToSetAttributesRequest>(1).CorrelationId,
            true,
            null,
            DateTimeOffset.UtcNow));

    var boundHelper = CreateHelper().WithParentExecutionId(parentId);
    var attributeWrites = new Dictionary<string, string> { ["x"] = "1" };
    await boundHelper.To("inst-1").SetAttributes(attributeWrites);

    Assert.NotNull(routedRequest);
    Assert.Equal(parentId, routedRequest!.ParentExecutionId);
}
}
@@ -29,13 +29,16 @@ public class EventLogPurgeServiceTests : IDisposable
if (File.Exists(_dbPath)) File.Delete(_dbPath);
}
private EventLogPurgeService CreatePurgeService(SiteEventLogOptions? optionsOverride = null)
private EventLogPurgeService CreatePurgeService(
SiteEventLogOptions? optionsOverride = null,
SiteEventLogActiveNodeCheck? isActiveNode = null)
{
// Test helper: builds the EventLogPurgeService under test.
// Use the caller's options when supplied; otherwise fall back to the _options field.
var opts = optionsOverride ?? _options;
// Wire the service to the _eventLogger field, the chosen options and a null logger.
return new EventLogPurgeService(
_eventLogger,
Options.Create(opts),
NullLogger<EventLogPurgeService>.Instance);
NullLogger<EventLogPurgeService>.Instance,
// Forwarded unchanged; stays null when the caller supplies no active-node check.
isActiveNode);
}
private void InsertEventWithTimestamp(DateTimeOffset timestamp)
@@ -309,4 +312,54 @@ public class EventLogPurgeServiceTests : IDisposable
Assert.Empty(exceptions);
}
// ── SiteEventLogging-019: purge runs only on the active node ──
[Fact]
public void RunPurge_OnStandbyNode_SkipsAllWork()
{
    // SiteEventLogging-019: by design the daily purge belongs to the active
    // node only. Nothing is written to the standby's local SQLite, so a
    // purge there is unnecessary; the purge tick is gated on the injected
    // active-node check and exits early when it reports false. The seeded
    // row is far older than retention and would be deleted by a real purge,
    // so finding it still present shows the gate held.
    var expiredTimestamp = DateTimeOffset.UtcNow.AddDays(-31);
    InsertEventWithTimestamp(expiredTimestamp);
    Assert.Equal(1, GetEventCount());

    var standbyService = CreatePurgeService(isActiveNode: () => false);
    standbyService.RunPurge();

    Assert.Equal(1, GetEventCount());
}
[Fact]
public void RunPurge_OnActiveNode_RunsTheRetentionPurge()
{
    // SiteEventLogging-019: with the active-node check reporting true the
    // purge behaves as it did before the gate existed. Kept next to the
    // standby case so that an accidentally inverted gate fails a test.
    var expiredTimestamp = DateTimeOffset.UtcNow.AddDays(-31);
    InsertEventWithTimestamp(expiredTimestamp);
    var freshTimestamp = DateTimeOffset.UtcNow;
    InsertEventWithTimestamp(freshTimestamp);

    var activeService = CreatePurgeService(isActiveNode: () => true);
    activeService.RunPurge();

    // Two rows went in and exactly one is left after the purge.
    Assert.Equal(1, GetEventCount());
}
[Fact]
public void RunPurge_WithNullCheck_FallsBackToRunning()
{
    // SiteEventLogging-019: with no active-node check supplied (the default
    // for non-clustered hosts and for tests written before the gate), the
    // service keeps its earlier "run on every tick" behaviour instead of
    // quietly skipping every tick. This guards backward compatibility.
    var expiredTimestamp = DateTimeOffset.UtcNow.AddDays(-31);
    InsertEventWithTimestamp(expiredTimestamp);

    var ungatedService = CreatePurgeService(isActiveNode: null);
    ungatedService.RunPurge();

    // The expired row is gone, so the purge ran.
    Assert.Equal(0, GetEventCount());
}
}