From b0584f7a081bd175a99c3cc02bd2de9ed7b3fd3b Mon Sep 17 00:00:00 2001 From: Joseph Doherty Date: Wed, 20 May 2026 17:44:12 -0400 Subject: [PATCH 01/16] docs(audit): add M6 reconciliation+purge+partition+health plan (#23) 6 bundles: proto+site handler, reconciliation actor, purge actor with drop-and-rebuild around UX index, partition maintenance, four health metrics, integration tests. M5 realities baked in. --- ...-05-20-auditlog-m6-reconciliation-purge.md | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 docs/plans/2026-05-20-auditlog-m6-reconciliation-purge.md diff --git a/docs/plans/2026-05-20-auditlog-m6-reconciliation-purge.md b/docs/plans/2026-05-20-auditlog-m6-reconciliation-purge.md new file mode 100644 index 0000000..aebaf93 --- /dev/null +++ b/docs/plans/2026-05-20-auditlog-m6-reconciliation-purge.md @@ -0,0 +1,19 @@ +# Audit Log #23 — M6 Reconciliation + Purge + Partition Maintenance + Health Metrics + +> **For Claude:** subagent-driven-development with bundled cadence. + +**Goal:** Self-healing telemetry (5-min reconciliation pull), monthly partition rollover, daily partition-switch purge with drop-and-rebuild around UX_AuditLog_EventId, all five health metrics live (SiteAuditBacklog, SiteAuditWriteFailures, SiteAuditTelemetryStalled, CentralAuditWriteFailures, AuditRedactionFailure). + +**M5 realities baked in:** AuditRedactionFailure counter is site-only — M6-T9 surfaces it centrally. SwitchOutPartitionAsync ships as NotSupportedException stub from M1; M6-T4 replaces it with the drop-DROP-INDEX → SWITCH PARTITION → DROP staging → CREATE UNIQUE NONCLUSTERED INDEX dance. Partition function pre-seeded Jan 2026 – Dec 2027; M6-T5 SPLITs new boundaries forward. + +**Bundles:** +- Bundle A — Proto + site handler (T1, T2) +- Bundle B — Reconciliation actor (T3) +- Bundle C — Purge actor + drop-and-rebuild repository fix (T4) +- Bundle D — Partition maintenance hosted service (T5) +- Bundle E — Health metrics (T6, T7, T8, T9) +- Bundle F — Integration tests (T10, T11, T12) + +Final cross-bundle review + merge. + +**Note**: M2 noted NoOpSiteStreamAuditClient stays in production until "M6 wires the real client". M6-T1+T2 add the PULL RPC; the actual production PUSH client (real implementation of ISiteStreamAuditClient.IngestAuditEventsAsync + IngestCachedTelemetryAsync) is the bigger lift. M6 will add the real client IF feasible within scope OR defer to a follow-up. Decision: try in Bundle A (alongside the proto extension); if scope blows up, the NoOp stays. From 25d9acbce35f8d510b4a01e463ee586db0cfab79 Mon Sep 17 00:00:00 2001 From: Joseph Doherty Date: Wed, 20 May 2026 17:48:30 -0400 Subject: [PATCH 02/16] feat(comms): PullAuditEvents RPC for audit reconciliation (#23 M6) --- .../Protos/sitestream.proto | 17 + .../SiteStreamGrpc/Sitestream.cs | 516 +++++++++++++++++- .../SiteStreamGrpc/SitestreamGrpc.cs | 42 +- .../Protos/PullAuditEventsProtoTests.cs | 83 +++ 4 files changed, 641 insertions(+), 17 deletions(-) create mode 100644 tests/ScadaLink.Communication.Tests/Protos/PullAuditEventsProtoTests.cs diff --git a/src/ScadaLink.Communication/Protos/sitestream.proto b/src/ScadaLink.Communication/Protos/sitestream.proto index 43ffbe3..5ceb709 100644 --- a/src/ScadaLink.Communication/Protos/sitestream.proto +++ b/src/ScadaLink.Communication/Protos/sitestream.proto @@ -9,6 +9,7 @@ service SiteStreamService { rpc SubscribeInstance(InstanceStreamRequest) returns (stream SiteStreamEvent); rpc IngestAuditEvents(AuditEventBatch) returns (IngestAck); rpc IngestCachedTelemetry(CachedTelemetryBatch) returns (IngestAck); + rpc PullAuditEvents(PullAuditEventsRequest) returns (PullAuditEventsResponse); } message InstanceStreamRequest { @@ -119,3 +120,19 @@ message CachedTelemetryPacket { } message CachedTelemetryBatch { repeated CachedTelemetryPacket packets = 1; } + +// Audit Log (#23) M6 reconciliation pull: central→site request for any +// site-local AuditLog rows with OccurredAtUtc >= since_utc that have not yet +// been ingested centrally (ForwardState in {Pending, Forwarded}). The site +// flips returned rows to Reconciled after the response is on the wire. +// more_available signals batch_size was saturated so the caller knows to +// issue a follow-up pull with an advanced since_utc cursor. +message PullAuditEventsRequest { + google.protobuf.Timestamp since_utc = 1; + int32 batch_size = 2; +} + +message PullAuditEventsResponse { + repeated AuditEventDto events = 1; + bool more_available = 2; +} diff --git a/src/ScadaLink.Communication/SiteStreamGrpc/Sitestream.cs b/src/ScadaLink.Communication/SiteStreamGrpc/Sitestream.cs index 9639242..ccac2bb 100644 --- a/src/ScadaLink.Communication/SiteStreamGrpc/Sitestream.cs +++ b/src/ScadaLink.Communication/SiteStreamGrpc/Sitestream.cs @@ -68,21 +68,27 @@ namespace ScadaLink.Communication.Grpc { "bnREdG8SNwoLb3BlcmF0aW9uYWwYAiABKAsyIi5zaXRlc3RyZWFtLlNpdGVD", "YWxsT3BlcmF0aW9uYWxEdG8iSgoUQ2FjaGVkVGVsZW1ldHJ5QmF0Y2gSMgoH", "cGFja2V0cxgBIAMoCzIhLnNpdGVzdHJlYW0uQ2FjaGVkVGVsZW1ldHJ5UGFj", - "a2V0KlwKB1F1YWxpdHkSFwoTUVVBTElUWV9VTlNQRUNJRklFRBAAEhAKDFFV", - "QUxJVFlfR09PRBABEhUKEVFVQUxJVFlfVU5DRVJUQUlOEAISDwoLUVVBTElU", - "WV9CQUQQAypdCg5BbGFybVN0YXRlRW51bRIbChdBTEFSTV9TVEFURV9VTlNQ", - "RUNJRklFRBAAEhYKEkFMQVJNX1NUQVRFX05PUk1BTBABEhYKEkFMQVJNX1NU", - "QVRFX0FDVElWRRACKoUBCg5BbGFybUxldmVsRW51bRIUChBBTEFSTV9MRVZF", - "TF9OT05FEAASEwoPQUxBUk1fTEVWRUxfTE9XEAESFwoTQUxBUk1fTEVWRUxf", - "TE9XX0xPVxACEhQKEEFMQVJNX0xFVkVMX0hJR0gQAxIZChVBTEFSTV9MRVZF", - "TF9ISUdIX0hJR0gQBDKFAgoRU2l0ZVN0cmVhbVNlcnZpY2USVQoRU3Vic2Ny", - "aWJlSW5zdGFuY2USIS5zaXRlc3RyZWFtLkluc3RhbmNlU3RyZWFtUmVxdWVz", - "dBobLnNpdGVzdHJlYW0uU2l0ZVN0cmVhbUV2ZW50MAESRwoRSW5nZXN0QXVk", - "aXRFdmVudHMSGy5zaXRlc3RyZWFtLkF1ZGl0RXZlbnRCYXRjaBoVLnNpdGVz", - "dHJlYW0uSW5nZXN0QWNrElAKFUluZ2VzdENhY2hlZFRlbGVtZXRyeRIgLnNp", - "dGVzdHJlYW0uQ2FjaGVkVGVsZW1ldHJ5QmF0Y2gaFS5zaXRlc3RyZWFtLklu", - "Z2VzdEFja0IfqgIcU2NhZGFMaW5rLkNvbW11bmljYXRpb24uR3JwY2IGcHJv", - "dG8z")); + "a2V0IlsKFlB1bGxBdWRpdEV2ZW50c1JlcXVlc3QSLQoJc2luY2VfdXRjGAEg", + "ASgLMhouZ29vZ2xlLnByb3RvYnVmLlRpbWVzdGFtcBISCgpiYXRjaF9zaXpl", + "GAIgASgFIlwKF1B1bGxBdWRpdEV2ZW50c1Jlc3BvbnNlEikKBmV2ZW50cxgB", + "IAMoCzIZLnNpdGVzdHJlYW0uQXVkaXRFdmVudER0bxIWCg5tb3JlX2F2YWls", + "YWJsZRgCIAEoCCpcCgdRdWFsaXR5EhcKE1FVQUxJVFlfVU5TUEVDSUZJRUQQ", + "ABIQCgxRVUFMSVRZX0dPT0QQARIVChFRVUFMSVRZX1VOQ0VSVEFJThACEg8K", + "C1FVQUxJVFlfQkFEEAMqXQoOQWxhcm1TdGF0ZUVudW0SGwoXQUxBUk1fU1RB", + "VEVfVU5TUEVDSUZJRUQQABIWChJBTEFSTV9TVEFURV9OT1JNQUwQARIWChJB", + "TEFSTV9TVEFURV9BQ1RJVkUQAiqFAQoOQWxhcm1MZXZlbEVudW0SFAoQQUxB", + "Uk1fTEVWRUxfTk9ORRAAEhMKD0FMQVJNX0xFVkVMX0xPVxABEhcKE0FMQVJN", + "X0xFVkVMX0xPV19MT1cQAhIUChBBTEFSTV9MRVZFTF9ISUdIEAMSGQoVQUxB", + "Uk1fTEVWRUxfSElHSF9ISUdIEAQy4QIKEVNpdGVTdHJlYW1TZXJ2aWNlElUK", + "EVN1YnNjcmliZUluc3RhbmNlEiEuc2l0ZXN0cmVhbS5JbnN0YW5jZVN0cmVh", + "bVJlcXVlc3QaGy5zaXRlc3RyZWFtLlNpdGVTdHJlYW1FdmVudDABEkcKEUlu", + "Z2VzdEF1ZGl0RXZlbnRzEhsuc2l0ZXN0cmVhbS5BdWRpdEV2ZW50QmF0Y2ga", + "FS5zaXRlc3RyZWFtLkluZ2VzdEFjaxJQChVJbmdlc3RDYWNoZWRUZWxlbWV0", + "cnkSIC5zaXRlc3RyZWFtLkNhY2hlZFRlbGVtZXRyeUJhdGNoGhUuc2l0ZXN0", + "cmVhbS5Jbmdlc3RBY2sSWgoPUHVsbEF1ZGl0RXZlbnRzEiIuc2l0ZXN0cmVh", + "bS5QdWxsQXVkaXRFdmVudHNSZXF1ZXN0GiMuc2l0ZXN0cmVhbS5QdWxsQXVk", + "aXRFdmVudHNSZXNwb25zZUIfqgIcU2NhZGFMaW5rLkNvbW11bmljYXRpb24u", + "R3JwY2IGcHJvdG8z")); descriptor = pbr::FileDescriptor.FromGeneratedCode(descriptorData, new pbr::FileDescriptor[] { global::Google.Protobuf.WellKnownTypes.TimestampReflection.Descriptor, global::Google.Protobuf.WellKnownTypes.WrappersReflection.Descriptor, }, new pbr::GeneratedClrTypeInfo(new[] {typeof(global::ScadaLink.Communication.Grpc.Quality), typeof(global::ScadaLink.Communication.Grpc.AlarmStateEnum), typeof(global::ScadaLink.Communication.Grpc.AlarmLevelEnum), }, null, new pbr::GeneratedClrTypeInfo[] { @@ -95,7 +101,9 @@ namespace ScadaLink.Communication.Grpc { new pbr::GeneratedClrTypeInfo(typeof(global::ScadaLink.Communication.Grpc.IngestAck), global::ScadaLink.Communication.Grpc.IngestAck.Parser, new[]{ "AcceptedEventIds" }, null, null, null, null), new pbr::GeneratedClrTypeInfo(typeof(global::ScadaLink.Communication.Grpc.SiteCallOperationalDto), global::ScadaLink.Communication.Grpc.SiteCallOperationalDto.Parser, new[]{ "TrackedOperationId", "Channel", "Target", "SourceSite", "Status", "RetryCount", "LastError", "HttpStatus", "CreatedAtUtc", "UpdatedAtUtc", "TerminalAtUtc" }, null, null, null, null), new pbr::GeneratedClrTypeInfo(typeof(global::ScadaLink.Communication.Grpc.CachedTelemetryPacket), global::ScadaLink.Communication.Grpc.CachedTelemetryPacket.Parser, new[]{ "AuditEvent", "Operational" }, null, null, null, null), - new pbr::GeneratedClrTypeInfo(typeof(global::ScadaLink.Communication.Grpc.CachedTelemetryBatch), global::ScadaLink.Communication.Grpc.CachedTelemetryBatch.Parser, new[]{ "Packets" }, null, null, null, null) + new pbr::GeneratedClrTypeInfo(typeof(global::ScadaLink.Communication.Grpc.CachedTelemetryBatch), global::ScadaLink.Communication.Grpc.CachedTelemetryBatch.Parser, new[]{ "Packets" }, null, null, null, null), + new pbr::GeneratedClrTypeInfo(typeof(global::ScadaLink.Communication.Grpc.PullAuditEventsRequest), global::ScadaLink.Communication.Grpc.PullAuditEventsRequest.Parser, new[]{ "SinceUtc", "BatchSize" }, null, null, null, null), + new pbr::GeneratedClrTypeInfo(typeof(global::ScadaLink.Communication.Grpc.PullAuditEventsResponse), global::ScadaLink.Communication.Grpc.PullAuditEventsResponse.Parser, new[]{ "Events", "MoreAvailable" }, null, null, null, null) })); } #endregion @@ -3862,6 +3870,482 @@ namespace ScadaLink.Communication.Grpc { } + /// + /// Audit Log (#23) M6 reconciliation pull: central→site request for any + /// site-local AuditLog rows with OccurredAtUtc >= since_utc that have not yet + /// been ingested centrally (ForwardState in {Pending, Forwarded}). The site + /// flips returned rows to Reconciled after the response is on the wire. + /// more_available signals batch_size was saturated so the caller knows to + /// issue a follow-up pull with an advanced since_utc cursor. + /// + [global::System.Diagnostics.DebuggerDisplayAttribute("{ToString(),nq}")] + public sealed partial class PullAuditEventsRequest : pb::IMessage + #if !GOOGLE_PROTOBUF_REFSTRUCT_COMPATIBILITY_MODE + , pb::IBufferMessage + #endif + { + private static readonly pb::MessageParser _parser = new pb::MessageParser(() => new PullAuditEventsRequest()); + private pb::UnknownFieldSet _unknownFields; + [global::System.Diagnostics.DebuggerNonUserCodeAttribute] + [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] + public static pb::MessageParser Parser { get { return _parser; } } + + [global::System.Diagnostics.DebuggerNonUserCodeAttribute] + [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] + public static pbr::MessageDescriptor Descriptor { + get { return global::ScadaLink.Communication.Grpc.SitestreamReflection.Descriptor.MessageTypes[10]; } + } + + [global::System.Diagnostics.DebuggerNonUserCodeAttribute] + [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] + pbr::MessageDescriptor pb::IMessage.Descriptor { + get { return Descriptor; } + } + + [global::System.Diagnostics.DebuggerNonUserCodeAttribute] + [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] + public PullAuditEventsRequest() { + OnConstruction(); + } + + partial void OnConstruction(); + + [global::System.Diagnostics.DebuggerNonUserCodeAttribute] + [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] + public PullAuditEventsRequest(PullAuditEventsRequest other) : this() { + sinceUtc_ = other.sinceUtc_ != null ? other.sinceUtc_.Clone() : null; + batchSize_ = other.batchSize_; + _unknownFields = pb::UnknownFieldSet.Clone(other._unknownFields); + } + + [global::System.Diagnostics.DebuggerNonUserCodeAttribute] + [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] + public PullAuditEventsRequest Clone() { + return new PullAuditEventsRequest(this); + } + + /// Field number for the "since_utc" field. + public const int SinceUtcFieldNumber = 1; + private global::Google.Protobuf.WellKnownTypes.Timestamp sinceUtc_; + [global::System.Diagnostics.DebuggerNonUserCodeAttribute] + [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] + public global::Google.Protobuf.WellKnownTypes.Timestamp SinceUtc { + get { return sinceUtc_; } + set { + sinceUtc_ = value; + } + } + + /// Field number for the "batch_size" field. + public const int BatchSizeFieldNumber = 2; + private int batchSize_; + [global::System.Diagnostics.DebuggerNonUserCodeAttribute] + [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] + public int BatchSize { + get { return batchSize_; } + set { + batchSize_ = value; + } + } + + [global::System.Diagnostics.DebuggerNonUserCodeAttribute] + [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] + public override bool Equals(object other) { + return Equals(other as PullAuditEventsRequest); + } + + [global::System.Diagnostics.DebuggerNonUserCodeAttribute] + [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] + public bool Equals(PullAuditEventsRequest other) { + if (ReferenceEquals(other, null)) { + return false; + } + if (ReferenceEquals(other, this)) { + return true; + } + if (!object.Equals(SinceUtc, other.SinceUtc)) return false; + if (BatchSize != other.BatchSize) return false; + return Equals(_unknownFields, other._unknownFields); + } + + [global::System.Diagnostics.DebuggerNonUserCodeAttribute] + [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] + public override int GetHashCode() { + int hash = 1; + if (sinceUtc_ != null) hash ^= SinceUtc.GetHashCode(); + if (BatchSize != 0) hash ^= BatchSize.GetHashCode(); + if (_unknownFields != null) { + hash ^= _unknownFields.GetHashCode(); + } + return hash; + } + + [global::System.Diagnostics.DebuggerNonUserCodeAttribute] + [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] + public override string ToString() { + return pb::JsonFormatter.ToDiagnosticString(this); + } + + [global::System.Diagnostics.DebuggerNonUserCodeAttribute] + [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] + public void WriteTo(pb::CodedOutputStream output) { + #if !GOOGLE_PROTOBUF_REFSTRUCT_COMPATIBILITY_MODE + output.WriteRawMessage(this); + #else + if (sinceUtc_ != null) { + output.WriteRawTag(10); + output.WriteMessage(SinceUtc); + } + if (BatchSize != 0) { + output.WriteRawTag(16); + output.WriteInt32(BatchSize); + } + if (_unknownFields != null) { + _unknownFields.WriteTo(output); + } + #endif + } + + #if !GOOGLE_PROTOBUF_REFSTRUCT_COMPATIBILITY_MODE + [global::System.Diagnostics.DebuggerNonUserCodeAttribute] + [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] + void pb::IBufferMessage.InternalWriteTo(ref pb::WriteContext output) { + if (sinceUtc_ != null) { + output.WriteRawTag(10); + output.WriteMessage(SinceUtc); + } + if (BatchSize != 0) { + output.WriteRawTag(16); + output.WriteInt32(BatchSize); + } + if (_unknownFields != null) { + _unknownFields.WriteTo(ref output); + } + } + #endif + + [global::System.Diagnostics.DebuggerNonUserCodeAttribute] + [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] + public int CalculateSize() { + int size = 0; + if (sinceUtc_ != null) { + size += 1 + pb::CodedOutputStream.ComputeMessageSize(SinceUtc); + } + if (BatchSize != 0) { + size += 1 + pb::CodedOutputStream.ComputeInt32Size(BatchSize); + } + if (_unknownFields != null) { + size += _unknownFields.CalculateSize(); + } + return size; + } + + [global::System.Diagnostics.DebuggerNonUserCodeAttribute] + [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] + public void MergeFrom(PullAuditEventsRequest other) { + if (other == null) { + return; + } + if (other.sinceUtc_ != null) { + if (sinceUtc_ == null) { + SinceUtc = new global::Google.Protobuf.WellKnownTypes.Timestamp(); + } + SinceUtc.MergeFrom(other.SinceUtc); + } + if (other.BatchSize != 0) { + BatchSize = other.BatchSize; + } + _unknownFields = pb::UnknownFieldSet.MergeFrom(_unknownFields, other._unknownFields); + } + + [global::System.Diagnostics.DebuggerNonUserCodeAttribute] + [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] + public void MergeFrom(pb::CodedInputStream input) { + #if !GOOGLE_PROTOBUF_REFSTRUCT_COMPATIBILITY_MODE + input.ReadRawMessage(this); + #else + uint tag; + while ((tag = input.ReadTag()) != 0) { + if ((tag & 7) == 4) { + // Abort on any end group tag. + return; + } + switch(tag) { + default: + _unknownFields = pb::UnknownFieldSet.MergeFieldFrom(_unknownFields, input); + break; + case 10: { + if (sinceUtc_ == null) { + SinceUtc = new global::Google.Protobuf.WellKnownTypes.Timestamp(); + } + input.ReadMessage(SinceUtc); + break; + } + case 16: { + BatchSize = input.ReadInt32(); + break; + } + } + } + #endif + } + + #if !GOOGLE_PROTOBUF_REFSTRUCT_COMPATIBILITY_MODE + [global::System.Diagnostics.DebuggerNonUserCodeAttribute] + [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] + void pb::IBufferMessage.InternalMergeFrom(ref pb::ParseContext input) { + uint tag; + while ((tag = input.ReadTag()) != 0) { + if ((tag & 7) == 4) { + // Abort on any end group tag. + return; + } + switch(tag) { + default: + _unknownFields = pb::UnknownFieldSet.MergeFieldFrom(_unknownFields, ref input); + break; + case 10: { + if (sinceUtc_ == null) { + SinceUtc = new global::Google.Protobuf.WellKnownTypes.Timestamp(); + } + input.ReadMessage(SinceUtc); + break; + } + case 16: { + BatchSize = input.ReadInt32(); + break; + } + } + } + } + #endif + + } + + [global::System.Diagnostics.DebuggerDisplayAttribute("{ToString(),nq}")] + public sealed partial class PullAuditEventsResponse : pb::IMessage + #if !GOOGLE_PROTOBUF_REFSTRUCT_COMPATIBILITY_MODE + , pb::IBufferMessage + #endif + { + private static readonly pb::MessageParser _parser = new pb::MessageParser(() => new PullAuditEventsResponse()); + private pb::UnknownFieldSet _unknownFields; + [global::System.Diagnostics.DebuggerNonUserCodeAttribute] + [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] + public static pb::MessageParser Parser { get { return _parser; } } + + [global::System.Diagnostics.DebuggerNonUserCodeAttribute] + [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] + public static pbr::MessageDescriptor Descriptor { + get { return global::ScadaLink.Communication.Grpc.SitestreamReflection.Descriptor.MessageTypes[11]; } + } + + [global::System.Diagnostics.DebuggerNonUserCodeAttribute] + [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] + pbr::MessageDescriptor pb::IMessage.Descriptor { + get { return Descriptor; } + } + + [global::System.Diagnostics.DebuggerNonUserCodeAttribute] + [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] + public PullAuditEventsResponse() { + OnConstruction(); + } + + partial void OnConstruction(); + + [global::System.Diagnostics.DebuggerNonUserCodeAttribute] + [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] + public PullAuditEventsResponse(PullAuditEventsResponse other) : this() { + events_ = other.events_.Clone(); + moreAvailable_ = other.moreAvailable_; + _unknownFields = pb::UnknownFieldSet.Clone(other._unknownFields); + } + + [global::System.Diagnostics.DebuggerNonUserCodeAttribute] + [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] + public PullAuditEventsResponse Clone() { + return new PullAuditEventsResponse(this); + } + + /// Field number for the "events" field. + public const int EventsFieldNumber = 1; + private static readonly pb::FieldCodec _repeated_events_codec + = pb::FieldCodec.ForMessage(10, global::ScadaLink.Communication.Grpc.AuditEventDto.Parser); + private readonly pbc::RepeatedField events_ = new pbc::RepeatedField(); + [global::System.Diagnostics.DebuggerNonUserCodeAttribute] + [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] + public pbc::RepeatedField Events { + get { return events_; } + } + + /// Field number for the "more_available" field. + public const int MoreAvailableFieldNumber = 2; + private bool moreAvailable_; + [global::System.Diagnostics.DebuggerNonUserCodeAttribute] + [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] + public bool MoreAvailable { + get { return moreAvailable_; } + set { + moreAvailable_ = value; + } + } + + [global::System.Diagnostics.DebuggerNonUserCodeAttribute] + [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] + public override bool Equals(object other) { + return Equals(other as PullAuditEventsResponse); + } + + [global::System.Diagnostics.DebuggerNonUserCodeAttribute] + [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] + public bool Equals(PullAuditEventsResponse other) { + if (ReferenceEquals(other, null)) { + return false; + } + if (ReferenceEquals(other, this)) { + return true; + } + if(!events_.Equals(other.events_)) return false; + if (MoreAvailable != other.MoreAvailable) return false; + return Equals(_unknownFields, other._unknownFields); + } + + [global::System.Diagnostics.DebuggerNonUserCodeAttribute] + [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] + public override int GetHashCode() { + int hash = 1; + hash ^= events_.GetHashCode(); + if (MoreAvailable != false) hash ^= MoreAvailable.GetHashCode(); + if (_unknownFields != null) { + hash ^= _unknownFields.GetHashCode(); + } + return hash; + } + + [global::System.Diagnostics.DebuggerNonUserCodeAttribute] + [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] + public override string ToString() { + return pb::JsonFormatter.ToDiagnosticString(this); + } + + [global::System.Diagnostics.DebuggerNonUserCodeAttribute] + [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] + public void WriteTo(pb::CodedOutputStream output) { + #if !GOOGLE_PROTOBUF_REFSTRUCT_COMPATIBILITY_MODE + output.WriteRawMessage(this); + #else + events_.WriteTo(output, _repeated_events_codec); + if (MoreAvailable != false) { + output.WriteRawTag(16); + output.WriteBool(MoreAvailable); + } + if (_unknownFields != null) { + _unknownFields.WriteTo(output); + } + #endif + } + + #if !GOOGLE_PROTOBUF_REFSTRUCT_COMPATIBILITY_MODE + [global::System.Diagnostics.DebuggerNonUserCodeAttribute] + [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] + void pb::IBufferMessage.InternalWriteTo(ref pb::WriteContext output) { + events_.WriteTo(ref output, _repeated_events_codec); + if (MoreAvailable != false) { + output.WriteRawTag(16); + output.WriteBool(MoreAvailable); + } + if (_unknownFields != null) { + _unknownFields.WriteTo(ref output); + } + } + #endif + + [global::System.Diagnostics.DebuggerNonUserCodeAttribute] + [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] + public int CalculateSize() { + int size = 0; + size += events_.CalculateSize(_repeated_events_codec); + if (MoreAvailable != false) { + size += 1 + 1; + } + if (_unknownFields != null) { + size += _unknownFields.CalculateSize(); + } + return size; + } + + [global::System.Diagnostics.DebuggerNonUserCodeAttribute] + [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] + public void MergeFrom(PullAuditEventsResponse other) { + if (other == null) { + return; + } + events_.Add(other.events_); + if (other.MoreAvailable != false) { + MoreAvailable = other.MoreAvailable; + } + _unknownFields = pb::UnknownFieldSet.MergeFrom(_unknownFields, other._unknownFields); + } + + [global::System.Diagnostics.DebuggerNonUserCodeAttribute] + [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] + public void MergeFrom(pb::CodedInputStream input) { + #if !GOOGLE_PROTOBUF_REFSTRUCT_COMPATIBILITY_MODE + input.ReadRawMessage(this); + #else + uint tag; + while ((tag = input.ReadTag()) != 0) { + if ((tag & 7) == 4) { + // Abort on any end group tag. + return; + } + switch(tag) { + default: + _unknownFields = pb::UnknownFieldSet.MergeFieldFrom(_unknownFields, input); + break; + case 10: { + events_.AddEntriesFrom(input, _repeated_events_codec); + break; + } + case 16: { + MoreAvailable = input.ReadBool(); + break; + } + } + } + #endif + } + + #if !GOOGLE_PROTOBUF_REFSTRUCT_COMPATIBILITY_MODE + [global::System.Diagnostics.DebuggerNonUserCodeAttribute] + [global::System.CodeDom.Compiler.GeneratedCode("protoc", null)] + void pb::IBufferMessage.InternalMergeFrom(ref pb::ParseContext input) { + uint tag; + while ((tag = input.ReadTag()) != 0) { + if ((tag & 7) == 4) { + // Abort on any end group tag. + return; + } + switch(tag) { + default: + _unknownFields = pb::UnknownFieldSet.MergeFieldFrom(_unknownFields, ref input); + break; + case 10: { + events_.AddEntriesFrom(ref input, _repeated_events_codec); + break; + } + case 16: { + MoreAvailable = input.ReadBool(); + break; + } + } + } + } + #endif + + } + #endregion } diff --git a/src/ScadaLink.Communication/SiteStreamGrpc/SitestreamGrpc.cs b/src/ScadaLink.Communication/SiteStreamGrpc/SitestreamGrpc.cs index e7b9b33..d5fd944 100644 --- a/src/ScadaLink.Communication/SiteStreamGrpc/SitestreamGrpc.cs +++ b/src/ScadaLink.Communication/SiteStreamGrpc/SitestreamGrpc.cs @@ -55,6 +55,10 @@ namespace ScadaLink.Communication.Grpc { static readonly grpc::Marshaller __Marshaller_sitestream_IngestAck = grpc::Marshallers.Create(__Helper_SerializeMessage, context => __Helper_DeserializeMessage(context, global::ScadaLink.Communication.Grpc.IngestAck.Parser)); [global::System.CodeDom.Compiler.GeneratedCode("grpc_csharp_plugin", null)] static readonly grpc::Marshaller __Marshaller_sitestream_CachedTelemetryBatch = grpc::Marshallers.Create(__Helper_SerializeMessage, context => __Helper_DeserializeMessage(context, global::ScadaLink.Communication.Grpc.CachedTelemetryBatch.Parser)); + [global::System.CodeDom.Compiler.GeneratedCode("grpc_csharp_plugin", null)] + static readonly grpc::Marshaller __Marshaller_sitestream_PullAuditEventsRequest = grpc::Marshallers.Create(__Helper_SerializeMessage, context => __Helper_DeserializeMessage(context, global::ScadaLink.Communication.Grpc.PullAuditEventsRequest.Parser)); + [global::System.CodeDom.Compiler.GeneratedCode("grpc_csharp_plugin", null)] + static readonly grpc::Marshaller __Marshaller_sitestream_PullAuditEventsResponse = grpc::Marshallers.Create(__Helper_SerializeMessage, context => __Helper_DeserializeMessage(context, global::ScadaLink.Communication.Grpc.PullAuditEventsResponse.Parser)); [global::System.CodeDom.Compiler.GeneratedCode("grpc_csharp_plugin", null)] static readonly grpc::Method __Method_SubscribeInstance = new grpc::Method( @@ -80,6 +84,14 @@ namespace ScadaLink.Communication.Grpc { __Marshaller_sitestream_CachedTelemetryBatch, __Marshaller_sitestream_IngestAck); + [global::System.CodeDom.Compiler.GeneratedCode("grpc_csharp_plugin", null)] + static readonly grpc::Method __Method_PullAuditEvents = new grpc::Method( + grpc::MethodType.Unary, + __ServiceName, + "PullAuditEvents", + __Marshaller_sitestream_PullAuditEventsRequest, + __Marshaller_sitestream_PullAuditEventsResponse); + /// Service descriptor public static global::Google.Protobuf.Reflection.ServiceDescriptor Descriptor { @@ -108,6 +120,12 @@ namespace ScadaLink.Communication.Grpc { throw new grpc::RpcException(new grpc::Status(grpc::StatusCode.Unimplemented, "")); } + [global::System.CodeDom.Compiler.GeneratedCode("grpc_csharp_plugin", null)] + public virtual global::System.Threading.Tasks.Task PullAuditEvents(global::ScadaLink.Communication.Grpc.PullAuditEventsRequest request, grpc::ServerCallContext context) + { + throw new grpc::RpcException(new grpc::Status(grpc::StatusCode.Unimplemented, "")); + } + } /// Client for SiteStreamService @@ -187,6 +205,26 @@ namespace ScadaLink.Communication.Grpc { { return CallInvoker.AsyncUnaryCall(__Method_IngestCachedTelemetry, null, options, request); } + [global::System.CodeDom.Compiler.GeneratedCode("grpc_csharp_plugin", null)] + public virtual global::ScadaLink.Communication.Grpc.PullAuditEventsResponse PullAuditEvents(global::ScadaLink.Communication.Grpc.PullAuditEventsRequest request, grpc::Metadata headers = null, global::System.DateTime? deadline = null, global::System.Threading.CancellationToken cancellationToken = default(global::System.Threading.CancellationToken)) + { + return PullAuditEvents(request, new grpc::CallOptions(headers, deadline, cancellationToken)); + } + [global::System.CodeDom.Compiler.GeneratedCode("grpc_csharp_plugin", null)] + public virtual global::ScadaLink.Communication.Grpc.PullAuditEventsResponse PullAuditEvents(global::ScadaLink.Communication.Grpc.PullAuditEventsRequest request, grpc::CallOptions options) + { + return CallInvoker.BlockingUnaryCall(__Method_PullAuditEvents, null, options, request); + } + [global::System.CodeDom.Compiler.GeneratedCode("grpc_csharp_plugin", null)] + public virtual grpc::AsyncUnaryCall PullAuditEventsAsync(global::ScadaLink.Communication.Grpc.PullAuditEventsRequest request, grpc::Metadata headers = null, global::System.DateTime? deadline = null, global::System.Threading.CancellationToken cancellationToken = default(global::System.Threading.CancellationToken)) + { + return PullAuditEventsAsync(request, new grpc::CallOptions(headers, deadline, cancellationToken)); + } + [global::System.CodeDom.Compiler.GeneratedCode("grpc_csharp_plugin", null)] + public virtual grpc::AsyncUnaryCall PullAuditEventsAsync(global::ScadaLink.Communication.Grpc.PullAuditEventsRequest request, grpc::CallOptions options) + { + return CallInvoker.AsyncUnaryCall(__Method_PullAuditEvents, null, options, request); + } /// Creates a new instance of client from given ClientBaseConfiguration. [global::System.CodeDom.Compiler.GeneratedCode("grpc_csharp_plugin", null)] protected override SiteStreamServiceClient NewInstance(ClientBaseConfiguration configuration) @@ -203,7 +241,8 @@ namespace ScadaLink.Communication.Grpc { return grpc::ServerServiceDefinition.CreateBuilder() .AddMethod(__Method_SubscribeInstance, serviceImpl.SubscribeInstance) .AddMethod(__Method_IngestAuditEvents, serviceImpl.IngestAuditEvents) - .AddMethod(__Method_IngestCachedTelemetry, serviceImpl.IngestCachedTelemetry).Build(); + .AddMethod(__Method_IngestCachedTelemetry, serviceImpl.IngestCachedTelemetry) + .AddMethod(__Method_PullAuditEvents, serviceImpl.PullAuditEvents).Build(); } /// Register service method with a service binder with or without implementation. Useful when customizing the service binding logic. @@ -216,6 +255,7 @@ namespace ScadaLink.Communication.Grpc { serviceBinder.AddMethod(__Method_SubscribeInstance, serviceImpl == null ? null : new grpc::ServerStreamingServerMethod(serviceImpl.SubscribeInstance)); serviceBinder.AddMethod(__Method_IngestAuditEvents, serviceImpl == null ? null : new grpc::UnaryServerMethod(serviceImpl.IngestAuditEvents)); serviceBinder.AddMethod(__Method_IngestCachedTelemetry, serviceImpl == null ? null : new grpc::UnaryServerMethod(serviceImpl.IngestCachedTelemetry)); + serviceBinder.AddMethod(__Method_PullAuditEvents, serviceImpl == null ? null : new grpc::UnaryServerMethod(serviceImpl.PullAuditEvents)); } } diff --git a/tests/ScadaLink.Communication.Tests/Protos/PullAuditEventsProtoTests.cs b/tests/ScadaLink.Communication.Tests/Protos/PullAuditEventsProtoTests.cs new file mode 100644 index 0000000..ba9ae37 --- /dev/null +++ b/tests/ScadaLink.Communication.Tests/Protos/PullAuditEventsProtoTests.cs @@ -0,0 +1,83 @@ +using Google.Protobuf; +using Google.Protobuf.WellKnownTypes; +using ScadaLink.Communication.Grpc; + +namespace ScadaLink.Communication.Tests.Protos; + +/// +/// Wire-format round-trip tests for the Audit Log (#23) M6 reconciliation +/// pull proto messages (, +/// ). Locks the additive contract the +/// central→site reconciliation puller depends on. +/// +public class PullAuditEventsProtoTests +{ + private static AuditEventDto NewAuditDto(Guid? id = null) => new() + { + EventId = (id ?? Guid.NewGuid()).ToString(), + OccurredAtUtc = Timestamp.FromDateTimeOffset( + new DateTimeOffset(2026, 5, 20, 10, 15, 30, 123, TimeSpan.Zero)), + Channel = "ApiOutbound", + Kind = "ApiCall", + Status = "Delivered", + SourceSiteId = "site-1", + }; + + [Fact] + public void PullAuditEventsRequest_RoundTrip() + { + var sinceUtc = Timestamp.FromDateTimeOffset( + new DateTimeOffset(2026, 5, 20, 9, 0, 0, TimeSpan.Zero)); + + var original = new PullAuditEventsRequest + { + SinceUtc = sinceUtc, + BatchSize = 250, + }; + + var bytes = original.ToByteArray(); + var deserialized = PullAuditEventsRequest.Parser.ParseFrom(bytes); + + Assert.Equal(sinceUtc, deserialized.SinceUtc); + Assert.Equal(250, deserialized.BatchSize); + } + + [Fact] + public void PullAuditEventsResponse_RoundTrip_WithEvents_And_MoreAvailable() + { + var dtos = Enumerable.Range(0, 4).Select(_ => NewAuditDto()).ToList(); + + var original = new PullAuditEventsResponse + { + MoreAvailable = true, + }; + original.Events.AddRange(dtos); + + var bytes = original.ToByteArray(); + var deserialized = PullAuditEventsResponse.Parser.ParseFrom(bytes); + + Assert.True(deserialized.MoreAvailable); + Assert.Equal(4, deserialized.Events.Count); + for (int i = 0; i < dtos.Count; i++) + { + Assert.Equal(dtos[i].EventId, deserialized.Events[i].EventId); + Assert.Equal(dtos[i].Status, deserialized.Events[i].Status); + Assert.Equal(dtos[i].SourceSiteId, deserialized.Events[i].SourceSiteId); + Assert.Equal(dtos[i].OccurredAtUtc, deserialized.Events[i].OccurredAtUtc); + } + } + + [Fact] + public void PullAuditEventsResponse_Empty_Yields_EmptyEvents() + { + var original = new PullAuditEventsResponse(); + Assert.Empty(original.Events); + Assert.False(original.MoreAvailable); + + var bytes = original.ToByteArray(); + var deserialized = PullAuditEventsResponse.Parser.ParseFrom(bytes); + + Assert.Empty(deserialized.Events); + Assert.False(deserialized.MoreAvailable); + } +} From 640fd07454a2e65ab146d53b45d13269d6f0b215 Mon Sep 17 00:00:00 2001 From: Joseph Doherty Date: Wed, 20 May 2026 17:58:43 -0400 Subject: [PATCH 03/16] feat(comms): site-side PullAuditEvents handler (#23 M6) --- .../Site/SqliteAuditWriter.cs | 101 +++++++++- .../Site/Telemetry/ISiteAuditQueue.cs | 34 ---- .../Site/Telemetry/SiteAuditTelemetryActor.cs | 1 + .../Interfaces/Services/ISiteAuditQueue.cs | 73 +++++++ .../Grpc/SiteStreamGrpcServer.cs | 160 +++++++++++++++ .../Actors/AkkaHostedService.cs | 9 +- .../SyncCallEmissionEndToEndTests.cs | 1 + .../Site/SqliteAuditWriterWriteTests.cs | 149 ++++++++++++++ .../Telemetry/SiteAuditTelemetryActorTests.cs | 1 + .../SiteStreamPullAuditEventsTests.cs | 185 ++++++++++++++++++ 10 files changed, 678 insertions(+), 36 deletions(-) delete mode 100644 src/ScadaLink.AuditLog/Site/Telemetry/ISiteAuditQueue.cs create mode 100644 src/ScadaLink.Commons/Interfaces/Services/ISiteAuditQueue.cs create mode 100644 tests/ScadaLink.Communication.Tests/SiteStreamPullAuditEventsTests.cs diff --git a/src/ScadaLink.AuditLog/Site/SqliteAuditWriter.cs b/src/ScadaLink.AuditLog/Site/SqliteAuditWriter.cs index 789b572..b00f205 100644 --- a/src/ScadaLink.AuditLog/Site/SqliteAuditWriter.cs +++ b/src/ScadaLink.AuditLog/Site/SqliteAuditWriter.cs @@ -2,7 +2,6 @@ using System.Threading.Channels; using Microsoft.Data.Sqlite; using Microsoft.Extensions.Logging; using Microsoft.Extensions.Options; -using ScadaLink.AuditLog.Site.Telemetry; using ScadaLink.Commons.Entities.Audit; using ScadaLink.Commons.Interfaces.Services; using ScadaLink.Commons.Types.Enums; @@ -390,6 +389,106 @@ public class SqliteAuditWriter : IAuditWriter, ISiteAuditQueue, IAsyncDisposable } } + /// + /// M6 reconciliation-pull read: returns up to rows + /// whose OccurredAtUtc >= sinceUtc and whose + /// is still or + /// . Forwarded rows are included so the + /// brief race window between a site-Forwarded ack and central ingest cannot + /// silently drop rows; central dedups on . + /// Ordered oldest first, EventId tiebreaker. + /// + public Task> ReadPendingSinceAsync( + DateTime sinceUtc, int batchSize, CancellationToken ct = default) + { + if (batchSize <= 0) + { + throw new ArgumentOutOfRangeException(nameof(batchSize), "batchSize must be > 0."); + } + + // Mirror ReadPendingAsync: the write lock guards the single connection. + lock (_writeLock) + { + ObjectDisposedException.ThrowIf(_disposed, this); + + using var cmd = _connection.CreateCommand(); + cmd.CommandText = """ + SELECT EventId, OccurredAtUtc, Channel, Kind, CorrelationId, + SourceSiteId, SourceInstanceId, SourceScript, Actor, Target, + Status, HttpStatus, DurationMs, ErrorMessage, ErrorDetail, + RequestSummary, ResponseSummary, PayloadTruncated, Extra, ForwardState + FROM AuditLog + WHERE ForwardState IN ($pending, $forwarded) + AND OccurredAtUtc >= $since + ORDER BY OccurredAtUtc ASC, EventId ASC + LIMIT $limit; + """; + cmd.Parameters.AddWithValue("$pending", AuditForwardState.Pending.ToString()); + cmd.Parameters.AddWithValue("$forwarded", AuditForwardState.Forwarded.ToString()); + // Normalise to UTC ISO-8601 round-trip format to match how OccurredAtUtc + // is stored on insert ("o" format) — string comparison is monotonic for + // that encoding so we can index-scan against it. + cmd.Parameters.AddWithValue("$since", EnsureUtc(sinceUtc).ToString( + "o", System.Globalization.CultureInfo.InvariantCulture)); + cmd.Parameters.AddWithValue("$limit", batchSize); + + var rows = new List(Math.Min(batchSize, 256)); + using var reader = cmd.ExecuteReader(); + while (reader.Read()) + { + rows.Add(MapRow(reader)); + } + + return Task.FromResult>(rows); + } + } + + /// + /// M6 reconciliation-pull commit: flips the supplied EventIds to + /// , but ONLY for rows currently in + /// or . + /// Rows already in are left untouched + /// (idempotent re-call). Non-existent ids are silent no-ops. + /// + public Task MarkReconciledAsync(IReadOnlyList eventIds, CancellationToken ct = default) + { + ArgumentNullException.ThrowIfNull(eventIds); + if (eventIds.Count == 0) + { + return Task.CompletedTask; + } + + lock (_writeLock) + { + ObjectDisposedException.ThrowIf(_disposed, this); + + using var cmd = _connection.CreateCommand(); + var sb = new System.Text.StringBuilder(); + sb.Append("UPDATE AuditLog SET ForwardState = $reconciled ") + .Append("WHERE ForwardState IN ($pending, $forwarded) AND EventId IN ("); + for (int i = 0; i < eventIds.Count; i++) + { + if (i > 0) sb.Append(','); + var p = $"$id{i}"; + sb.Append(p); + cmd.Parameters.AddWithValue(p, eventIds[i].ToString()); + } + sb.Append(");"); + cmd.CommandText = sb.ToString(); + cmd.Parameters.AddWithValue("$reconciled", AuditForwardState.Reconciled.ToString()); + cmd.Parameters.AddWithValue("$pending", AuditForwardState.Pending.ToString()); + cmd.Parameters.AddWithValue("$forwarded", AuditForwardState.Forwarded.ToString()); + + cmd.ExecuteNonQuery(); + return Task.CompletedTask; + } + } + + private static DateTime EnsureUtc(DateTime value) => + value.Kind == DateTimeKind.Utc + ? value + : DateTime.SpecifyKind(value.ToUniversalTime(), DateTimeKind.Utc); + private static AuditEvent MapRow(SqliteDataReader reader) { return new AuditEvent diff --git a/src/ScadaLink.AuditLog/Site/Telemetry/ISiteAuditQueue.cs b/src/ScadaLink.AuditLog/Site/Telemetry/ISiteAuditQueue.cs deleted file mode 100644 index 9da55b5..0000000 --- a/src/ScadaLink.AuditLog/Site/Telemetry/ISiteAuditQueue.cs +++ /dev/null @@ -1,34 +0,0 @@ -using ScadaLink.Commons.Entities.Audit; - -namespace ScadaLink.AuditLog.Site.Telemetry; - -/// -/// Site-local audit-log queue surface consumed by . -/// Extracted from so the telemetry actor can be -/// unit-tested against a stub without touching SQLite. -/// implements this interface; production wiring injects the same instance. -/// -/// -/// Only the two methods the drain loop needs are exposed — the hot-path -/// WriteAsync stays on -/// (script-thread surface), separated by concern from the -/// telemetry-actor surface so each side can be mocked independently. -/// -public interface ISiteAuditQueue -{ - /// - /// Returns up to rows currently in - /// , - /// oldest first. Idempotent — repeated calls before - /// will yield the same rows again. - /// - Task> ReadPendingAsync(int limit, CancellationToken ct = default); - - /// - /// Flips the supplied EventIds from - /// to - /// . - /// Non-existent or already-forwarded ids are silent no-ops. - /// - Task MarkForwardedAsync(IReadOnlyList eventIds, CancellationToken ct = default); -} diff --git a/src/ScadaLink.AuditLog/Site/Telemetry/SiteAuditTelemetryActor.cs b/src/ScadaLink.AuditLog/Site/Telemetry/SiteAuditTelemetryActor.cs index a820cf5..724e1d1 100644 --- a/src/ScadaLink.AuditLog/Site/Telemetry/SiteAuditTelemetryActor.cs +++ b/src/ScadaLink.AuditLog/Site/Telemetry/SiteAuditTelemetryActor.cs @@ -3,6 +3,7 @@ using Microsoft.Extensions.Logging; using Microsoft.Extensions.Options; using ScadaLink.AuditLog.Telemetry; using ScadaLink.Commons.Entities.Audit; +using ScadaLink.Commons.Interfaces.Services; using ScadaLink.Communication.Grpc; namespace ScadaLink.AuditLog.Site.Telemetry; diff --git a/src/ScadaLink.Commons/Interfaces/Services/ISiteAuditQueue.cs b/src/ScadaLink.Commons/Interfaces/Services/ISiteAuditQueue.cs new file mode 100644 index 0000000..32d8646 --- /dev/null +++ b/src/ScadaLink.Commons/Interfaces/Services/ISiteAuditQueue.cs @@ -0,0 +1,73 @@ +using ScadaLink.Commons.Entities.Audit; + +namespace ScadaLink.Commons.Interfaces.Services; + +/// +/// Site-local audit-log queue surface consumed by the site +/// SiteAuditTelemetryActor drain loop and the M6 +/// SiteStreamGrpcServer.PullAuditEvents reconciliation handler. +/// Extracted from SqliteAuditWriter so both consumers can be +/// unit-tested against a stub without touching SQLite; the +/// SqliteAuditWriter production type implements this interface +/// and DI wires the same singleton instance to every consumer. +/// +/// +/// Lives in Commons (rather than alongside SqliteAuditWriter in +/// ScadaLink.AuditLog) because ScadaLink.Communication — which +/// hosts the M6 gRPC pull handler — must depend on this interface and +/// ScadaLink.AuditLog already depends on ScadaLink.Communication. +/// Pulling the interface up to Commons breaks the would-be cycle while +/// keeping the implementation in the AuditLog component. +/// +/// Only the methods the drain and pull paths need are exposed — the +/// hot-path WriteAsync stays on +/// (script-thread surface), separated by concern so each side can be +/// mocked independently. +/// +public interface ISiteAuditQueue +{ + /// + /// Returns up to rows currently in + /// , + /// oldest first. Idempotent — repeated calls before + /// will yield the same rows again. + /// + Task> ReadPendingAsync(int limit, CancellationToken ct = default); + + /// + /// Flips the supplied EventIds from + /// to + /// . + /// Non-existent or already-forwarded ids are silent no-ops. + /// + Task MarkForwardedAsync(IReadOnlyList eventIds, CancellationToken ct = default); + + /// + /// M6 reconciliation-pull read surface: returns up to + /// rows whose >= + /// and whose is still + /// or + /// . + /// + /// + /// Rows in the brief race window between site-Forwarded and central-ingest are + /// intentionally included: the central reconciliation puller dedups on + /// , so re-shipping is safe and avoids losing rows + /// whose telemetry ack was acted on locally but never landed centrally. Ordering + /// is oldest first with + /// as the deterministic tiebreaker. + /// + Task> ReadPendingSinceAsync( + DateTime sinceUtc, int batchSize, CancellationToken ct = default); + + /// + /// M6 reconciliation-pull commit surface: flips the supplied EventIds to + /// , + /// but ONLY for rows currently in + /// or + /// . + /// Rows already in + /// are left untouched (idempotent re-call). Non-existent ids are silent no-ops. + /// + Task MarkReconciledAsync(IReadOnlyList eventIds, CancellationToken ct = default); +} diff --git a/src/ScadaLink.Communication/Grpc/SiteStreamGrpcServer.cs b/src/ScadaLink.Communication/Grpc/SiteStreamGrpcServer.cs index 1da14ec..8a92027 100644 --- a/src/ScadaLink.Communication/Grpc/SiteStreamGrpcServer.cs +++ b/src/ScadaLink.Communication/Grpc/SiteStreamGrpcServer.cs @@ -5,6 +5,7 @@ using Grpc.Core; using Microsoft.Extensions.Logging; using Microsoft.Extensions.Options; using ScadaLink.Commons.Entities.Audit; +using ScadaLink.Commons.Interfaces.Services; using ScadaLink.Commons.Messages.Audit; using ScadaLink.Commons.Types; using ScadaLink.Commons.Types.Enums; @@ -36,6 +37,13 @@ public class SiteStreamGrpcServer : SiteStreamService.SiteStreamServiceBase // calls are sub-100 ms in steady state; a generous timeout absorbs a slow // MSSQL connection without surfacing as a gRPC failure on a healthy site. private static readonly TimeSpan AuditIngestAskTimeout = TimeSpan.FromSeconds(30); + // Audit Log (#23 M6): site-local queue handed in by AkkaHostedService on + // site roles so the central reconciliation puller's PullAuditEvents RPC + // can read Pending/Forwarded rows. Null when not wired (e.g. central-only + // host or test composing the server in isolation) — the handler treats + // the missing queue as "nothing to ship" and returns an empty response so + // central retries on its next reconciliation cycle. + private ISiteAuditQueue? _siteAuditQueue; /// /// Test-only constructor — kept internal so the DI container sees a @@ -102,6 +110,20 @@ public class SiteStreamGrpcServer : SiteStreamService.SiteStreamServiceBase _auditIngestActor = proxy; } + /// + /// Hands the site-local (the same + /// SqliteAuditWriter singleton that backs + /// on the script thread) to the gRPC server so the M6 + /// RPC can serve central's reconciliation + /// pulls. Mirrors : wired post-construction + /// because the queue and the gRPC server are both DI singletons brought up + /// in independent orders on site startup. + /// + public void SetSiteAuditQueue(ISiteAuditQueue queue) + { + _siteAuditQueue = queue; + } + /// /// Number of currently active streaming subscriptions. Exposed for diagnostics. /// @@ -361,6 +383,144 @@ public class SiteStreamGrpcServer : SiteStreamService.SiteStreamServiceBase return ack; } + /// + /// Audit Log (#23) M6 reconciliation pull RPC. Central asks the site for any + /// AuditLog rows whose OccurredAtUtc >= since_utc and whose + /// ForwardState is still Pending or Forwarded (i.e. not + /// yet confirmed reconciled), bounded by batch_size. The site responds + /// with the rows AND flips them to + /// + /// AFTER serializing the response. The flip is best-effort — if it fails + /// (e.g. SQLite disposed mid-call), rows stay Pending/Forwarded and central + /// pulls them again on the next reconciliation cycle. Idempotent. + /// + /// + /// When is not wired (central-only host or a + /// composition-root test exercising the server in isolation) the RPC returns + /// an empty response — central treats that as "nothing to ship" and retries + /// on its next cycle, which is the same self-healing semantics as the + /// SetAuditIngestActor wiring race window. + /// + public override async Task PullAuditEvents( + PullAuditEventsRequest request, + ServerCallContext context) + { + var queue = _siteAuditQueue; + if (queue is null) + { + _logger.LogWarning( + "PullAuditEvents invoked before SetSiteAuditQueue was called; returning empty response."); + return new PullAuditEventsResponse(); + } + + if (request.BatchSize <= 0) + { + // Mirrors the SubscribeInstance guard: reject malformed requests + // cleanly with InvalidArgument so the caller doesn't see a generic + // RpcException from the underlying SQLite parameter validation. + throw new RpcException(new GrpcStatus( + StatusCode.InvalidArgument, "batch_size must be > 0")); + } + + // sinceUtc defaults to DateTime.MinValue when the wrapper is absent — + // i.e. "pull from the beginning of recorded history", which is the + // intended behaviour for the very first reconciliation cycle. + var since = request.SinceUtc?.ToDateTime().ToUniversalTime() ?? DateTime.MinValue; + + IReadOnlyList events; + try + { + events = await queue.ReadPendingSinceAsync( + since, request.BatchSize, context.CancellationToken); + } + catch (Exception ex) + { + _logger.LogError(ex, + "ReadPendingSinceAsync failed for since={Since} batch={Batch}; returning empty response.", + since, request.BatchSize); + return new PullAuditEventsResponse(); + } + + var response = new PullAuditEventsResponse + { + // batch_size saturated → tell central to issue a follow-up pull + // with an advanced cursor. The site doesn't compute the cursor — + // central walks it forward from the last returned OccurredAtUtc. + MoreAvailable = events.Count >= request.BatchSize, + }; + foreach (var evt in events) + { + response.Events.Add(AuditEventToDto(evt)); + } + + // Flip to Reconciled AFTER projecting the response so a fault below the + // try/catch (mid-response, mid-flip) leaves the rows in Pending/Forwarded + // and central pulls them again next cycle. The flip itself is + // best-effort — its failure is a warning, not a fault, because central + // will dedup on EventId on the next pull. + var ids = new List(events.Count); + foreach (var evt in events) + { + ids.Add(evt.EventId); + } + + if (ids.Count > 0) + { + try + { + await queue.MarkReconciledAsync(ids, context.CancellationToken); + } + catch (Exception ex) + { + _logger.LogWarning(ex, + "MarkReconciledAsync failed after PullAuditEvents response of {Count} rows; rows stay Pending for retry.", + ids.Count); + } + } + + return response; + } + + /// + /// Inlined audit-event entity→DTO translation. Keep in sync with + /// AuditEventMapper.ToDto in ScadaLink.AuditLog.Telemetry — + /// the project-reference cycle (AuditLog → Communication) prevents calling + /// the AuditLog mapper directly. The shape mirrors the FromDto pair above. + /// + private static AuditEventDto AuditEventToDto(AuditEvent evt) + { + var dto = new AuditEventDto + { + EventId = evt.EventId.ToString(), + OccurredAtUtc = Google.Protobuf.WellKnownTypes.Timestamp.FromDateTime(EnsureUtc(evt.OccurredAtUtc)), + Channel = evt.Channel.ToString(), + Kind = evt.Kind.ToString(), + CorrelationId = evt.CorrelationId?.ToString() ?? string.Empty, + SourceSiteId = evt.SourceSiteId ?? string.Empty, + SourceInstanceId = evt.SourceInstanceId ?? string.Empty, + SourceScript = evt.SourceScript ?? string.Empty, + Actor = evt.Actor ?? string.Empty, + Target = evt.Target ?? string.Empty, + Status = evt.Status.ToString(), + ErrorMessage = evt.ErrorMessage ?? string.Empty, + ErrorDetail = evt.ErrorDetail ?? string.Empty, + RequestSummary = evt.RequestSummary ?? string.Empty, + ResponseSummary = evt.ResponseSummary ?? string.Empty, + PayloadTruncated = evt.PayloadTruncated, + Extra = evt.Extra ?? string.Empty, + }; + + if (evt.HttpStatus.HasValue) dto.HttpStatus = evt.HttpStatus.Value; + if (evt.DurationMs.HasValue) dto.DurationMs = evt.DurationMs.Value; + + return dto; + } + + private static DateTime EnsureUtc(DateTime value) => + value.Kind == DateTimeKind.Utc + ? value + : DateTime.SpecifyKind(value.ToUniversalTime(), DateTimeKind.Utc); + private static string? NullIfEmpty(string? value) => string.IsNullOrEmpty(value) ? null : value; diff --git a/src/ScadaLink.Host/Actors/AkkaHostedService.cs b/src/ScadaLink.Host/Actors/AkkaHostedService.cs index b8c5171..9425368 100644 --- a/src/ScadaLink.Host/Actors/AkkaHostedService.cs +++ b/src/ScadaLink.Host/Actors/AkkaHostedService.cs @@ -605,7 +605,7 @@ akka {{ var siteAuditOptions = _serviceProvider .GetRequiredService>(); var siteAuditQueue = _serviceProvider - .GetRequiredService(); + .GetRequiredService(); var siteAuditClient = _serviceProvider .GetRequiredService(); var siteAuditLogger = _serviceProvider.GetRequiredService() @@ -640,6 +640,13 @@ akka {{ // handshake has completed". Streams opened before SetReady are already // rejected by SiteStreamGrpcServer with StatusCode.Unavailable. var grpcServer = _serviceProvider.GetService(); + // Audit Log (#23 M6): hand the site-local SqliteAuditWriter (which + // implements ISiteAuditQueue) to the gRPC server so the PullAuditEvents + // reconciliation RPC can serve central's pulls. Both the writer and the + // gRPC server are singletons — wiring this here keeps the dependency + // direction one-way (Host knows both; Communication doesn't reach back + // into AuditLog). + grpcServer?.SetSiteAuditQueue(siteAuditQueue); grpcServer?.SetReady(_actorSystem!); } } diff --git a/tests/ScadaLink.AuditLog.Tests/Integration/SyncCallEmissionEndToEndTests.cs b/tests/ScadaLink.AuditLog.Tests/Integration/SyncCallEmissionEndToEndTests.cs index a0c5c85..3b55da3 100644 --- a/tests/ScadaLink.AuditLog.Tests/Integration/SyncCallEmissionEndToEndTests.cs +++ b/tests/ScadaLink.AuditLog.Tests/Integration/SyncCallEmissionEndToEndTests.cs @@ -9,6 +9,7 @@ using ScadaLink.AuditLog.Site.Telemetry; using ScadaLink.AuditLog.Tests.Integration.Infrastructure; using ScadaLink.Commons.Entities.Audit; using ScadaLink.Commons.Interfaces.Repositories; +using ScadaLink.Commons.Interfaces.Services; using ScadaLink.Commons.Types.Audit; using ScadaLink.Commons.Types.Enums; using ScadaLink.ConfigurationDatabase; diff --git a/tests/ScadaLink.AuditLog.Tests/Site/SqliteAuditWriterWriteTests.cs b/tests/ScadaLink.AuditLog.Tests/Site/SqliteAuditWriterWriteTests.cs index b490142..f9fe5c4 100644 --- a/tests/ScadaLink.AuditLog.Tests/Site/SqliteAuditWriterWriteTests.cs +++ b/tests/ScadaLink.AuditLog.Tests/Site/SqliteAuditWriterWriteTests.cs @@ -204,4 +204,153 @@ public class SqliteAuditWriterWriteTests await writer.MarkForwardedAsync(phantomIds); // No assertion needed: the call must complete without throwing. } + + // ----- M6 reconciliation pull surface ----- // + + [Fact] + public async Task ReadPendingSinceAsync_Returns_PendingAndForwarded_OldestFirst_LimitedToN() + { + var (writer, dataSource) = CreateWriter(nameof(ReadPendingSinceAsync_Returns_PendingAndForwarded_OldestFirst_LimitedToN)); + await using var _ = writer; + + var baseTime = new DateTime(2026, 5, 20, 12, 0, 0, DateTimeKind.Utc); + var evts = new[] + { + NewEvent(occurredAtUtc: baseTime.AddSeconds(5)), + NewEvent(occurredAtUtc: baseTime.AddSeconds(1)), + NewEvent(occurredAtUtc: baseTime.AddSeconds(3)), + NewEvent(occurredAtUtc: baseTime.AddSeconds(2)), + NewEvent(occurredAtUtc: baseTime.AddSeconds(4)), + }; + foreach (var e in evts) await writer.WriteAsync(e); + + // Flip half to Forwarded — they must still surface in the reconciliation pull + // because central hasn't confirmed they were ingested yet. + await writer.MarkForwardedAsync(new[] { evts[0].EventId, evts[2].EventId }); + + var rows = await writer.ReadPendingSinceAsync(sinceUtc: DateTime.MinValue, batchSize: 3); + + Assert.Equal(3, rows.Count); + Assert.Equal(baseTime.AddSeconds(1), rows[0].OccurredAtUtc); + Assert.Equal(baseTime.AddSeconds(2), rows[1].OccurredAtUtc); + Assert.Equal(baseTime.AddSeconds(3), rows[2].OccurredAtUtc); + } + + [Fact] + public async Task ReadPendingSinceAsync_ExcludesRowsOlderThanSinceUtc() + { + var (writer, _) = CreateWriter(nameof(ReadPendingSinceAsync_ExcludesRowsOlderThanSinceUtc)); + await using var _w = writer; + + var baseTime = new DateTime(2026, 5, 20, 12, 0, 0, DateTimeKind.Utc); + var old = NewEvent(occurredAtUtc: baseTime.AddSeconds(-30)); + var newer1 = NewEvent(occurredAtUtc: baseTime.AddSeconds(10)); + var newer2 = NewEvent(occurredAtUtc: baseTime.AddSeconds(20)); + + await writer.WriteAsync(old); + await writer.WriteAsync(newer1); + await writer.WriteAsync(newer2); + + var rows = await writer.ReadPendingSinceAsync(sinceUtc: baseTime, batchSize: 10); + + Assert.Equal(2, rows.Count); + Assert.Contains(rows, r => r.EventId == newer1.EventId); + Assert.Contains(rows, r => r.EventId == newer2.EventId); + Assert.DoesNotContain(rows, r => r.EventId == old.EventId); + } + + [Fact] + public async Task ReadPendingSinceAsync_ExcludesReconciledRows() + { + var (writer, _) = CreateWriter(nameof(ReadPendingSinceAsync_ExcludesReconciledRows)); + await using var _w = writer; + + var baseTime = new DateTime(2026, 5, 20, 12, 0, 0, DateTimeKind.Utc); + var pending = NewEvent(occurredAtUtc: baseTime); + var reconciled = NewEvent(occurredAtUtc: baseTime.AddSeconds(1)); + + await writer.WriteAsync(pending); + await writer.WriteAsync(reconciled); + await writer.MarkReconciledAsync(new[] { reconciled.EventId }); + + var rows = await writer.ReadPendingSinceAsync(sinceUtc: DateTime.MinValue, batchSize: 10); + + Assert.Single(rows); + Assert.Equal(pending.EventId, rows[0].EventId); + } + + [Fact] + public async Task ReadPendingSinceAsync_InvalidBatchSize_Throws() + { + var (writer, _) = CreateWriter(nameof(ReadPendingSinceAsync_InvalidBatchSize_Throws)); + await using var _w = writer; + + await Assert.ThrowsAsync( + () => writer.ReadPendingSinceAsync(DateTime.MinValue, batchSize: 0)); + await Assert.ThrowsAsync( + () => writer.ReadPendingSinceAsync(DateTime.MinValue, batchSize: -3)); + } + + [Fact] + public async Task MarkReconciledAsync_FlipsPendingAndForwarded_To_Reconciled() + { + var (writer, dataSource) = CreateWriter(nameof(MarkReconciledAsync_FlipsPendingAndForwarded_To_Reconciled)); + await using var _ = writer; + + var a = NewEvent(); + var b = NewEvent(); + var c = NewEvent(); + await writer.WriteAsync(a); + await writer.WriteAsync(b); + await writer.WriteAsync(c); + + // b is currently Forwarded; a and c are Pending. + await writer.MarkForwardedAsync(new[] { b.EventId }); + + await writer.MarkReconciledAsync(new[] { a.EventId, b.EventId, c.EventId }); + + using var connection = OpenVerifierConnection(dataSource); + using var cmd = connection.CreateCommand(); + cmd.CommandText = "SELECT ForwardState, COUNT(*) FROM AuditLog GROUP BY ForwardState;"; + using var reader = cmd.ExecuteReader(); + var byState = new Dictionary(); + while (reader.Read()) + { + byState[reader.GetString(0)] = reader.GetInt64(1); + } + + Assert.Equal(3, byState[AuditForwardState.Reconciled.ToString()]); + Assert.False(byState.ContainsKey(AuditForwardState.Pending.ToString())); + Assert.False(byState.ContainsKey(AuditForwardState.Forwarded.ToString())); + } + + [Fact] + public async Task MarkReconciledAsync_Idempotent_LeavesAlreadyReconciledRowsUntouched() + { + var (writer, dataSource) = CreateWriter(nameof(MarkReconciledAsync_Idempotent_LeavesAlreadyReconciledRowsUntouched)); + await using var _ = writer; + + var a = NewEvent(); + await writer.WriteAsync(a); + await writer.MarkReconciledAsync(new[] { a.EventId }); + // Re-call must not throw and must leave the single row Reconciled. + await writer.MarkReconciledAsync(new[] { a.EventId }); + + using var connection = OpenVerifierConnection(dataSource); + using var cmd = connection.CreateCommand(); + cmd.CommandText = "SELECT ForwardState FROM AuditLog WHERE EventId = $id;"; + cmd.Parameters.AddWithValue("$id", a.EventId.ToString()); + + Assert.Equal(AuditForwardState.Reconciled.ToString(), cmd.ExecuteScalar() as string); + } + + [Fact] + public async Task MarkReconciledAsync_NonExistentId_NoThrow() + { + var (writer, _) = CreateWriter(nameof(MarkReconciledAsync_NonExistentId_NoThrow)); + await using var _w = writer; + + await writer.MarkReconciledAsync(new[] { Guid.NewGuid(), Guid.NewGuid() }); + // Completes without throwing. + } } diff --git a/tests/ScadaLink.AuditLog.Tests/Site/Telemetry/SiteAuditTelemetryActorTests.cs b/tests/ScadaLink.AuditLog.Tests/Site/Telemetry/SiteAuditTelemetryActorTests.cs index f8bef38..8d5d555 100644 --- a/tests/ScadaLink.AuditLog.Tests/Site/Telemetry/SiteAuditTelemetryActorTests.cs +++ b/tests/ScadaLink.AuditLog.Tests/Site/Telemetry/SiteAuditTelemetryActorTests.cs @@ -7,6 +7,7 @@ using NSubstitute; using NSubstitute.ExceptionExtensions; using ScadaLink.AuditLog.Site.Telemetry; using ScadaLink.Commons.Entities.Audit; +using ScadaLink.Commons.Interfaces.Services; using ScadaLink.Commons.Types.Enums; using ScadaLink.Communication.Grpc; diff --git a/tests/ScadaLink.Communication.Tests/SiteStreamPullAuditEventsTests.cs b/tests/ScadaLink.Communication.Tests/SiteStreamPullAuditEventsTests.cs new file mode 100644 index 0000000..d9a6ac2 --- /dev/null +++ b/tests/ScadaLink.Communication.Tests/SiteStreamPullAuditEventsTests.cs @@ -0,0 +1,185 @@ +using Akka.TestKit.Xunit2; +using Google.Protobuf.WellKnownTypes; +using Grpc.Core; +using Microsoft.Extensions.Logging.Abstractions; +using NSubstitute; +using NSubstitute.ExceptionExtensions; +using ScadaLink.Commons.Entities.Audit; +using ScadaLink.Commons.Interfaces.Services; +using ScadaLink.Commons.Types.Enums; +using ScadaLink.Communication.Grpc; + +namespace ScadaLink.Communication.Tests; + +/// +/// Bundle A A2 tests for . +/// Verifies the request → ISiteAuditQueue.ReadPendingSinceAsync → response → +/// MarkReconciledAsync round-trip through the gRPC handler. The queue is an +/// NSubstitute stub so the tests never touch SQLite. +/// +public class SiteStreamPullAuditEventsTests : TestKit +{ + private readonly ISiteStreamSubscriber _subscriber = Substitute.For(); + + private SiteStreamGrpcServer CreateServer() => + new(_subscriber, NullLogger.Instance); + + private static ServerCallContext NewContext(CancellationToken ct = default) + { + var context = Substitute.For(); + context.CancellationToken.Returns(ct); + return context; + } + + private static AuditEvent NewEvent(DateTime? occurredAt = null) => new() + { + EventId = Guid.NewGuid(), + OccurredAtUtc = occurredAt + ?? DateTime.SpecifyKind(new DateTime(2026, 5, 20, 10, 0, 0), DateTimeKind.Utc), + Channel = AuditChannel.ApiOutbound, + Kind = AuditKind.ApiCall, + Status = AuditStatus.Delivered, + SourceSiteId = "site-1", + PayloadTruncated = false, + ForwardState = AuditForwardState.Pending, + }; + + [Fact] + public async Task PullAuditEvents_NoQueueWired_ReturnsEmptyResponse() + { + var server = CreateServer(); + // Intentionally do NOT call SetSiteAuditQueue — simulates a central-only + // host or a wiring-incomplete startup window. + + var request = new PullAuditEventsRequest + { + SinceUtc = Timestamp.FromDateTime(DateTime.UtcNow.AddMinutes(-5)), + BatchSize = 100, + }; + + var response = await server.PullAuditEvents(request, NewContext()); + + Assert.Empty(response.Events); + Assert.False(response.MoreAvailable); + } + + [Fact] + public async Task PullAuditEvents_With5PendingRows_ReturnsAllFiveDtos_AndFlipsToReconciled() + { + var queue = Substitute.For(); + var events = Enumerable.Range(0, 5).Select(_ => NewEvent()).ToList(); + queue.ReadPendingSinceAsync(Arg.Any(), Arg.Any(), Arg.Any()) + .Returns((IReadOnlyList)events); + + var server = CreateServer(); + server.SetSiteAuditQueue(queue); + + var request = new PullAuditEventsRequest + { + SinceUtc = Timestamp.FromDateTime(DateTime.UtcNow.AddHours(-1)), + BatchSize = 100, // larger than returned count so MoreAvailable should be false + }; + + var response = await server.PullAuditEvents(request, NewContext()); + + Assert.Equal(5, response.Events.Count); + Assert.False(response.MoreAvailable); // 5 < 100 + var expectedIds = events.Select(e => e.EventId.ToString()).ToHashSet(); + Assert.True(expectedIds.SetEquals(response.Events.Select(d => d.EventId).ToHashSet())); + + // Verify MarkReconciledAsync received the same 5 ids (best-effort flip). + await queue.Received(1).MarkReconciledAsync( + Arg.Is>(ids => ids.Count == 5 && + ids.ToHashSet().SetEquals(events.Select(e => e.EventId))), + Arg.Any()); + } + + [Fact] + public async Task PullAuditEvents_RowsOlderThanSinceUtc_Excluded() + { + // The handler delegates the since-utc filter to ReadPendingSinceAsync; + // this test verifies it passes the request value through verbatim + // (no clock skew, no off-by-one) and that an empty queue response + // yields an empty gRPC response. + var queue = Substitute.For(); + var capturedSince = DateTime.MinValue; + queue.ReadPendingSinceAsync(Arg.Any(), Arg.Any(), Arg.Any()) + .Returns(call => + { + capturedSince = call.ArgAt(0); + return (IReadOnlyList)Array.Empty(); + }); + + var server = CreateServer(); + server.SetSiteAuditQueue(queue); + + var since = DateTime.SpecifyKind(new DateTime(2026, 5, 20, 9, 30, 0), DateTimeKind.Utc); + var request = new PullAuditEventsRequest + { + SinceUtc = Timestamp.FromDateTime(since), + BatchSize = 50, + }; + + var response = await server.PullAuditEvents(request, NewContext()); + + Assert.Empty(response.Events); + Assert.False(response.MoreAvailable); + Assert.Equal(since, capturedSince); + // Empty result → no MarkReconciledAsync call (no rows to flip). + await queue.DidNotReceive().MarkReconciledAsync( + Arg.Any>(), Arg.Any()); + } + + [Fact] + public async Task PullAuditEvents_BatchSize3_Returns3Rows_MoreAvailableTrue() + { + var queue = Substitute.For(); + var events = Enumerable.Range(0, 3).Select(_ => NewEvent()).ToList(); + queue.ReadPendingSinceAsync(Arg.Any(), Arg.Any(), Arg.Any()) + .Returns((IReadOnlyList)events); + + var server = CreateServer(); + server.SetSiteAuditQueue(queue); + + var request = new PullAuditEventsRequest + { + SinceUtc = Timestamp.FromDateTime(DateTime.UtcNow.AddHours(-1)), + BatchSize = 3, + }; + + var response = await server.PullAuditEvents(request, NewContext()); + + Assert.Equal(3, response.Events.Count); + // saturated batch → central needs to know to issue a follow-up pull + Assert.True(response.MoreAvailable); + } + + [Fact] + public async Task PullAuditEvents_MarkReconciledThrows_ResponseStillReturned() + { + // The Reconciled flip is best-effort — if it fails, the response must + // still surface so central can ingest the rows (and dedup on EventId + // when it pulls them again). + var queue = Substitute.For(); + var events = Enumerable.Range(0, 2).Select(_ => NewEvent()).ToList(); + queue.ReadPendingSinceAsync(Arg.Any(), Arg.Any(), Arg.Any()) + .Returns((IReadOnlyList)events); + queue.MarkReconciledAsync(Arg.Any>(), Arg.Any()) + .ThrowsAsync(new InvalidOperationException("SQLite disposed mid-call")); + + var server = CreateServer(); + server.SetSiteAuditQueue(queue); + + var request = new PullAuditEventsRequest + { + SinceUtc = Timestamp.FromDateTime(DateTime.UtcNow.AddHours(-1)), + BatchSize = 100, + }; + + // Must NOT throw — the response is built before the flip and returned + // regardless of the flip outcome. + var response = await server.PullAuditEvents(request, NewContext()); + + Assert.Equal(2, response.Events.Count); + } +} From c763bd9a047bf5712afed088e3c010cb2b06f411 Mon Sep 17 00:00:00 2001 From: Joseph Doherty Date: Wed, 20 May 2026 18:10:42 -0400 Subject: [PATCH 04/16] feat(auditlog): SiteAuditReconciliationActor central singleton (#23 M6) --- .../Central/IPullAuditEventsClient.cs | 45 ++ .../Central/ISiteEnumerator.cs | 34 ++ .../Central/SiteAuditReconciliationActor.cs | 324 +++++++++++++ .../Central/SiteAuditReconciliationOptions.cs | 60 +++ .../SiteAuditReconciliationActorTests.cs | 438 ++++++++++++++++++ 5 files changed, 901 insertions(+) create mode 100644 src/ScadaLink.AuditLog/Central/IPullAuditEventsClient.cs create mode 100644 src/ScadaLink.AuditLog/Central/ISiteEnumerator.cs create mode 100644 src/ScadaLink.AuditLog/Central/SiteAuditReconciliationActor.cs create mode 100644 src/ScadaLink.AuditLog/Central/SiteAuditReconciliationOptions.cs create mode 100644 tests/ScadaLink.AuditLog.Tests/Central/SiteAuditReconciliationActorTests.cs diff --git a/src/ScadaLink.AuditLog/Central/IPullAuditEventsClient.cs b/src/ScadaLink.AuditLog/Central/IPullAuditEventsClient.cs new file mode 100644 index 0000000..e094e48 --- /dev/null +++ b/src/ScadaLink.AuditLog/Central/IPullAuditEventsClient.cs @@ -0,0 +1,45 @@ +using ScadaLink.Commons.Messages.Integration; + +namespace ScadaLink.AuditLog.Central; + +/// +/// Mockable abstraction over the central-side PullAuditEvents gRPC +/// client surface that uses to +/// fetch the next reconciliation batch from a specific site. Extracted so the +/// actor can be unit-tested against an in-memory stub without standing up a +/// real GrpcChannel per site. +/// +/// +/// +/// The production implementation (host wiring task) wraps the auto-generated +/// SiteStreamService.SiteStreamServiceClient, multiplexing one +/// GrpcChannel per site keyed on +/// . Until that wiring lands the DI +/// composition root binds a NoOp default that returns an empty response — the +/// reconciliation tick is still scheduled and the cursor logic still runs, so +/// regressions in the actor itself are caught even before the real client +/// arrives. +/// +/// +/// Implementations MUST NOT throw on transport faults that the actor can +/// tolerate (connection refused, deadline exceeded). The actor's contract is +/// "one site's failure doesn't sink the rest of the tick"; an exception still +/// won't crash the actor (the per-site try/catch catches it), but returning +/// an empty response on a known-recoverable error keeps the logs cleaner. +/// +/// +public interface IPullAuditEventsClient +{ + /// + /// Issues a PullAuditEvents RPC against the site whose endpoint + /// is registered against . Returns the next + /// batch of + /// rows ordered oldest-first AND a MoreAvailable flag the actor + /// uses to decide whether to fire another pull immediately. + /// + Task PullAsync( + string siteId, + DateTime sinceUtc, + int batchSize, + CancellationToken ct); +} diff --git a/src/ScadaLink.AuditLog/Central/ISiteEnumerator.cs b/src/ScadaLink.AuditLog/Central/ISiteEnumerator.cs new file mode 100644 index 0000000..9e9607c --- /dev/null +++ b/src/ScadaLink.AuditLog/Central/ISiteEnumerator.cs @@ -0,0 +1,34 @@ +namespace ScadaLink.AuditLog.Central; + +/// +/// Enumeration surface consumed by to +/// discover which sites to poll on each reconciliation tick. Extracted so the +/// actor can be unit-tested against a static list without depending on the +/// production ISiteRepository + EF Core DbContext. +/// +/// +/// The production implementation wraps ISiteRepository.GetAllSitesAsync +/// and projects each Site to a using the +/// site's configured GrpcNodeAAddress (falling back to +/// GrpcNodeBAddress when NodeA is unset). Sites with NO gRPC address +/// configured are silently skipped — the reconciliation pull cannot reach +/// them, but absence of an address is a configuration decision, not a runtime +/// error. +/// +public interface ISiteEnumerator +{ + /// + /// Returns the current set of sites the reconciliation puller should visit + /// on the next tick. Implementations should reflect adds/removes promptly + /// — the actor calls this once per tick. + /// + Task> EnumerateAsync(CancellationToken ct = default); +} + +/// +/// One reconciliation target: the site identifier the actor uses as the +/// cursor key and the gRPC endpoint dials +/// to issue the pull. Endpoint is the bare authority (e.g. http://siteA:8083); +/// transport selection (TLS, keepalive, etc.) is the client's concern. +/// +public sealed record SiteEntry(string SiteId, string GrpcEndpoint); diff --git a/src/ScadaLink.AuditLog/Central/SiteAuditReconciliationActor.cs b/src/ScadaLink.AuditLog/Central/SiteAuditReconciliationActor.cs new file mode 100644 index 0000000..6460c4d --- /dev/null +++ b/src/ScadaLink.AuditLog/Central/SiteAuditReconciliationActor.cs @@ -0,0 +1,324 @@ +using Akka.Actor; +using Microsoft.Extensions.DependencyInjection; +using Microsoft.Extensions.Logging; +using Microsoft.Extensions.Options; +using ScadaLink.Commons.Entities.Audit; +using ScadaLink.Commons.Interfaces.Repositories; + +namespace ScadaLink.AuditLog.Central; + +/// +/// Central singleton (M6 Bundle B) that drives the audit-log reconciliation +/// pull loop. On a configurable timer (default 5 minutes) the actor walks every +/// known site, asks the site for any rows with +/// >= the site's last reconciled +/// cursor, ingests them idempotently into the central +/// , and advances the cursor. +/// +/// +/// +/// Self-healing telemetry, not a dispatcher. The push path +/// ( + +/// IngestAuditEvents) is the primary mechanism. This actor exists so a +/// missed push (gRPC blip, central restart, site offline) is eventually +/// repaired by central re-pulling whatever the site still has in +/// Pending/Forwarded state. Idempotency on +/// (M2 Bundle A's race-fix) makes duplicate +/// arrivals from both paths a silent no-op. +/// +/// +/// Cursor lifetime. The per-site LastReconciledAt watermark is +/// kept in-memory for the actor's lifetime. The cluster singleton normally +/// survives the host process; on a deliberate failover OR a singleton restart +/// the cursors reset to . That is conservative +/// but correct — the next tick simply asks for everything the site still has, +/// and idempotent ingest swallows the dupes. Persisting cursors to MS SQL was +/// considered and rejected for M6: the cost of a write per tick outweighs the +/// rare benefit of avoiding one over-broad pull after a restart. +/// +/// +/// Stalled detection. The brief calls a site "stalled" when two +/// consecutive pull cycles BOTH return non-empty AND MoreAvailable=true +/// — i.e. the backlog isn't draining. The actor publishes +/// on the actor system's +/// EventStream so a future ICentralHealthCollector bridge (M6 Bundle E) +/// can flip the health metric without coupling this actor to the health +/// collection surface today. +/// +/// +/// Failure isolation. A single site that throws (DNS, transport, +/// repository write) must NOT prevent other sites from being polled on the +/// same tick. The per-site work runs inside its own try/catch; the actor's +/// supervisor strategy keeps it alive across any leaked exception with +/// 's Restart +/// semantics — restart resets the in-memory cursors, but as noted above that's +/// a safe (over-pull, idempotent) recovery. +/// +/// +/// DI scopes. is a scoped EF Core +/// service registered by AddConfigurationDatabase. The singleton actor +/// opens one DI scope per tick and reuses the same repository across all +/// sites in that tick — one DbContext per tick mirrors the +/// AuditLogIngestActor + NotificationOutboxActor pattern. +/// +/// +public class SiteAuditReconciliationActor : ReceiveActor +{ + private readonly ISiteEnumerator _sites; + private readonly IPullAuditEventsClient _client; + private readonly IServiceProvider _services; + private readonly SiteAuditReconciliationOptions _options; + private readonly ILogger _logger; + + /// + /// Per-site reconciliation watermark — the highest + /// seen for that site on a previous + /// tick. Asking for OccurredAtUtc >= cursor rather than > + /// is the site contract (); + /// duplicate-with-same-timestamp rows are filtered out by the idempotent + /// repository write. + /// + private readonly Dictionary _cursors = new(); + + /// + /// Per-site count of consecutive non-draining cycles. Resets to zero on the + /// first draining (or empty) cycle. + /// + private readonly Dictionary _nonDrainingCycles = new(); + + /// + /// Per-site latched stalled state — used so the actor only publishes a + /// transition when the + /// stalled flag actually changes, not on every tick while stalled. + /// + private readonly Dictionary _stalled = new(); + + private ICancelable? _timer; + + public SiteAuditReconciliationActor( + ISiteEnumerator sites, + IPullAuditEventsClient client, + IServiceProvider services, + IOptions options, + ILogger logger) + { + ArgumentNullException.ThrowIfNull(sites); + ArgumentNullException.ThrowIfNull(client); + ArgumentNullException.ThrowIfNull(services); + ArgumentNullException.ThrowIfNull(options); + ArgumentNullException.ThrowIfNull(logger); + + _sites = sites; + _client = client; + _services = services; + _options = options.Value; + _logger = logger; + + ReceiveAsync(_ => OnTickAsync()); + } + + protected override void PreStart() + { + base.PreStart(); + var interval = _options.ReconciliationInterval; + _timer = Context.System.Scheduler.ScheduleTellRepeatedlyCancelable( + initialDelay: interval, + interval: interval, + receiver: Self, + message: ReconciliationTick.Instance, + sender: Self); + } + + protected override void PostStop() + { + _timer?.Cancel(); + base.PostStop(); + } + + private async Task OnTickAsync() + { + IReadOnlyList sites; + try + { + sites = await _sites.EnumerateAsync().ConfigureAwait(false); + } + catch (Exception ex) + { + _logger.LogError(ex, "Site enumeration failed; skipping reconciliation tick."); + return; + } + + if (sites.Count == 0) + { + return; + } + + IServiceScope? scope = null; + IAuditLogRepository repository; + try + { + scope = _services.CreateScope(); + repository = scope.ServiceProvider.GetRequiredService(); + } + catch (Exception ex) + { + _logger.LogError(ex, "Failed to resolve IAuditLogRepository for reconciliation tick."); + scope?.Dispose(); + return; + } + + try + { + foreach (var site in sites) + { + try + { + await PullSiteAsync(site, repository).ConfigureAwait(false); + } + catch (Exception ex) + { + // Catch-all per the failure-isolation invariant: one site's + // fault must not sink the rest of the tick. The cursor for + // the failing site is left at its previous value so the + // next tick retries the same window. + _logger.LogWarning( + ex, + "Reconciliation pull failed for site {SiteId}; other sites continue.", + site.SiteId); + } + } + } + finally + { + scope.Dispose(); + } + } + + /// + /// Issues one PullAuditEvents RPC against the site, ingests the + /// returned rows idempotently into the central repository, and advances + /// the cursor based on the maximum + /// observed. The brief's "saturate until backlog clears" intent is met by + /// the natural cadence — each tick issues one pull, and a backed-up site + /// drains across consecutive ticks. The stalled signal (two non-draining + /// ticks in a row) surfaces when that drain isn't keeping up. + /// + private async Task PullSiteAsync(SiteEntry site, IAuditLogRepository repository) + { + var since = _cursors.TryGetValue(site.SiteId, out var c) ? c : DateTime.MinValue; + var response = await _client.PullAsync( + site.SiteId, since, _options.BatchSize, CancellationToken.None) + .ConfigureAwait(false); + + var maxOccurred = since; + var nowUtc = DateTime.UtcNow; + foreach (var evt in response.Events) + { + try + { + // Idempotent repository write: duplicate EventIds (from a + // concurrent push, or a retry of this very pull) collapse to + // a no-op courtesy of M2 Bundle A's race-fix on + // InsertIfNotExistsAsync. + var ingested = evt with { IngestedAtUtc = nowUtc }; + await repository.InsertIfNotExistsAsync(ingested).ConfigureAwait(false); + } + catch (Exception ex) + { + // Per-row catch so one bad event does not abandon the rest of + // the batch. The cursor still advances based on OccurredAtUtc + // — the row was returned by the site, so the next tick won't + // re-fetch it; if it permanently fails to persist, that's an + // operational concern surfaced by the log, not a hot-loop + // trigger. + _logger.LogError( + ex, + "Reconciliation ingest failed for AuditEvent {EventId} from site {SiteId}.", + evt.EventId, + site.SiteId); + } + + if (evt.OccurredAtUtc > maxOccurred) + { + maxOccurred = evt.OccurredAtUtc; + } + } + + _cursors[site.SiteId] = maxOccurred; + + var nonDraining = response.MoreAvailable && response.Events.Count > 0; + UpdateStalledState(site.SiteId, draining: !nonDraining); + } + + /// + /// Flips the per-site stalled flag based on whether this tick drained the + /// queue. A "draining" cycle is one where the server reported no more rows + /// available OR returned zero events. A "non-draining" cycle is the + /// inverse (events returned AND MoreAvailable=true). + /// + /// + /// The state machine: counter increments on each consecutive non-draining + /// tick. On reaching + /// the actor latches Stalled=true and publishes the transition; on + /// any subsequent draining tick the counter resets to zero AND, if the + /// latch is currently true, the actor publishes Stalled=false. Only + /// transitions are published — repeated ticks in the same state are + /// silent so a downstream subscriber doesn't see a flood of redundant + /// notifications. + /// + private void UpdateStalledState(string siteId, bool draining) + { + var wasStalled = _stalled.TryGetValue(siteId, out var prior) && prior; + + if (draining) + { + _nonDrainingCycles[siteId] = 0; + if (wasStalled) + { + _stalled[siteId] = false; + Context.System.EventStream.Publish( + new SiteAuditTelemetryStalledChanged(siteId, Stalled: false)); + } + return; + } + + var consecutive = _nonDrainingCycles.GetValueOrDefault(siteId) + 1; + _nonDrainingCycles[siteId] = consecutive; + + if (consecutive >= _options.StalledAfterNonDrainingCycles && !wasStalled) + { + _stalled[siteId] = true; + Context.System.EventStream.Publish( + new SiteAuditTelemetryStalledChanged(siteId, Stalled: true)); + } + } + + /// + /// Resume on any unhandled exception inside the receive — the singleton + /// MUST stay alive even if the per-tick try/catch leaks. Restart would + /// reset the cursors (safe but wasteful); Resume preserves them. + /// + protected override SupervisorStrategy SupervisorStrategy() + { + return new OneForOneStrategy( + maxNrOfRetries: 0, + withinTimeRange: TimeSpan.Zero, + decider: Akka.Actor.SupervisorStrategy.DefaultDecider); + } + + /// Self-tick triggering a reconciliation pass across all sites. + internal sealed class ReconciliationTick + { + public static readonly ReconciliationTick Instance = new(); + private ReconciliationTick() { } + } +} + +/// +/// Published on the actor system EventStream when a site's reconciliation +/// puller transitions into or out of the "stalled" state (backlog not +/// draining across multiple cycles). The M6 Bundle E central health collector +/// will subscribe to this and surface +/// SiteAuditTelemetryStalled on the health-report payload. +/// +public sealed record SiteAuditTelemetryStalledChanged(string SiteId, bool Stalled); diff --git a/src/ScadaLink.AuditLog/Central/SiteAuditReconciliationOptions.cs b/src/ScadaLink.AuditLog/Central/SiteAuditReconciliationOptions.cs new file mode 100644 index 0000000..d32c5e6 --- /dev/null +++ b/src/ScadaLink.AuditLog/Central/SiteAuditReconciliationOptions.cs @@ -0,0 +1,60 @@ +namespace ScadaLink.AuditLog.Central; + +/// +/// Tuning knobs for the central singleton. +/// Defaults mirror the M6 Bundle B brief: pull every 5 minutes per site, 256 rows per +/// batch, declare a site "stalled" after two consecutive pull cycles return non-empty +/// AND MoreAvailable=true (the backlog is not draining). +/// +/// +/// +/// Per the M6 plan the reconciliation actor is the fallback when push telemetry is +/// lost; it is intentionally low-frequency. Lowering +/// in production trades MS SQL load for +/// fresher self-healing — keep the default unless a deployment can prove the extra +/// load is acceptable. +/// +/// +/// = 2 because a single non-draining +/// cycle can happen on a surge (e.g. a backed-up site replays its hot queue); the +/// stalled signal should only fire when the backlog persists across cycles, which is +/// the symptom the central health surface is asking us to detect. +/// +/// +public sealed class SiteAuditReconciliationOptions +{ + /// + /// Period of the reconciliation tick. Each tick visits every known site once. + /// + public int ReconciliationIntervalSeconds { get; set; } = 300; + + /// + /// Test-only override for finer control over the tick cadence than + /// whole-second resolution allows. When non-null, takes precedence over + /// . Not bound from config — + /// production config exposes + /// only. + /// + public TimeSpan? ReconciliationIntervalOverride { get; set; } + + /// + /// Resolves the effective tick interval, honouring the test override when + /// set. Falls back to . + /// + public TimeSpan ReconciliationInterval => + ReconciliationIntervalOverride ?? TimeSpan.FromSeconds(ReconciliationIntervalSeconds); + + /// + /// Maximum number of + /// rows requested in a single PullAuditEvents RPC call. + /// + public int BatchSize { get; set; } = 256; + + /// + /// Number of consecutive non-draining cycles (events returned AND + /// MoreAvailable=true) that must accumulate for a site before the actor + /// publishes SiteAuditTelemetryStalledChanged(Stalled: true) on the + /// EventStream. + /// + public int StalledAfterNonDrainingCycles { get; set; } = 2; +} diff --git a/tests/ScadaLink.AuditLog.Tests/Central/SiteAuditReconciliationActorTests.cs b/tests/ScadaLink.AuditLog.Tests/Central/SiteAuditReconciliationActorTests.cs new file mode 100644 index 0000000..2d77dcd --- /dev/null +++ b/tests/ScadaLink.AuditLog.Tests/Central/SiteAuditReconciliationActorTests.cs @@ -0,0 +1,438 @@ +using Akka.Actor; +using Akka.TestKit.Xunit2; +using Microsoft.EntityFrameworkCore; +using Microsoft.Extensions.DependencyInjection; +using Microsoft.Extensions.Logging.Abstractions; +using Microsoft.Extensions.Options; +using ScadaLink.AuditLog.Central; +using ScadaLink.Commons.Entities.Audit; +using ScadaLink.Commons.Interfaces.Repositories; +using ScadaLink.Commons.Messages.Integration; +using ScadaLink.Commons.Types.Audit; +using ScadaLink.Commons.Types.Enums; +using ScadaLink.ConfigurationDatabase; +using ScadaLink.ConfigurationDatabase.Repositories; +using ScadaLink.ConfigurationDatabase.Tests.Migrations; + +namespace ScadaLink.AuditLog.Tests.Central; + +/// +/// Bundle B (M6-T3) tests for . Most +/// tests substitute the with an in-memory +/// recording stub so the actor's tick / cursor / stalled state machinery can +/// be exercised in milliseconds without an MSSQL container. The duplicate / +/// idempotency assertion uses the real against +/// the so we verify InsertIfNotExistsAsync +/// actually swallows duplicate-key collisions (the M2 Bundle A race-fix the +/// reconciliation puller depends on). +/// +public class SiteAuditReconciliationActorTests : TestKit, IClassFixture +{ + private readonly MsSqlMigrationFixture _fixture; + + public SiteAuditReconciliationActorTests(MsSqlMigrationFixture fixture) + { + _fixture = fixture; + } + + private static AuditEvent NewEvent( + string siteId, + DateTime? occurredAt = null, + Guid? id = null) => new() + { + EventId = id ?? Guid.NewGuid(), + OccurredAtUtc = occurredAt ?? new DateTime(2026, 5, 20, 10, 0, 0, DateTimeKind.Utc), + Channel = AuditChannel.ApiOutbound, + Kind = AuditKind.ApiCall, + Status = AuditStatus.Delivered, + SourceSiteId = siteId, + }; + + private static SiteAuditReconciliationOptions FastTickOptions( + int batchSize = 256, + int stalledAfter = 2) => + new() + { + // 100 ms tick keeps each test under a second. AwaitAssert covers + // schedule jitter so a 100 ms tick has up to ~3 s to fire. + ReconciliationIntervalSeconds = 300, + ReconciliationIntervalOverride = TimeSpan.FromMilliseconds(100), + BatchSize = batchSize, + StalledAfterNonDrainingCycles = stalledAfter, + }; + + /// + /// In-memory recording stub used for non-MSSQL tests. Captures every + /// call AND deduplicates on + /// so duplicate-handling assertions don't + /// need a real database for the simple cases. + /// + private sealed class RecordingRepo : IAuditLogRepository + { + public List Inserted { get; } = new(); + private readonly HashSet _seen = new(); + public int InsertCallCount { get; private set; } + + public Task InsertIfNotExistsAsync(AuditEvent evt, CancellationToken ct = default) + { + InsertCallCount++; + if (_seen.Add(evt.EventId)) + { + Inserted.Add(evt); + } + return Task.CompletedTask; + } + + public Task> QueryAsync( + AuditLogQueryFilter filter, AuditLogPaging paging, CancellationToken ct = default) => + Task.FromResult>(Inserted); + + public Task SwitchOutPartitionAsync(DateTime monthBoundary, CancellationToken ct = default) => + Task.CompletedTask; + } + + /// + /// In-memory enumerator returning a static list of sites. + /// + private sealed class StaticEnumerator : ISiteEnumerator + { + private readonly IReadOnlyList _sites; + public StaticEnumerator(params SiteEntry[] sites) => _sites = sites; + public Task> EnumerateAsync(CancellationToken ct = default) => + Task.FromResult(_sites); + } + + /// + /// Scripted pull client — returns the next queued response for the site + /// on each call, looping the last entry if the queue is exhausted. Also + /// records every invocation so tests can assert call counts + arguments. + /// + private sealed class ScriptedPullClient : IPullAuditEventsClient + { + public List<(string SiteId, DateTime SinceUtc, int BatchSize)> Calls { get; } = new(); + private readonly Dictionary> _scripted = new(); + private readonly Dictionary _throwOnSite = new(); + + public ScriptedPullClient Script(string siteId, params PullAuditEventsResponse[] responses) + { + _scripted[siteId] = new Queue(responses); + return this; + } + + public ScriptedPullClient ThrowFor(string siteId, Exception ex) + { + _throwOnSite[siteId] = ex; + return this; + } + + public Task PullAsync( + string siteId, DateTime sinceUtc, int batchSize, CancellationToken ct) + { + Calls.Add((siteId, sinceUtc, batchSize)); + if (_throwOnSite.TryGetValue(siteId, out var ex)) + { + throw ex; + } + if (_scripted.TryGetValue(siteId, out var queue) && queue.Count > 0) + { + return Task.FromResult(queue.Dequeue()); + } + return Task.FromResult( + new PullAuditEventsResponse(Array.Empty(), MoreAvailable: false)); + } + } + + private IServiceProvider BuildScopedProvider(IAuditLogRepository repo) + { + var services = new ServiceCollection(); + // The actor opens a scope per tick and resolves IAuditLogRepository + // from that scope; registering as scoped mirrors how + // AddConfigurationDatabase wires the real repository. + services.AddScoped(_ => repo); + return services.BuildServiceProvider(); + } + + private IActorRef CreateActor( + ISiteEnumerator sites, + IPullAuditEventsClient client, + IAuditLogRepository repo, + SiteAuditReconciliationOptions options) + { + var sp = BuildScopedProvider(repo); + return Sys.ActorOf(Props.Create(() => new SiteAuditReconciliationActor( + sites, + client, + sp, + Options.Create(options), + NullLogger.Instance))); + } + + /// + /// Subscribes to the EventStream and collects every + /// publication into a list + /// the test can assert on. Uses a probe actor so the stream's + /// fire-and-forget delivery is observable from the test thread. + /// + private (Akka.TestKit.TestProbe Probe, List Captured) SubscribeStalled() + { + var probe = CreateTestProbe(); + Sys.EventStream.Subscribe(probe.Ref, typeof(SiteAuditTelemetryStalledChanged)); + var captured = new List(); + return (probe, captured); + } + + // --------------------------------------------------------------------- + // 1. Timer_Fires_OnConfiguredInterval + // --------------------------------------------------------------------- + + [Fact] + public void Timer_Fires_OnConfiguredInterval() + { + var sites = new StaticEnumerator(new SiteEntry("siteA", "http://siteA:8083")); + var client = new ScriptedPullClient(); + var repo = new RecordingRepo(); + var opts = FastTickOptions(); + + CreateActor(sites, client, repo, opts); + + // The first scheduled tick fires after `ReconciliationIntervalSeconds`, + // which is 0 for the test — Akka's scheduler still respects the + // ScheduleTellRepeatedlyCancelable contract that issues a Tell on the + // scheduler thread, so we await visible side effects (a PullAsync call) + // rather than racing on internal state. + AwaitAssert( + () => Assert.True(client.Calls.Count >= 1, $"expected >= 1 pull call, got {client.Calls.Count}"), + duration: TimeSpan.FromSeconds(3), + interval: TimeSpan.FromMilliseconds(50)); + } + + // --------------------------------------------------------------------- + // 2. Tick_PullsFromEachKnownSite + // --------------------------------------------------------------------- + + [Fact] + public void Tick_PullsFromEachKnownSite() + { + var sites = new StaticEnumerator( + new SiteEntry("siteA", "http://siteA:8083"), + new SiteEntry("siteB", "http://siteB:8083")); + var client = new ScriptedPullClient(); + var repo = new RecordingRepo(); + + CreateActor(sites, client, repo, FastTickOptions()); + + AwaitAssert(() => + { + Assert.Contains(client.Calls, c => c.SiteId == "siteA"); + Assert.Contains(client.Calls, c => c.SiteId == "siteB"); + }, + duration: TimeSpan.FromSeconds(3), + interval: TimeSpan.FromMilliseconds(50)); + } + + // --------------------------------------------------------------------- + // 3. Tick_IngestEvents_ViaInsertIfNotExistsAsync + // --------------------------------------------------------------------- + + [Fact] + public void Tick_IngestEvents_ViaInsertIfNotExistsAsync() + { + var sites = new StaticEnumerator(new SiteEntry("siteA", "http://siteA:8083")); + var e1 = NewEvent("siteA"); + var e2 = NewEvent("siteA"); + var client = new ScriptedPullClient().Script("siteA", + new PullAuditEventsResponse(new[] { e1, e2 }, MoreAvailable: false)); + var repo = new RecordingRepo(); + + CreateActor(sites, client, repo, FastTickOptions()); + + AwaitAssert(() => Assert.Equal(2, repo.InsertCallCount), + duration: TimeSpan.FromSeconds(3), + interval: TimeSpan.FromMilliseconds(50)); + Assert.Contains(repo.Inserted, e => e.EventId == e1.EventId); + Assert.Contains(repo.Inserted, e => e.EventId == e2.EventId); + } + + // --------------------------------------------------------------------- + // 4. Tick_Duplicates_NotDoubleInserted (real MSSQL idempotency) + // --------------------------------------------------------------------- + + private ScadaLinkDbContext CreateContext() => + new(new DbContextOptionsBuilder() + .UseSqlServer(_fixture.ConnectionString).Options); + + [SkippableFact] + public async Task Tick_Duplicates_NotDoubleInserted() + { + Skip.IfNot(_fixture.Available, _fixture.SkipReason); + + var siteId = "bundle-b-" + Guid.NewGuid().ToString("N").Substring(0, 8); + var pre = NewEvent(siteId); + + // Seed the row directly so the actor sees it already present when the + // pull returns it. + await using (var seedContext = CreateContext()) + { + await new AuditLogRepository(seedContext).InsertIfNotExistsAsync(pre); + } + + // Stack one new and the pre-existing row in the pull response. The + // second-pull script returns empty so the actor settles. + var fresh = NewEvent(siteId); + var sites = new StaticEnumerator(new SiteEntry(siteId, "http://x:8083")); + var client = new ScriptedPullClient().Script(siteId, + new PullAuditEventsResponse(new[] { pre, fresh }, MoreAvailable: false)); + + await using var context = CreateContext(); + var repo = new AuditLogRepository(context); + + CreateActor(sites, client, repo, FastTickOptions()); + + // Wait for the actor to ingest both rows. + await Task.Delay(TimeSpan.FromSeconds(1)); + AwaitAssert(() => Assert.True(client.Calls.Count >= 1), + duration: TimeSpan.FromSeconds(3)); + + // Even though the pull returned 2 events, only 1 fresh row should + // exist in MSSQL alongside the pre-existing one — InsertIfNotExistsAsync + // is first-write-wins on EventId. + await using var read = CreateContext(); + var rows = await read.Set() + .Where(e => e.SourceSiteId == siteId) + .ToListAsync(); + Assert.Equal(2, rows.Count); + Assert.Contains(rows, r => r.EventId == pre.EventId); + Assert.Contains(rows, r => r.EventId == fresh.EventId); + } + + // --------------------------------------------------------------------- + // 5. Cursor_Advances_ToMaxOccurredAtUtc + // --------------------------------------------------------------------- + + [Fact] + public void Cursor_Advances_ToMaxOccurredAtUtc() + { + var sites = new StaticEnumerator(new SiteEntry("siteA", "http://siteA:8083")); + + var t1 = new DateTime(2026, 5, 20, 10, 0, 0, DateTimeKind.Utc); + var t2 = new DateTime(2026, 5, 20, 10, 1, 0, DateTimeKind.Utc); + var t3 = new DateTime(2026, 5, 20, 10, 2, 0, DateTimeKind.Utc); + var e1 = NewEvent("siteA", t1); + var e2 = NewEvent("siteA", t2); + var e3 = NewEvent("siteA", t3); + + // First pull returns three events with t1, t2, t3. Subsequent pulls + // return empty — but the test asserts the SECOND pull's since argument + // is t3 (the max OccurredAtUtc from the first pull). + var client = new ScriptedPullClient().Script("siteA", + new PullAuditEventsResponse(new[] { e1, e2, e3 }, MoreAvailable: false)); + var repo = new RecordingRepo(); + + CreateActor(sites, client, repo, FastTickOptions()); + + // Wait until we have at least two pulls — the second one must use t3 + // as its `since` argument because that was the max OccurredAtUtc in + // the first response. + AwaitAssert(() => Assert.True(client.Calls.Count >= 2, + $"need at least 2 pulls to assert cursor advancement, got {client.Calls.Count}"), + duration: TimeSpan.FromSeconds(5), + interval: TimeSpan.FromMilliseconds(50)); + + Assert.Equal(DateTime.MinValue, client.Calls[0].SinceUtc); + Assert.Equal(t3, client.Calls[1].SinceUtc); + } + + // --------------------------------------------------------------------- + // 6. Tick_OneSiteThrows_OtherSitesStillProcessed + // --------------------------------------------------------------------- + + [Fact] + public void Tick_OneSiteThrows_OtherSitesStillProcessed() + { + var sites = new StaticEnumerator( + new SiteEntry("siteA", "http://siteA:8083"), + new SiteEntry("siteB", "http://siteB:8083")); + + var bEvent = NewEvent("siteB"); + var client = new ScriptedPullClient() + .ThrowFor("siteA", new InvalidOperationException("simulated transport failure")) + .Script("siteB", + new PullAuditEventsResponse(new[] { bEvent }, MoreAvailable: false)); + var repo = new RecordingRepo(); + + CreateActor(sites, client, repo, FastTickOptions()); + + AwaitAssert(() => + { + Assert.Contains(client.Calls, c => c.SiteId == "siteA"); + Assert.Contains(repo.Inserted, e => e.EventId == bEvent.EventId); + }, + duration: TimeSpan.FromSeconds(3), + interval: TimeSpan.FromMilliseconds(50)); + } + + // --------------------------------------------------------------------- + // 7. StalledDetection_TwoConsecutiveNonDrainingCycles_PublishesStalledTrue + // --------------------------------------------------------------------- + + [Fact] + public void StalledDetection_TwoConsecutiveNonDrainingCycles_PublishesStalledTrue() + { + var sites = new StaticEnumerator(new SiteEntry("siteA", "http://siteA:8083")); + + // Two scripted responses that each return events AND MoreAvailable=true + // — the second pull triggers the stalled transition. + var batch1 = Enumerable.Range(0, 3).Select(_ => NewEvent("siteA")).ToArray(); + var batch2 = Enumerable.Range(0, 3).Select(_ => NewEvent("siteA")).ToArray(); + var client = new ScriptedPullClient().Script("siteA", + new PullAuditEventsResponse(batch1, MoreAvailable: true), + new PullAuditEventsResponse(batch2, MoreAvailable: true)); + + var repo = new RecordingRepo(); + var (probe, _) = SubscribeStalled(); + + CreateActor(sites, client, repo, FastTickOptions(stalledAfter: 2)); + + // Expect Stalled=true after the second non-draining tick. The probe + // waits with its own timeout (a few seconds gives the 0 s repeat + // interval ample slack). + var msg = probe.ExpectMsg(TimeSpan.FromSeconds(5)); + Assert.Equal("siteA", msg.SiteId); + Assert.True(msg.Stalled); + } + + // --------------------------------------------------------------------- + // 8. StalledDetection_DrainingCycle_PublishesStalledFalse + // --------------------------------------------------------------------- + + [Fact] + public void StalledDetection_DrainingCycle_PublishesStalledFalse() + { + var sites = new StaticEnumerator(new SiteEntry("siteA", "http://siteA:8083")); + + // Two non-draining responses get the actor into Stalled=true, then a + // draining response (events but MoreAvailable=false) flips it back. + var batch1 = Enumerable.Range(0, 3).Select(_ => NewEvent("siteA")).ToArray(); + var batch2 = Enumerable.Range(0, 3).Select(_ => NewEvent("siteA")).ToArray(); + var batch3 = Enumerable.Range(0, 3).Select(_ => NewEvent("siteA")).ToArray(); + var client = new ScriptedPullClient().Script("siteA", + new PullAuditEventsResponse(batch1, MoreAvailable: true), + new PullAuditEventsResponse(batch2, MoreAvailable: true), + new PullAuditEventsResponse(batch3, MoreAvailable: false)); + + var repo = new RecordingRepo(); + var (probe, _) = SubscribeStalled(); + + CreateActor(sites, client, repo, FastTickOptions(stalledAfter: 2)); + + // First publication is the stalled=true transition; second is the + // back-to-draining flip. The actor publishes ONLY on transitions so we + // expect exactly these two messages in order. + var first = probe.ExpectMsg(TimeSpan.FromSeconds(5)); + Assert.True(first.Stalled); + + var second = probe.ExpectMsg(TimeSpan.FromSeconds(5)); + Assert.False(second.Stalled); + Assert.Equal("siteA", second.SiteId); + } +} From 6069a20e0f0e9a43f5d95564aabf0f1479f3910c Mon Sep 17 00:00:00 2001 From: Joseph Doherty Date: Wed, 20 May 2026 18:20:55 -0400 Subject: [PATCH 05/16] fix(configdb): replace SwitchOutPartitionAsync stub with drop-and-rebuild dance (#23 M6) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replaces M1's NotSupportedException stub with the production drop-DROP-INDEX → CREATE-staging → SWITCH PARTITION → DROP-staging → CREATE-INDEX dance documented in alog.md §4. UX_AuditLog_EventId is intentionally non-aligned with ps_AuditLog_Month so single-column EventId uniqueness can be enforced cheaply for InsertIfNotExistsAsync; SQL Server rejects ALTER TABLE SWITCH while a non-aligned unique index is present, so the implementation drops it, switches the partition data into a GUID-suffixed staging table on [PRIMARY], drops staging (discarding the rows), and rebuilds the unique index — all inside an explicit transaction with a CATCH that guarantees the unique index is rebuilt regardless of failure point. Also adds GetPartitionBoundariesOlderThanAsync to IAuditLogRepository: a CROSS APPLY over sys.partition_range_values + per-partition MAX(OccurredAtUtc) to enumerate retention-eligible months for the M6 purge actor (next commit). Tests verify: * Old partition's rows are removed; other months untouched * UX_AuditLog_EventId is rebuilt after a successful switch * InsertIfNotExistsAsync's first-write-wins idempotency still holds after switch * On engineered SWITCH failure (inbound FK from a probe table), SqlException propagates AND UX_AuditLog_EventId is still present (CATCH branch ran) * GetPartitionBoundariesOlderThanAsync returns only boundaries whose partition's MAX(OccurredAtUtc) is strictly older than the threshold; empty partitions excluded --- .../Repositories/IAuditLogRepository.cs | 43 +++- .../Repositories/AuditLogRepository.cs | 203 ++++++++++++++++- .../Central/AuditLogIngestActorTests.cs | 4 + .../SiteAuditReconciliationActorTests.cs | 4 + .../Repositories/AuditLogRepositoryTests.cs | 215 +++++++++++++++++- 5 files changed, 445 insertions(+), 24 deletions(-) diff --git a/src/ScadaLink.Commons/Interfaces/Repositories/IAuditLogRepository.cs b/src/ScadaLink.Commons/Interfaces/Repositories/IAuditLogRepository.cs index 7b15962..9932c5c 100644 --- a/src/ScadaLink.Commons/Interfaces/Repositories/IAuditLogRepository.cs +++ b/src/ScadaLink.Commons/Interfaces/Repositories/IAuditLogRepository.cs @@ -45,12 +45,43 @@ public interface IAuditLogRepository /// /// Switches out (purges) the monthly partition whose lower bound is - /// . The honest M1 implementation throws - /// : the UX_AuditLog_EventId unique - /// index is non-partition-aligned (lives on [PRIMARY], not on - /// ps_AuditLog_Month), so SQL Server rejects - /// ALTER TABLE … SWITCH PARTITION until the drop-and-rebuild dance - /// shipped by the M6 purge actor is in place. + /// . /// + /// + /// + /// Drop-and-rebuild dance. UX_AuditLog_EventId is intentionally + /// non-partition-aligned (it lives on [PRIMARY] so single-column + /// EventId uniqueness — required by — + /// can be enforced cheaply). SQL Server rejects + /// ALTER TABLE … SWITCH PARTITION while a non-aligned unique index + /// is present, so the M6 implementation drops the index, creates a staging + /// table with byte-identical schema, switches the partition's data into + /// staging, drops staging (discarding the rows), and rebuilds the unique + /// index. The CATCH branch guarantees the index is rebuilt even on partial + /// failure so the table never returns to live traffic without its + /// idempotency-supporting index. + /// + /// + /// Outage window. The dance briefly removes the unique index, so + /// concurrent calls during the switch + /// could in principle race past the IF NOT EXISTS check without the index + /// catching the duplicate. This is acceptable for the daily purge cadence + /// — the inserts that the IF NOT EXISTS check guards are themselves rare + /// enough that a sub-second collision window is operationally negligible, + /// and the composite PK still rejects same-(EventId, OccurredAtUtc) rows. + /// + /// Task SwitchOutPartitionAsync(DateTime monthBoundary, CancellationToken ct = default); + + /// + /// Returns the set of pf_AuditLog_Month partition lower-bound + /// boundaries whose partitions contain only rows with + /// strictly older than + /// . Boundaries whose partition is empty are + /// excluded (a no-op switch is wasted work). Used by the M6 purge actor + /// to enumerate retention-eligible months on every tick. + /// + Task> GetPartitionBoundariesOlderThanAsync( + DateTime threshold, + CancellationToken ct = default); } diff --git a/src/ScadaLink.ConfigurationDatabase/Repositories/AuditLogRepository.cs b/src/ScadaLink.ConfigurationDatabase/Repositories/AuditLogRepository.cs index d88271f..9dc2f41 100644 --- a/src/ScadaLink.ConfigurationDatabase/Repositories/AuditLogRepository.cs +++ b/src/ScadaLink.ConfigurationDatabase/Repositories/AuditLogRepository.cs @@ -179,18 +179,199 @@ VALUES } /// - /// M1 honest contract: throws . The - /// UX_AuditLog_EventId unique index is non-aligned with - /// ps_AuditLog_Month (it lives on [PRIMARY] to keep - /// cheap), and SQL Server rejects - /// ALTER TABLE … SWITCH PARTITION when a non-aligned index is present. - /// The drop-and-rebuild dance that makes the switch legal ships with the M6 - /// purge actor. + /// M6-T4 production implementation of the drop-and-rebuild dance documented + /// on . /// - public Task SwitchOutPartitionAsync(DateTime monthBoundary, CancellationToken ct = default) + /// + /// + /// The staging table name is GUID-suffixed so concurrent purge attempts on + /// different boundaries cannot collide. The staging schema is byte-identical + /// to the live AuditLog table (same column types, lengths, + /// nullability, and clustered-key shape) — SQL Server's + /// ALTER TABLE … SWITCH PARTITION rejects any drift. Keep this CREATE + /// in sync with both the migration that ships the live table + /// (20260520142214_AddAuditLogTable) and + /// AuditLogEntityTypeConfiguration. + /// + /// + /// All five steps run inside an explicit transaction so the SWITCH + + /// staging-DROP are atomic from the perspective of a consumer reading via + /// snapshot isolation; the CATCH rolls back and runs an idempotent + /// "rebuild UX_AuditLog_EventId if it doesn't exist" so a partial failure + /// never leaves the live table without its idempotency-supporting unique + /// index. + /// + /// + public async Task SwitchOutPartitionAsync(DateTime monthBoundary, CancellationToken ct = default) { - throw new NotSupportedException( - "AuditLog partition switch is blocked by the non-aligned UX_AuditLog_EventId " + - "unique index; the drop-and-rebuild dance ships in M6 (purge actor)."); + // GUID-suffixed staging name: prevents collision with any concurrent + // purge attempt and avoids polluting the AuditLog object namespace with + // a predictable identifier. + var stagingTableName = $"AuditLog_Staging_{Guid.NewGuid():N}"; + + // ISO 8601 in UTC — SQL Server's datetime2 literal parser accepts this + // unambiguously and the value is round-trip-safe across SET DATEFORMAT + // settings. + var monthBoundaryStr = monthBoundary.ToUniversalTime().ToString("yyyy-MM-dd HH:mm:ss"); + + var sql = $@" + BEGIN TRY + BEGIN TRANSACTION; + + -- 1. Drop the non-aligned unique index. ALTER TABLE SWITCH refuses + -- to run while it exists. + IF EXISTS (SELECT 1 FROM sys.indexes WHERE name = 'UX_AuditLog_EventId' AND object_id = OBJECT_ID('dbo.AuditLog')) + DROP INDEX UX_AuditLog_EventId ON dbo.AuditLog; + + -- 2. Staging table on [PRIMARY] (non-partitioned) with column shapes + -- byte-identical to dbo.AuditLog. Any drift here causes SWITCH to + -- reject the operation with msg 4904/4915. + CREATE TABLE dbo.[{stagingTableName}] ( + EventId uniqueidentifier NOT NULL, + OccurredAtUtc datetime2(7) NOT NULL, + IngestedAtUtc datetime2(7) NULL, + Channel varchar(32) NOT NULL, + Kind varchar(32) NOT NULL, + CorrelationId uniqueidentifier NULL, + SourceSiteId varchar(64) NULL, + SourceInstanceId varchar(128) NULL, + SourceScript varchar(128) NULL, + Actor varchar(128) NULL, + Target varchar(256) NULL, + Status varchar(32) NOT NULL, + HttpStatus int NULL, + DurationMs int NULL, + ErrorMessage nvarchar(1024) NULL, + ErrorDetail nvarchar(max) NULL, + RequestSummary nvarchar(max) NULL, + ResponseSummary nvarchar(max) NULL, + PayloadTruncated bit NOT NULL, + Extra nvarchar(max) NULL, + ForwardState varchar(32) NULL, + CONSTRAINT PK_{stagingTableName} PRIMARY KEY CLUSTERED (EventId, OccurredAtUtc) + ) ON [PRIMARY]; + + -- 3. Switch the partition out. $partition.pf_AuditLog_Month returns + -- the partition number that contains the supplied boundary value; + -- SWITCH PARTITION N moves that partition's pages to the staging + -- table (metadata-only, no row copying). + DECLARE @partitionNumber int = $partition.pf_AuditLog_Month('{monthBoundaryStr}'); + DECLARE @sql nvarchar(max) = 'ALTER TABLE dbo.AuditLog SWITCH PARTITION ' + CAST(@partitionNumber AS nvarchar(10)) + ' TO dbo.[{stagingTableName}];'; + EXEC sp_executesql @sql; + + -- 4. Drop staging — the rows are discarded here. This is the purge. + DROP TABLE dbo.[{stagingTableName}]; + + -- 5. Rebuild the non-aligned unique index. Live traffic that hit the + -- table during steps 1-4 saw composite-PK uniqueness only; from + -- here on, single-column EventId uniqueness is restored. + CREATE UNIQUE NONCLUSTERED INDEX UX_AuditLog_EventId ON dbo.AuditLog (EventId) ON [PRIMARY]; + + COMMIT TRANSACTION; + END TRY + BEGIN CATCH + IF @@TRANCOUNT > 0 ROLLBACK TRANSACTION; + + -- Best-effort staging cleanup. The DROP INDEX in step 1 is now + -- rolled back (so the index is back), but the staging table from + -- step 2 may or may not survive the rollback depending on the + -- failure point. Guard the DROP so a missing staging table doesn't + -- mask the original error. + IF OBJECT_ID('dbo.[{stagingTableName}]', 'U') IS NOT NULL DROP TABLE dbo.[{stagingTableName}]; + + -- Idempotent index rebuild — covers the niche case where ROLLBACK + -- failed to restore UX_AuditLog_EventId (or the failure happened + -- AFTER the COMMIT, which shouldn't be possible inside this TRY + -- but is cheap insurance). Without this, a failed switch could + -- leave the live table without its idempotency-supporting index. + IF NOT EXISTS (SELECT 1 FROM sys.indexes WHERE name = 'UX_AuditLog_EventId' AND object_id = OBJECT_ID('dbo.AuditLog')) + CREATE UNIQUE NONCLUSTERED INDEX UX_AuditLog_EventId ON dbo.AuditLog (EventId) ON [PRIMARY]; + + -- Surface the original error to the caller — the purge actor logs + -- and continues with the next boundary. + THROW; + END CATCH;"; + + await _context.Database.ExecuteSqlRawAsync(sql, ct); + } + + /// + /// Returns the set of pf_AuditLog_Month boundaries whose partition's + /// MAX(OccurredAtUtc) is strictly older than . + /// Boundaries with empty partitions are excluded — purging an empty + /// partition is wasted I/O. + /// + /// + /// + /// The CTE pulls every boundary value defined by the partition function and + /// joins it (via $PARTITION.pf_AuditLog_Month) to the live AuditLog + /// to compute per-partition MAX(OccurredAtUtc). The outer filter + /// keeps only those whose MAX is non-NULL (partition has rows) AND strictly + /// less than the threshold (every row is past retention). + /// + /// + /// Note: the query scans the live OccurredAtUtc column to compute + /// the MAX per partition. With IX_AuditLog_OccurredAtUtc on the + /// partition-aligned scheme this is a single index seek per partition; for + /// 24 partitions and a daily purge cadence the cost is negligible. + /// + /// + public async Task> GetPartitionBoundariesOlderThanAsync( + DateTime threshold, + CancellationToken ct = default) + { + var thresholdUtc = threshold.ToUniversalTime(); + var thresholdStr = thresholdUtc.ToString("yyyy-MM-dd HH:mm:ss.fffffff"); + + // Per-partition MAX over the live table. We materialise the boundary + // list first (24 rows) then LEFT JOIN to the MAX aggregate so empty + // partitions surface as NULL and get filtered out by the WHERE clause. + var sql = $@" + WITH Boundaries AS ( + SELECT CAST(rv.value AS datetime2(7)) AS BoundaryValue, + rv.boundary_id AS BoundaryId + FROM sys.partition_range_values rv + INNER JOIN sys.partition_functions pf ON rv.function_id = pf.function_id + WHERE pf.name = 'pf_AuditLog_Month' + ) + SELECT b.BoundaryValue + FROM Boundaries b + CROSS APPLY ( + SELECT MAX(a.OccurredAtUtc) AS MaxOccurredAt + FROM dbo.AuditLog a + WHERE $PARTITION.pf_AuditLog_Month(a.OccurredAtUtc) = b.BoundaryId + 1 + ) x + WHERE x.MaxOccurredAt IS NOT NULL + AND x.MaxOccurredAt < CAST('{thresholdStr}' AS datetime2(7)) + ORDER BY b.BoundaryValue;"; + + var conn = _context.Database.GetDbConnection(); + var openedHere = false; + if (conn.State != System.Data.ConnectionState.Open) + { + await conn.OpenAsync(ct).ConfigureAwait(false); + openedHere = true; + } + + var results = new List(); + try + { + await using var cmd = conn.CreateCommand(); + cmd.CommandText = sql; + await using var reader = await cmd.ExecuteReaderAsync(ct).ConfigureAwait(false); + while (await reader.ReadAsync(ct).ConfigureAwait(false)) + { + results.Add(reader.GetDateTime(0)); + } + } + finally + { + if (openedHere) + { + await conn.CloseAsync().ConfigureAwait(false); + } + } + + return results; } } diff --git a/tests/ScadaLink.AuditLog.Tests/Central/AuditLogIngestActorTests.cs b/tests/ScadaLink.AuditLog.Tests/Central/AuditLogIngestActorTests.cs index 36de05f..203fb6d 100644 --- a/tests/ScadaLink.AuditLog.Tests/Central/AuditLogIngestActorTests.cs +++ b/tests/ScadaLink.AuditLog.Tests/Central/AuditLogIngestActorTests.cs @@ -216,5 +216,9 @@ public class AuditLogIngestActorTests : TestKit, IClassFixture _inner.SwitchOutPartitionAsync(monthBoundary, ct); + + public Task> GetPartitionBoundariesOlderThanAsync( + DateTime threshold, CancellationToken ct = default) => + _inner.GetPartitionBoundariesOlderThanAsync(threshold, ct); } } diff --git a/tests/ScadaLink.AuditLog.Tests/Central/SiteAuditReconciliationActorTests.cs b/tests/ScadaLink.AuditLog.Tests/Central/SiteAuditReconciliationActorTests.cs index 2d77dcd..d1dad16 100644 --- a/tests/ScadaLink.AuditLog.Tests/Central/SiteAuditReconciliationActorTests.cs +++ b/tests/ScadaLink.AuditLog.Tests/Central/SiteAuditReconciliationActorTests.cs @@ -89,6 +89,10 @@ public class SiteAuditReconciliationActorTests : TestKit, IClassFixture Task.CompletedTask; + + public Task> GetPartitionBoundariesOlderThanAsync( + DateTime threshold, CancellationToken ct = default) => + Task.FromResult>(Array.Empty()); } /// diff --git a/tests/ScadaLink.ConfigurationDatabase.Tests/Repositories/AuditLogRepositoryTests.cs b/tests/ScadaLink.ConfigurationDatabase.Tests/Repositories/AuditLogRepositoryTests.cs index 958b2b1..df1daeb 100644 --- a/tests/ScadaLink.ConfigurationDatabase.Tests/Repositories/AuditLogRepositoryTests.cs +++ b/tests/ScadaLink.ConfigurationDatabase.Tests/Repositories/AuditLogRepositoryTests.cs @@ -1,3 +1,4 @@ +using Microsoft.Data.SqlClient; using Microsoft.EntityFrameworkCore; using ScadaLink.Commons.Entities.Audit; using ScadaLink.Commons.Types.Audit; @@ -309,21 +310,221 @@ public class AuditLogRepositoryTests : IClassFixture Assert.True(events.Select(e => e.EventId).ToHashSet().SetEquals(allIds)); } + // ------------------------------------------------------------------------ + // M6-T4 Bundle C: SwitchOutPartitionAsync drop-and-rebuild integration tests + // ------------------------------------------------------------------------ + // + // The partition-switch path replaces M1's NotSupportedException stub with + // the production drop-DROP-INDEX → CREATE-staging → SWITCH PARTITION → + // DROP-staging → CREATE-INDEX dance documented in alog.md §4. These tests + // verify the side effects an outsider can observe: + // * rows in the targeted month are removed + // * rows in OTHER months are NOT touched + // * UX_AuditLog_EventId still exists after a successful switch + // * InsertIfNotExistsAsync's first-write-wins idempotency still holds + // after a switch (the rebuilt index is real) + // * a thrown SqlException leaves UX_AuditLog_EventId rebuilt (the CATCH + // branch's recovery path runs) + [SkippableFact] - public async Task SwitchOutPartitionAsync_ThrowsNotSupported_ForM1() + public async Task SwitchOutPartitionAsync_OldPartition_RemovesRows_NewPartitionsKept() + { + Skip.IfNot(_fixture.Available, _fixture.SkipReason); + + var siteId = NewSiteId(); + await using var context = CreateContext(); + var repo = new AuditLogRepository(context); + + // Three distinct months — Jan, Feb, Mar 2026 — so the switch on Jan's + // boundary purges exactly one month's worth of rows. Boundary values + // come from the partition function's pre-seeded list (alog.md §4). + var janEvt = NewEvent(siteId, occurredAtUtc: new DateTime(2026, 1, 15, 10, 0, 0, DateTimeKind.Utc)); + var febEvt = NewEvent(siteId, occurredAtUtc: new DateTime(2026, 2, 15, 10, 0, 0, DateTimeKind.Utc)); + var marEvt = NewEvent(siteId, occurredAtUtc: new DateTime(2026, 3, 15, 10, 0, 0, DateTimeKind.Utc)); + await repo.InsertIfNotExistsAsync(janEvt); + await repo.InsertIfNotExistsAsync(febEvt); + await repo.InsertIfNotExistsAsync(marEvt); + + // Boundary value '2026-01-01' identifies the January 2026 partition under + // RANGE RIGHT semantics ($PARTITION returns the partition into which the + // boundary value itself falls — the partition whose lower bound is the + // boundary). + await repo.SwitchOutPartitionAsync(new DateTime(2026, 1, 1, 0, 0, 0, DateTimeKind.Utc)); + + await using var readContext = CreateContext(); + var remaining = await readContext.Set() + .Where(e => e.SourceSiteId == siteId) + .ToListAsync(); + + Assert.DoesNotContain(remaining, e => e.EventId == janEvt.EventId); + Assert.Contains(remaining, e => e.EventId == febEvt.EventId); + Assert.Contains(remaining, e => e.EventId == marEvt.EventId); + } + + [SkippableFact] + public async Task SwitchOutPartitionAsync_RebuildsUxIndex_AfterSwitch() { Skip.IfNot(_fixture.Available, _fixture.SkipReason); await using var context = CreateContext(); var repo = new AuditLogRepository(context); - // The partition-switch path is intentionally blocked in M1 because - // UX_AuditLog_EventId is non-aligned. The drop-and-rebuild dance ships - // with the M6 purge actor. - var ex = await Assert.ThrowsAsync( - () => repo.SwitchOutPartitionAsync(new DateTime(2026, 2, 1, 0, 0, 0, DateTimeKind.Utc))); + // Pick a different month per test so successive test runs (which share + // the fixture's MSSQL database) don't tread on each other. + await repo.SwitchOutPartitionAsync(new DateTime(2026, 4, 1, 0, 0, 0, DateTimeKind.Utc)); - Assert.Contains("M6", ex.Message, StringComparison.OrdinalIgnoreCase); + await using var verifyContext = CreateContext(); + var indexExists = await ScalarAsync( + verifyContext, + "SELECT COUNT(*) FROM sys.indexes " + + "WHERE name = 'UX_AuditLog_EventId' AND object_id = OBJECT_ID('dbo.AuditLog');"); + Assert.Equal(1, indexExists); + } + + [SkippableFact] + public async Task SwitchOutPartitionAsync_InsertIfNotExistsAsync_StillEnforcesFirstWriteWins_AfterSwitch() + { + Skip.IfNot(_fixture.Available, _fixture.SkipReason); + + var siteId = NewSiteId(); + await using var context = CreateContext(); + var repo = new AuditLogRepository(context); + + // Pre-existing row in May 2026 — must survive a switch on a different + // (older) partition. + var preExisting = NewEvent(siteId, occurredAtUtc: new DateTime(2026, 5, 20, 9, 0, 0, DateTimeKind.Utc)); + await repo.InsertIfNotExistsAsync(preExisting); + + // Switch out the June 2026 partition (different month, empty). + await repo.SwitchOutPartitionAsync(new DateTime(2026, 6, 1, 0, 0, 0, DateTimeKind.Utc)); + + // Re-attempting the same EventId after the switch must STILL be a no-op + // (UX_AuditLog_EventId is the index that enables idempotency; if the + // rebuild left it broken, this insert would silently produce a duplicate + // row and the count assertion below would catch it). + var dup = preExisting with { ErrorMessage = "second-should-be-ignored-after-switch" }; + await repo.InsertIfNotExistsAsync(dup); + + await using var readContext = CreateContext(); + var rows = await readContext.Set() + .Where(e => e.SourceSiteId == siteId) + .ToListAsync(); + + Assert.Single(rows); + Assert.Equal(preExisting.EventId, rows[0].EventId); + // First-write-wins: the original ErrorMessage (null) survives. + Assert.Null(rows[0].ErrorMessage); + } + + [SkippableFact] + public async Task SwitchOutPartitionAsync_PartialFailure_RebuildsUxIndex_RaisesException() + { + Skip.IfNot(_fixture.Available, _fixture.SkipReason); + + await using var context = CreateContext(); + var repo = new AuditLogRepository(context); + + // Force a deterministic switch failure with an inbound FOREIGN KEY: + // ALTER TABLE … SWITCH refuses to move rows out of a partition that's + // referenced by an FK from another table, raising msg 4928 + // ("ALTER TABLE SWITCH statement failed because target table … has a + // foreign key …"). The CATCH branch then rolls back and rebuilds the + // unique index — which the assertion below verifies. + // + // The probe table is uniquely named with a guid suffix so reruns of + // this test inside the same fixture DB never collide. We clean it up + // in the finally so the constraint never leaks into other tests. + var probeTable = $"AuditFkProbe_{Guid.NewGuid():N}".Substring(0, 32); + await using (var setup = new SqlConnection(_fixture.ConnectionString)) + { + await setup.OpenAsync(); + await using var cmd = setup.CreateCommand(); + // Composite FK references AuditLog's composite PK (EventId, OccurredAtUtc). + cmd.CommandText = + $"CREATE TABLE dbo.[{probeTable}] ( " + + $" EventId uniqueidentifier NOT NULL, " + + $" OccurredAtUtc datetime2(7) NOT NULL, " + + $" CONSTRAINT FK_{probeTable}_AuditLog FOREIGN KEY (EventId, OccurredAtUtc) " + + $" REFERENCES dbo.AuditLog(EventId, OccurredAtUtc));"; + await cmd.ExecuteNonQueryAsync(); + } + + try + { + var ex = await Assert.ThrowsAnyAsync( + () => repo.SwitchOutPartitionAsync(new DateTime(2026, 9, 1, 0, 0, 0, DateTimeKind.Utc))); + // Smoke-check the message references the SWITCH statement so we + // know we hit the engineered failure, not some unrelated error. + Assert.Contains("SWITCH", ex.Message, StringComparison.OrdinalIgnoreCase); + } + finally + { + // Always drop the probe table so the FK is gone before the next + // test runs against the shared fixture. + await using var cleanup = new SqlConnection(_fixture.ConnectionString); + await cleanup.OpenAsync(); + await using var cmd = cleanup.CreateCommand(); + cmd.CommandText = + $"IF OBJECT_ID('dbo.[{probeTable}]', 'U') IS NOT NULL DROP TABLE dbo.[{probeTable}];"; + await cmd.ExecuteNonQueryAsync(); + } + + // The CATCH block in the production SQL guarantees UX_AuditLog_EventId + // is rebuilt regardless of which step failed inside the TRY. + await using var verifyContext = CreateContext(); + var indexExists = await ScalarAsync( + verifyContext, + "SELECT COUNT(*) FROM sys.indexes " + + "WHERE name = 'UX_AuditLog_EventId' AND object_id = OBJECT_ID('dbo.AuditLog');"); + Assert.Equal(1, indexExists); + } + + // ------------------------------------------------------------------------ + // M6-T4 Bundle C: GetPartitionBoundariesOlderThanAsync + // ------------------------------------------------------------------------ + + [SkippableFact] + public async Task GetPartitionBoundariesOlderThanAsync_ReturnsBoundaries_WithMaxOccurredOlderThanThreshold() + { + Skip.IfNot(_fixture.Available, _fixture.SkipReason); + + var siteId = NewSiteId(); + await using var context = CreateContext(); + var repo = new AuditLogRepository(context); + + // Seed events in two months: July 2026 (old) and August 2026 (new). + await repo.InsertIfNotExistsAsync(NewEvent(siteId, occurredAtUtc: new DateTime(2026, 7, 10, 0, 0, 0, DateTimeKind.Utc))); + await repo.InsertIfNotExistsAsync(NewEvent(siteId, occurredAtUtc: new DateTime(2026, 8, 10, 0, 0, 0, DateTimeKind.Utc))); + + // Threshold = Aug 1 2026 — July partition's MAX (July 10) is older; + // August partition's MAX (August 10) is newer. We expect only the July + // boundary back. + var threshold = new DateTime(2026, 8, 1, 0, 0, 0, DateTimeKind.Utc); + var boundaries = await repo.GetPartitionBoundariesOlderThanAsync(threshold); + + // The repo may also return EARLIER boundaries that have no data (their + // MAX is NULL → treated as "no data, nothing to purge" by the contract). + // We only assert the inclusion/exclusion that matters for our seeded + // rows. + Assert.Contains(new DateTime(2026, 7, 1, 0, 0, 0, DateTimeKind.Utc), boundaries); + Assert.DoesNotContain(new DateTime(2026, 8, 1, 0, 0, 0, DateTimeKind.Utc), boundaries); + } + + private async Task ScalarAsync(ScadaLinkDbContext context, string sql) + { + var conn = context.Database.GetDbConnection(); + if (conn.State != System.Data.ConnectionState.Open) + { + await conn.OpenAsync(); + } + await using var cmd = conn.CreateCommand(); + cmd.CommandText = sql; + var result = await cmd.ExecuteScalarAsync(); + if (result is null || result is DBNull) + { + return default!; + } + return (T)Convert.ChangeType(result, typeof(T) == typeof(string) ? typeof(string) : Nullable.GetUnderlyingType(typeof(T)) ?? typeof(T))!; } // --- helpers ------------------------------------------------------------ From 660fdc4e93430b8d4c411a6e5238db78d5520060 Mon Sep 17 00:00:00 2001 From: Joseph Doherty Date: Wed, 20 May 2026 18:36:31 -0400 Subject: [PATCH 06/16] feat(auditlog): AuditLogPurgeActor daily partition-switch purge (#23 M6) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Central singleton (M6-T4 Bundle C) that drives the daily AuditLog partition purge. On a configurable timer (default 24 hours) the actor: 1. Queries IAuditLogRepository.GetPartitionBoundariesOlderThanAsync for monthly boundaries whose latest OccurredAtUtc is older than DateTime.UtcNow - AuditLogOptions.RetentionDays. 2. For each eligible boundary calls SwitchOutPartitionAsync, which runs the drop-and-rebuild dance around UX_AuditLog_EventId. 3. Publishes AuditLogPurgedEvent(boundary, rowsDeleted, durationMs) on the actor-system EventStream so the Bundle E central health collector and ops surfaces can subscribe without coupling to this actor. Co-changes: * SwitchOutPartitionAsync returns long (rows deleted) — sampled BEFORE the switch via COUNT_BIG over the per-partition filter so the count reflects what the switch removed, not a post-purge scan of a table that no longer exists. All stub implementations updated. * AuditLogPurgeOptions: IntervalHours (default 24), IntervalOverride for tests, Interval property resolving either. * AuditLogPurgedEvent: record with MonthBoundary, RowsDeleted, DurationMs. Behavior: * Continue-on-error per boundary — one partition that throws does NOT abandon the rest of the tick. * DI scope opened per tick (IAuditLogRepository is a SCOPED EF Core service); mirrors SiteAuditReconciliationActor and AuditLogIngestActor. * SupervisorStrategy Resume keeps the singleton alive across leaked exceptions. * EventStream capture BEFORE the first await — Context is unsafe after await in async receive handlers (same pattern as Sender-capture in AuditLogIngestActor.OnIngestAsync). Tests: * Tick_Fires_OnDailyInterval — visible timer side effect. * Tick_OldPartitions_SwitchedOut — both seeded boundaries purged. * Tick_NewerPartitions_Untouched — empty enumerator → no switches. * Tick_PublishesPurgedEvent_WithRowCount — AuditLogPurgedEvent carries RowsDeleted and DurationMs. * Tick_SwitchThrows_OtherPartitionsStillProcessed — continue-on-error. * Threshold_UsesAuditLogOptionsRetentionDays — non-default 30-day window computed from UtcNow - RetentionDays. * EndToEnd_RealPartition_RowsRemoved_PurgedEventPublished — TestKit + MsSqlMigrationFixture: real partitioned table, Jan-2026 row purged, Apr-2026 row kept, AuditLogPurgedEvent observed via probe. --- .../Central/AuditLogPurgeActor.cs | 214 ++++++++++ .../Central/AuditLogPurgeOptions.cs | 43 ++ .../Central/AuditLogPurgedEvent.cs | 29 ++ .../Repositories/IAuditLogRepository.cs | 7 +- .../Repositories/AuditLogRepository.cs | 49 ++- .../Central/AuditLogIngestActorTests.cs | 2 +- .../Central/AuditLogPurgeActorTests.cs | 376 ++++++++++++++++++ .../SiteAuditReconciliationActorTests.cs | 4 +- 8 files changed, 718 insertions(+), 6 deletions(-) create mode 100644 src/ScadaLink.AuditLog/Central/AuditLogPurgeActor.cs create mode 100644 src/ScadaLink.AuditLog/Central/AuditLogPurgeOptions.cs create mode 100644 src/ScadaLink.AuditLog/Central/AuditLogPurgedEvent.cs create mode 100644 tests/ScadaLink.AuditLog.Tests/Central/AuditLogPurgeActorTests.cs diff --git a/src/ScadaLink.AuditLog/Central/AuditLogPurgeActor.cs b/src/ScadaLink.AuditLog/Central/AuditLogPurgeActor.cs new file mode 100644 index 0000000..153e238 --- /dev/null +++ b/src/ScadaLink.AuditLog/Central/AuditLogPurgeActor.cs @@ -0,0 +1,214 @@ +using System.Diagnostics; +using Akka.Actor; +using Microsoft.Extensions.DependencyInjection; +using Microsoft.Extensions.Logging; +using Microsoft.Extensions.Options; +using ScadaLink.AuditLog.Configuration; +using ScadaLink.Commons.Interfaces.Repositories; + +namespace ScadaLink.AuditLog.Central; + +/// +/// Central singleton (M6 Bundle C) that drives the daily AuditLog partition +/// purge. On a configurable timer (default 24 hours) the actor: +/// +/// Queries +/// for monthly boundaries whose latest OccurredAtUtc is older +/// than DateTime.UtcNow - RetentionDays. +/// For each eligible boundary, calls +/// which runs +/// the drop-and-rebuild dance around UX_AuditLog_EventId. +/// Publishes on the actor-system +/// EventStream so the Bundle E central health collector + ops surfaces +/// can subscribe without coupling to this actor. +/// +/// +/// +/// +/// Daily cadence. Partition switch is metadata-only but the +/// drop-and-rebuild dance briefly removes UX_AuditLog_EventId; running +/// more often than necessary trades unique-index rebuild outages for +/// negligible freshness wins. The default 24-hour interval matches +/// alog.md §10's retention policy. +/// +/// +/// Continue-on-error. A single boundary that throws (transient SQL +/// failure, contention with backup, missing object) must NOT prevent the +/// other eligible boundaries from being purged on the same tick. Per-boundary +/// work runs inside its own try/catch; the actor's +/// uses Resume so any leaked exception keeps +/// the singleton alive for the next tick. +/// +/// +/// DI scopes. is a scoped EF Core +/// service registered by AddConfigurationDatabase. The singleton +/// opens one DI scope per tick and reuses the same repository across every +/// boundary in that tick — mirrors the +/// pattern. +/// +/// +/// EventStream. Publishing through +/// the EventStream rather than direct messaging avoids coupling this actor +/// to its consumers; M6 Bundle E will subscribe a central health-counter +/// bridge that surfaces purge progress on the central health report. +/// +/// +public class AuditLogPurgeActor : ReceiveActor +{ + private readonly IServiceProvider _services; + private readonly AuditLogPurgeOptions _purgeOptions; + private readonly AuditLogOptions _auditOptions; + private readonly ILogger _logger; + private ICancelable? _timer; + + public AuditLogPurgeActor( + IServiceProvider services, + IOptions purgeOptions, + IOptions auditOptions, + ILogger logger) + { + ArgumentNullException.ThrowIfNull(services); + ArgumentNullException.ThrowIfNull(purgeOptions); + ArgumentNullException.ThrowIfNull(auditOptions); + ArgumentNullException.ThrowIfNull(logger); + + _services = services; + _purgeOptions = purgeOptions.Value; + _auditOptions = auditOptions.Value; + _logger = logger; + + ReceiveAsync(_ => OnTickAsync()); + } + + protected override void PreStart() + { + base.PreStart(); + var interval = _purgeOptions.Interval; + _timer = Context.System.Scheduler.ScheduleTellRepeatedlyCancelable( + initialDelay: interval, + interval: interval, + receiver: Self, + message: PurgeTick.Instance, + sender: Self); + } + + protected override void PostStop() + { + _timer?.Cancel(); + base.PostStop(); + } + + /// + /// Resume keeps the singleton alive across any leaked exception. Restart + /// would re-run PreStart and reschedule the timer (harmless but wasteful); + /// Stop is wrong because the singleton must keep ticking until shutdown. + /// + protected override SupervisorStrategy SupervisorStrategy() + { + return new OneForOneStrategy( + maxNrOfRetries: 0, + withinTimeRange: TimeSpan.Zero, + decider: Akka.Actor.SupervisorStrategy.DefaultDecider); + } + + private async Task OnTickAsync() + { + // Capture EventStream BEFORE the first await. Accessing Context (and + // therefore Context.System) after an await is unsafe because Akka's + // ActorBase.Context throws "no active ActorContext" once the + // continuation runs on a thread that isn't currently dispatching this + // actor — mirrors the same Sender-capture pattern in + // AuditLogIngestActor.OnIngestAsync. + var eventStream = Context.System.EventStream; + + // Compute the retention threshold from AuditLogOptions.RetentionDays + // each tick — the options class supports hot reload via + // IOptionsMonitor for the redaction policy and similar settings; we + // read the snapshot per-tick so an operator who lowers RetentionDays + // sees the change applied on the next purge without an actor + // restart. + var threshold = DateTime.UtcNow - TimeSpan.FromDays(_auditOptions.RetentionDays); + + IServiceScope? scope = null; + IAuditLogRepository repository; + try + { + scope = _services.CreateScope(); + repository = scope.ServiceProvider.GetRequiredService(); + } + catch (Exception ex) + { + _logger.LogError(ex, "Failed to resolve IAuditLogRepository for AuditLog purge tick."); + scope?.Dispose(); + return; + } + + try + { + IReadOnlyList boundaries; + try + { + boundaries = await repository + .GetPartitionBoundariesOlderThanAsync(threshold) + .ConfigureAwait(false); + } + catch (Exception ex) + { + _logger.LogError( + ex, + "Failed to enumerate eligible AuditLog partition boundaries (threshold {ThresholdUtc:o}); skipping purge tick.", + threshold); + return; + } + + if (boundaries.Count == 0) + { + return; + } + + foreach (var boundary in boundaries) + { + // Per-boundary try/catch: one bad partition (transient SQL + // failure, missing object, contention with backup) does NOT + // abandon the rest of the tick. + var sw = Stopwatch.StartNew(); + try + { + var rowsDeleted = await repository + .SwitchOutPartitionAsync(boundary) + .ConfigureAwait(false); + sw.Stop(); + + eventStream.Publish( + new AuditLogPurgedEvent(boundary, rowsDeleted, sw.ElapsedMilliseconds)); + + _logger.LogInformation( + "Purged AuditLog partition {MonthBoundary:yyyy-MM-dd}; {RowsDeleted} rows in {DurationMs} ms.", + boundary, + rowsDeleted, + sw.ElapsedMilliseconds); + } + catch (Exception ex) + { + sw.Stop(); + _logger.LogError( + ex, + "Failed to purge AuditLog partition {MonthBoundary:yyyy-MM-dd}; other partitions continue. Elapsed {DurationMs} ms.", + boundary, + sw.ElapsedMilliseconds); + } + } + } + finally + { + scope.Dispose(); + } + } + + /// Self-tick triggering a purge pass across all eligible partitions. + internal sealed class PurgeTick + { + public static readonly PurgeTick Instance = new(); + private PurgeTick() { } + } +} diff --git a/src/ScadaLink.AuditLog/Central/AuditLogPurgeOptions.cs b/src/ScadaLink.AuditLog/Central/AuditLogPurgeOptions.cs new file mode 100644 index 0000000..5f9d824 --- /dev/null +++ b/src/ScadaLink.AuditLog/Central/AuditLogPurgeOptions.cs @@ -0,0 +1,43 @@ +namespace ScadaLink.AuditLog.Central; + +/// +/// Tuning knobs for the central singleton. +/// Default cadence is 24 hours per the M6 plan; the retention window itself +/// is sourced from +/// (default 365) so operators tune retention from a single section. +/// +/// +/// +/// The purge actor is a daily-cadence singleton, not a hot-loop, because +/// partition-switch I/O is metadata-only but the drop-and-rebuild dance +/// briefly removes the UX_AuditLog_EventId unique index — running +/// more often than necessary trades index-rebuild outages for marginal +/// freshness gains. Lower this only when an operator can prove they need +/// sub-daily purge granularity. +/// +/// +/// exists for tests to drop the cadence to +/// milliseconds without polluting the production config surface; production +/// binds only. +/// +/// +public sealed class AuditLogPurgeOptions +{ + /// Period of the purge tick in hours (default 24). + public int IntervalHours { get; set; } = 24; + + /// + /// Test-only override for finer control over the tick cadence than + /// whole-hour resolution allows. When non-null, takes precedence over + /// . Not bound from config — production + /// config exposes only. + /// + public TimeSpan? IntervalOverride { get; set; } + + /// + /// Resolves the effective tick interval, honouring the test override + /// when set. Falls back to . + /// + public TimeSpan Interval => + IntervalOverride ?? TimeSpan.FromHours(IntervalHours); +} diff --git a/src/ScadaLink.AuditLog/Central/AuditLogPurgedEvent.cs b/src/ScadaLink.AuditLog/Central/AuditLogPurgedEvent.cs new file mode 100644 index 0000000..78d4987 --- /dev/null +++ b/src/ScadaLink.AuditLog/Central/AuditLogPurgedEvent.cs @@ -0,0 +1,29 @@ +namespace ScadaLink.AuditLog.Central; + +/// +/// Published on the actor-system EventStream by +/// after each successful partition switch-out. Downstream consumers (Bundle E +/// central health collector, ops dashboards, audit trails) subscribe so a +/// purge action is observable without the actor needing to know about any +/// specific subscriber. +/// +/// +/// The pf_AuditLog_Month lower-bound boundary that was switched out — i.e. +/// the first instant of the purged month in UTC. +/// +/// +/// Approximate row count purged from the partition, sampled BEFORE the +/// switch. Exact accounting would require a post-switch scan of the staging +/// table, which the dance drops immediately, so this is the closest +/// observable proxy. Zero is a valid value when the actor's enumerator +/// included a partition the operator subsequently emptied by hand. +/// +/// +/// Wall-clock time spent inside SwitchOutPartitionAsync for this +/// boundary, in milliseconds. Useful for spotting the rare slow purge +/// without spinning up dedicated telemetry. +/// +public sealed record AuditLogPurgedEvent( + DateTime MonthBoundary, + long RowsDeleted, + long DurationMs); diff --git a/src/ScadaLink.Commons/Interfaces/Repositories/IAuditLogRepository.cs b/src/ScadaLink.Commons/Interfaces/Repositories/IAuditLogRepository.cs index 9932c5c..bcda482 100644 --- a/src/ScadaLink.Commons/Interfaces/Repositories/IAuditLogRepository.cs +++ b/src/ScadaLink.Commons/Interfaces/Repositories/IAuditLogRepository.cs @@ -45,7 +45,10 @@ public interface IAuditLogRepository /// /// Switches out (purges) the monthly partition whose lower bound is - /// . + /// and returns the approximate number + /// of rows discarded — sampled inside the transaction BEFORE the switch + /// so the row count reflects what the switch removed, not a post-purge + /// scan of a table that no longer exists. /// /// /// @@ -71,7 +74,7 @@ public interface IAuditLogRepository /// and the composite PK still rejects same-(EventId, OccurredAtUtc) rows. /// /// - Task SwitchOutPartitionAsync(DateTime monthBoundary, CancellationToken ct = default); + Task SwitchOutPartitionAsync(DateTime monthBoundary, CancellationToken ct = default); /// /// Returns the set of pf_AuditLog_Month partition lower-bound diff --git a/src/ScadaLink.ConfigurationDatabase/Repositories/AuditLogRepository.cs b/src/ScadaLink.ConfigurationDatabase/Repositories/AuditLogRepository.cs index 9dc2f41..d2d74ac 100644 --- a/src/ScadaLink.ConfigurationDatabase/Repositories/AuditLogRepository.cs +++ b/src/ScadaLink.ConfigurationDatabase/Repositories/AuditLogRepository.cs @@ -202,7 +202,7 @@ VALUES /// index. /// /// - public async Task SwitchOutPartitionAsync(DateTime monthBoundary, CancellationToken ct = default) + public async Task SwitchOutPartitionAsync(DateTime monthBoundary, CancellationToken ct = default) { // GUID-suffixed staging name: prevents collision with any concurrent // purge attempt and avoids polluting the AuditLog object namespace with @@ -214,6 +214,17 @@ VALUES // settings. var monthBoundaryStr = monthBoundary.ToUniversalTime().ToString("yyyy-MM-dd HH:mm:ss"); + // Two-statement batch: the first SELECT samples the per-partition row + // count BEFORE the dance so we can report it back to the purge actor; + // the second batch performs the drop-and-rebuild. We use OUTPUT-style + // variables wired through @@ROWCOUNT after the SWITCH is not viable + // because SWITCH is a metadata-only operation that doesn't move rows in + // a way @@ROWCOUNT can observe. + var sampleSql = $@" + SELECT COUNT_BIG(*) FROM dbo.AuditLog + WHERE $PARTITION.pf_AuditLog_Month(OccurredAtUtc) = + $partition.pf_AuditLog_Month('{monthBoundaryStr}');"; + var sql = $@" BEGIN TRY BEGIN TRANSACTION; @@ -292,7 +303,43 @@ VALUES THROW; END CATCH;"; + // Sample the row count before the switch. The sample is best-effort + // (no transaction wrapping the sample-then-switch pair) because the + // central singleton is the only writer to this RPC and a daily-purge + // tick doesn't compete with concurrent SwitchOut callers. A + // concurrent INSERT racing the sample under-reports by at most a + // few rows, which is acceptable for an "approximate" purged-row + // count surfaced via AuditLogPurgedEvent. + long rowsDeleted = 0; + var conn = _context.Database.GetDbConnection(); + var openedHere = false; + if (conn.State != System.Data.ConnectionState.Open) + { + await conn.OpenAsync(ct).ConfigureAwait(false); + openedHere = true; + } + try + { + await using (var sampleCmd = conn.CreateCommand()) + { + sampleCmd.CommandText = sampleSql; + var sampleResult = await sampleCmd.ExecuteScalarAsync(ct).ConfigureAwait(false); + if (sampleResult is not null && sampleResult is not DBNull) + { + rowsDeleted = Convert.ToInt64(sampleResult); + } + } + } + finally + { + if (openedHere) + { + await conn.CloseAsync().ConfigureAwait(false); + } + } + await _context.Database.ExecuteSqlRawAsync(sql, ct); + return rowsDeleted; } /// diff --git a/tests/ScadaLink.AuditLog.Tests/Central/AuditLogIngestActorTests.cs b/tests/ScadaLink.AuditLog.Tests/Central/AuditLogIngestActorTests.cs index 203fb6d..724ae68 100644 --- a/tests/ScadaLink.AuditLog.Tests/Central/AuditLogIngestActorTests.cs +++ b/tests/ScadaLink.AuditLog.Tests/Central/AuditLogIngestActorTests.cs @@ -214,7 +214,7 @@ public class AuditLogIngestActorTests : TestKit, IClassFixture _inner.QueryAsync(filter, paging, ct); - public Task SwitchOutPartitionAsync(DateTime monthBoundary, CancellationToken ct = default) => + public Task SwitchOutPartitionAsync(DateTime monthBoundary, CancellationToken ct = default) => _inner.SwitchOutPartitionAsync(monthBoundary, ct); public Task> GetPartitionBoundariesOlderThanAsync( diff --git a/tests/ScadaLink.AuditLog.Tests/Central/AuditLogPurgeActorTests.cs b/tests/ScadaLink.AuditLog.Tests/Central/AuditLogPurgeActorTests.cs new file mode 100644 index 0000000..afa20bf --- /dev/null +++ b/tests/ScadaLink.AuditLog.Tests/Central/AuditLogPurgeActorTests.cs @@ -0,0 +1,376 @@ +using Akka.Actor; +using Akka.TestKit.Xunit2; +using Microsoft.EntityFrameworkCore; +using Microsoft.Extensions.DependencyInjection; +using Microsoft.Extensions.Logging.Abstractions; +using Microsoft.Extensions.Options; +using ScadaLink.AuditLog.Central; +using ScadaLink.AuditLog.Configuration; +using ScadaLink.Commons.Entities.Audit; +using ScadaLink.Commons.Interfaces.Repositories; +using ScadaLink.Commons.Types.Audit; +using ScadaLink.Commons.Types.Enums; +using ScadaLink.ConfigurationDatabase; +using ScadaLink.ConfigurationDatabase.Repositories; +using ScadaLink.ConfigurationDatabase.Tests.Migrations; + +namespace ScadaLink.AuditLog.Tests.Central; + +/// +/// Bundle C (#23 M6-T4) tests for . The fast, +/// schedule-only tests substitute a recording stub for +/// so the timer + per-boundary error-isolation +/// + event-publish machinery can be exercised without an MSSQL container. +/// The end-to-end "real partition gets switched out" assertion lives in the +/// repository tests (Bundle C of M6-T4); this actor file is purely about the +/// actor's policy decisions. +/// +public class AuditLogPurgeActorTests : TestKit, IClassFixture +{ + private readonly MsSqlMigrationFixture _fixture; + + public AuditLogPurgeActorTests(MsSqlMigrationFixture fixture) + { + _fixture = fixture; + } + + /// + /// In-memory recording stub. Captures every + /// + every + /// so tests can assert which boundaries + /// the actor chose to purge and how many ticks it issued. Also lets a + /// specific boundary be configured to throw so the continue-on-error path + /// is exercisable. + /// + private sealed class RecordingRepo : IAuditLogRepository + { + public List ThresholdQueries { get; } = new(); + public List SwitchedBoundaries { get; } = new(); + public Func RowsPerBoundary { get; set; } = _ => 0L; + public DateTime? ThrowOnBoundary { get; set; } + public Exception? BoundaryException { get; set; } + + // The actor enumerator returns whichever list is configured here. + // Mutating this between ticks lets tests simulate "no longer + // eligible" boundaries on the second tick. + public List Boundaries { get; set; } = new(); + + public Task InsertIfNotExistsAsync(AuditEvent evt, CancellationToken ct = default) => + Task.CompletedTask; + + public Task> QueryAsync( + AuditLogQueryFilter filter, AuditLogPaging paging, CancellationToken ct = default) => + Task.FromResult>(Array.Empty()); + + public Task SwitchOutPartitionAsync(DateTime monthBoundary, CancellationToken ct = default) + { + if (ThrowOnBoundary.HasValue && monthBoundary == ThrowOnBoundary.Value) + { + throw BoundaryException ?? new InvalidOperationException("simulated switch failure"); + } + SwitchedBoundaries.Add(monthBoundary); + return Task.FromResult(RowsPerBoundary(monthBoundary)); + } + + public Task> GetPartitionBoundariesOlderThanAsync( + DateTime threshold, CancellationToken ct = default) + { + ThresholdQueries.Add(threshold); + return Task.FromResult>(Boundaries.ToArray()); + } + } + + private IServiceProvider BuildScopedProvider(IAuditLogRepository repo) + { + var services = new ServiceCollection(); + // Mirror AddConfigurationDatabase: IAuditLogRepository is scoped, so + // the actor opens a fresh scope per tick and resolves there. + services.AddScoped(_ => repo); + return services.BuildServiceProvider(); + } + + private IActorRef CreateActor( + IAuditLogRepository repo, + AuditLogPurgeOptions purgeOptions, + AuditLogOptions? auditOptions = null) + { + var sp = BuildScopedProvider(repo); + return Sys.ActorOf(Props.Create(() => new AuditLogPurgeActor( + sp, + Options.Create(purgeOptions), + Options.Create(auditOptions ?? new AuditLogOptions()), + NullLogger.Instance))); + } + + private static AuditLogPurgeOptions FastTickOptions(TimeSpan? interval = null) => new() + { + IntervalHours = 24, + IntervalOverride = interval ?? TimeSpan.FromMilliseconds(100), + }; + + /// + /// Subscribe a probe to the EventStream so the test can observe + /// publications synchronously. + /// + private Akka.TestKit.TestProbe SubscribePurged() + { + var probe = CreateTestProbe(); + Sys.EventStream.Subscribe(probe.Ref, typeof(AuditLogPurgedEvent)); + return probe; + } + + // --------------------------------------------------------------------- + // 1. Tick_Fires_OnDailyInterval + // --------------------------------------------------------------------- + + [Fact] + public void Tick_Fires_OnDailyInterval() + { + var repo = new RecordingRepo(); + CreateActor(repo, FastTickOptions()); + + // The first scheduled tick fires after the configured interval. We + // assert the visible side effect (the enumerator was called) rather + // than racing on internal state. + AwaitAssert( + () => Assert.True(repo.ThresholdQueries.Count >= 1, + $"expected >= 1 enumerator call, got {repo.ThresholdQueries.Count}"), + duration: TimeSpan.FromSeconds(3), + interval: TimeSpan.FromMilliseconds(50)); + } + + // --------------------------------------------------------------------- + // 2. Tick_OldPartitions_SwitchedOut + // --------------------------------------------------------------------- + + [Fact] + public void Tick_OldPartitions_SwitchedOut() + { + var repo = new RecordingRepo + { + Boundaries = new List + { + new(2025, 11, 1, 0, 0, 0, DateTimeKind.Utc), + new(2025, 12, 1, 0, 0, 0, DateTimeKind.Utc), + }, + RowsPerBoundary = _ => 42L, + }; + + CreateActor(repo, FastTickOptions()); + + AwaitAssert( + () => + { + Assert.Contains(new DateTime(2025, 11, 1, 0, 0, 0, DateTimeKind.Utc), repo.SwitchedBoundaries); + Assert.Contains(new DateTime(2025, 12, 1, 0, 0, 0, DateTimeKind.Utc), repo.SwitchedBoundaries); + }, + duration: TimeSpan.FromSeconds(3), + interval: TimeSpan.FromMilliseconds(50)); + } + + // --------------------------------------------------------------------- + // 3. Tick_NewerPartitions_Untouched + // --------------------------------------------------------------------- + + [Fact] + public void Tick_NewerPartitions_Untouched() + { + // The actor's contract: it only touches whatever the enumerator + // returns. The enumerator (in production) filters out non-eligible + // boundaries; here we simulate that by handing back an empty list + // and asserting the actor switched nothing despite the tick firing. + var repo = new RecordingRepo { Boundaries = new List() }; + + CreateActor(repo, FastTickOptions()); + + // Wait for at least one tick (visible via the enumerator call) then + // assert no switch happened. + AwaitAssert( + () => Assert.True(repo.ThresholdQueries.Count >= 1), + duration: TimeSpan.FromSeconds(3), + interval: TimeSpan.FromMilliseconds(50)); + + Assert.Empty(repo.SwitchedBoundaries); + } + + // --------------------------------------------------------------------- + // 4. Tick_PublishesPurgedEvent_WithRowCount + // --------------------------------------------------------------------- + + [Fact] + public void Tick_PublishesPurgedEvent_WithRowCount() + { + var boundary = new DateTime(2025, 6, 1, 0, 0, 0, DateTimeKind.Utc); + var repo = new RecordingRepo + { + Boundaries = new List { boundary }, + RowsPerBoundary = _ => 1234L, + }; + + var probe = SubscribePurged(); + CreateActor(repo, FastTickOptions()); + + var msg = probe.ExpectMsg(TimeSpan.FromSeconds(5)); + Assert.Equal(boundary, msg.MonthBoundary); + Assert.Equal(1234L, msg.RowsDeleted); + Assert.True(msg.DurationMs >= 0, + $"DurationMs should be non-negative; was {msg.DurationMs}"); + } + + // --------------------------------------------------------------------- + // 5. Tick_SwitchThrows_OtherPartitionsStillProcessed (continue-on-error) + // --------------------------------------------------------------------- + + [Fact] + public void Tick_SwitchThrows_OtherPartitionsStillProcessed() + { + var poisonBoundary = new DateTime(2025, 7, 1, 0, 0, 0, DateTimeKind.Utc); + var goodBoundary = new DateTime(2025, 8, 1, 0, 0, 0, DateTimeKind.Utc); + var repo = new RecordingRepo + { + Boundaries = new List { poisonBoundary, goodBoundary }, + ThrowOnBoundary = poisonBoundary, + BoundaryException = new InvalidOperationException("simulated switch failure for poison boundary"), + }; + + CreateActor(repo, FastTickOptions()); + + AwaitAssert( + () => + { + // The good boundary was still switched even though the poison + // boundary threw. + Assert.Contains(goodBoundary, repo.SwitchedBoundaries); + Assert.DoesNotContain(poisonBoundary, repo.SwitchedBoundaries); + }, + duration: TimeSpan.FromSeconds(3), + interval: TimeSpan.FromMilliseconds(50)); + } + + // --------------------------------------------------------------------- + // 6. EndToEnd_RealPartition_RowsRemoved_PurgedEventPublished + // --------------------------------------------------------------------- + + [SkippableFact] + public async Task EndToEnd_RealPartition_RowsRemoved_PurgedEventPublished() + { + Skip.IfNot(_fixture.Available, _fixture.SkipReason); + + // Today is ~2026-05-20 per the test environment. With RetentionDays = + // 60 the actor computes threshold ≈ 2026-03-21: + // * Jan partition (MAX = Jan 15) → older than threshold → PURGED + // * Apr partition (MAX = Apr 15) → newer than threshold → KEPT + var siteId = "purge-e2e-" + Guid.NewGuid().ToString("N").Substring(0, 8); + var janEvt = new AuditEvent + { + EventId = Guid.NewGuid(), + OccurredAtUtc = new DateTime(2026, 1, 15, 0, 0, 0, DateTimeKind.Utc), + Channel = AuditChannel.ApiOutbound, + Kind = AuditKind.ApiCall, + Status = AuditStatus.Delivered, + SourceSiteId = siteId, + }; + var aprEvt = new AuditEvent + { + EventId = Guid.NewGuid(), + OccurredAtUtc = new DateTime(2026, 4, 15, 0, 0, 0, DateTimeKind.Utc), + Channel = AuditChannel.ApiOutbound, + Kind = AuditKind.ApiCall, + Status = AuditStatus.Delivered, + SourceSiteId = siteId, + }; + + await using (var seedContext = CreateMsSqlContext()) + { + var seedRepo = new AuditLogRepository(seedContext); + await seedRepo.InsertIfNotExistsAsync(janEvt); + await seedRepo.InsertIfNotExistsAsync(aprEvt); + } + + // Wire the actor's DI scope to the real repository against the + // fixture's MSSQL database. The actor opens a fresh scope per tick, + // so register the context as scoped (mirroring the production + // AddConfigurationDatabase wiring). + var services = new ServiceCollection(); + services.AddDbContext( + opts => opts.UseSqlServer(_fixture.ConnectionString), + ServiceLifetime.Scoped); + services.AddScoped(); + var sp = services.BuildServiceProvider(); + + var auditOptions = new AuditLogOptions { RetentionDays = 60 }; + var purgeOptions = new AuditLogPurgeOptions + { + IntervalHours = 24, + IntervalOverride = TimeSpan.FromMilliseconds(100), + }; + + var probe = SubscribePurged(); + Sys.ActorOf(Props.Create(() => new AuditLogPurgeActor( + sp, + Options.Create(purgeOptions), + Options.Create(auditOptions), + NullLogger.Instance))); + + // The probe receives one AuditLogPurgedEvent per partition the actor + // purges per tick — other test runs that share the fixture DB may + // also leave behind eligible partitions, but this test creates its + // own fixture DB so the Jan-2026 partition is the only eligible one. + // Use FishForMessage to filter just in case, with a generous timeout + // because the real drop-and-rebuild dance against MSSQL routinely + // takes a couple of seconds on a busy dev container. + var janBoundary = new DateTime(2026, 1, 1, 0, 0, 0, DateTimeKind.Utc); + var matched = probe.FishForMessage( + isMessage: m => m.MonthBoundary == janBoundary, + max: TimeSpan.FromSeconds(30)); + + Assert.True(matched.RowsDeleted >= 1, + $"Expected RowsDeleted >= 1 for the Jan-2026 partition; got {matched.RowsDeleted}."); + + // Settle: allow any in-flight tick to commit before reading. + await Task.Delay(TimeSpan.FromMilliseconds(500)); + await using var verifyContext = CreateMsSqlContext(); + var rows = await verifyContext.Set() + .Where(e => e.SourceSiteId == siteId) + .ToListAsync(); + + Assert.DoesNotContain(rows, r => r.EventId == janEvt.EventId); + Assert.Contains(rows, r => r.EventId == aprEvt.EventId); + } + + private ScadaLinkDbContext CreateMsSqlContext() => + new(new DbContextOptionsBuilder() + .UseSqlServer(_fixture.ConnectionString).Options); + + // --------------------------------------------------------------------- + // 7. Threshold_UsesAuditLogOptionsRetentionDays + // --------------------------------------------------------------------- + + [Fact] + public void Threshold_UsesAuditLogOptionsRetentionDays() + { + // The actor computes the threshold from AuditLogOptions.RetentionDays; + // assert the enumerator received a threshold whose value is in the + // expected window (today - retentionDays) rather than DateTime.MinValue + // or some other accidental default. We use a non-default retention + // (30 days) so the assertion isn't satisfied by the 365 default. + var repo = new RecordingRepo(); + CreateActor( + repo, + FastTickOptions(), + auditOptions: new AuditLogOptions { RetentionDays = 30 }); + + AwaitAssert( + () => Assert.True(repo.ThresholdQueries.Count >= 1), + duration: TimeSpan.FromSeconds(3), + interval: TimeSpan.FromMilliseconds(50)); + + var threshold = repo.ThresholdQueries[0]; + var expected = DateTime.UtcNow - TimeSpan.FromDays(30); + // 1-minute slack covers test-thread scheduling jitter between the + // tick firing and the assertion running. + Assert.True( + Math.Abs((threshold - expected).TotalMinutes) < 1.0, + $"threshold {threshold:o} should be within 1 minute of {expected:o}"); + } +} diff --git a/tests/ScadaLink.AuditLog.Tests/Central/SiteAuditReconciliationActorTests.cs b/tests/ScadaLink.AuditLog.Tests/Central/SiteAuditReconciliationActorTests.cs index d1dad16..5cbcfe9 100644 --- a/tests/ScadaLink.AuditLog.Tests/Central/SiteAuditReconciliationActorTests.cs +++ b/tests/ScadaLink.AuditLog.Tests/Central/SiteAuditReconciliationActorTests.cs @@ -87,8 +87,8 @@ public class SiteAuditReconciliationActorTests : TestKit, IClassFixture Task.FromResult>(Inserted); - public Task SwitchOutPartitionAsync(DateTime monthBoundary, CancellationToken ct = default) => - Task.CompletedTask; + public Task SwitchOutPartitionAsync(DateTime monthBoundary, CancellationToken ct = default) => + Task.FromResult(0L); public Task> GetPartitionBoundariesOlderThanAsync( DateTime threshold, CancellationToken ct = default) => From cc2d6e91f12e6250a96e5c3d849af313575aa9ae Mon Sep 17 00:00:00 2001 From: Joseph Doherty Date: Wed, 20 May 2026 18:39:19 -0400 Subject: [PATCH 07/16] fix(auditlog): SiteAuditReconciliationActor captures EventStream before await (#23 M6) --- .../Central/SiteAuditReconciliationActor.cs | 20 +++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/src/ScadaLink.AuditLog/Central/SiteAuditReconciliationActor.cs b/src/ScadaLink.AuditLog/Central/SiteAuditReconciliationActor.cs index 6460c4d..e38e6d2 100644 --- a/src/ScadaLink.AuditLog/Central/SiteAuditReconciliationActor.cs +++ b/src/ScadaLink.AuditLog/Central/SiteAuditReconciliationActor.cs @@ -137,6 +137,14 @@ public class SiteAuditReconciliationActor : ReceiveActor private async Task OnTickAsync() { + // Capture EventStream BEFORE the first await. Accessing Context (and + // therefore Context.System) after an await is unsafe because Akka's + // ActorBase.Context throws "no active ActorContext" once the + // continuation runs on a thread that isn't currently dispatching this + // actor — mirrors the AuditLogPurgeActor.OnTickAsync fix and the + // AuditLogIngestActor.OnIngestAsync Sender-capture pattern. + var eventStream = Context.System.EventStream; + IReadOnlyList sites; try { @@ -173,7 +181,7 @@ public class SiteAuditReconciliationActor : ReceiveActor { try { - await PullSiteAsync(site, repository).ConfigureAwait(false); + await PullSiteAsync(site, repository, eventStream).ConfigureAwait(false); } catch (Exception ex) { @@ -203,7 +211,7 @@ public class SiteAuditReconciliationActor : ReceiveActor /// drains across consecutive ticks. The stalled signal (two non-draining /// ticks in a row) surfaces when that drain isn't keeping up. /// - private async Task PullSiteAsync(SiteEntry site, IAuditLogRepository repository) + private async Task PullSiteAsync(SiteEntry site, IAuditLogRepository repository, Akka.Event.EventStream eventStream) { var since = _cursors.TryGetValue(site.SiteId, out var c) ? c : DateTime.MinValue; var response = await _client.PullAsync( @@ -247,7 +255,7 @@ public class SiteAuditReconciliationActor : ReceiveActor _cursors[site.SiteId] = maxOccurred; var nonDraining = response.MoreAvailable && response.Events.Count > 0; - UpdateStalledState(site.SiteId, draining: !nonDraining); + UpdateStalledState(site.SiteId, draining: !nonDraining, eventStream); } /// @@ -266,7 +274,7 @@ public class SiteAuditReconciliationActor : ReceiveActor /// silent so a downstream subscriber doesn't see a flood of redundant /// notifications. /// - private void UpdateStalledState(string siteId, bool draining) + private void UpdateStalledState(string siteId, bool draining, Akka.Event.EventStream eventStream) { var wasStalled = _stalled.TryGetValue(siteId, out var prior) && prior; @@ -276,7 +284,7 @@ public class SiteAuditReconciliationActor : ReceiveActor if (wasStalled) { _stalled[siteId] = false; - Context.System.EventStream.Publish( + eventStream.Publish( new SiteAuditTelemetryStalledChanged(siteId, Stalled: false)); } return; @@ -288,7 +296,7 @@ public class SiteAuditReconciliationActor : ReceiveActor if (consecutive >= _options.StalledAfterNonDrainingCycles && !wasStalled) { _stalled[siteId] = true; - Context.System.EventStream.Publish( + eventStream.Publish( new SiteAuditTelemetryStalledChanged(siteId, Stalled: true)); } } From 75b060e0a8449a85c7f0d1f8b9f4aa9470eeb570 Mon Sep 17 00:00:00 2001 From: Joseph Doherty Date: Wed, 20 May 2026 18:51:43 -0400 Subject: [PATCH 08/16] feat(auditlog): AuditLogPartitionMaintenanceService monthly roll-forward (#23 M6) --- .../AuditLogPartitionMaintenanceOptions.cs | 37 +++ .../AuditLogPartitionMaintenanceService.cs | 145 ++++++++++++ .../ServiceCollectionExtensions.cs | 37 +++ .../Interfaces/IPartitionMaintenance.cs | 48 ++++ .../AuditLogPartitionMaintenance.cs | 218 ++++++++++++++++++ .../ServiceCollectionExtensions.cs | 9 + src/ScadaLink.Host/Program.cs | 4 + ...uditLogPartitionMaintenanceServiceTests.cs | 154 +++++++++++++ .../AuditLogPartitionMaintenanceTests.cs | 182 +++++++++++++++ 9 files changed, 834 insertions(+) create mode 100644 src/ScadaLink.AuditLog/Central/AuditLogPartitionMaintenanceOptions.cs create mode 100644 src/ScadaLink.AuditLog/Central/AuditLogPartitionMaintenanceService.cs create mode 100644 src/ScadaLink.Commons/Interfaces/IPartitionMaintenance.cs create mode 100644 src/ScadaLink.ConfigurationDatabase/Maintenance/AuditLogPartitionMaintenance.cs create mode 100644 tests/ScadaLink.AuditLog.Tests/Central/AuditLogPartitionMaintenanceServiceTests.cs create mode 100644 tests/ScadaLink.ConfigurationDatabase.Tests/Maintenance/AuditLogPartitionMaintenanceTests.cs diff --git a/src/ScadaLink.AuditLog/Central/AuditLogPartitionMaintenanceOptions.cs b/src/ScadaLink.AuditLog/Central/AuditLogPartitionMaintenanceOptions.cs new file mode 100644 index 0000000..317e6e7 --- /dev/null +++ b/src/ScadaLink.AuditLog/Central/AuditLogPartitionMaintenanceOptions.cs @@ -0,0 +1,37 @@ +namespace ScadaLink.AuditLog.Central; + +/// +/// Tuning knobs for the central +/// hosted service (M6-T5). +/// Defaults: once every 24 hours, keep at least one future monthly +/// boundary ahead of . +/// +/// +/// +/// The hosted service drives a daily roll-forward of +/// pf_AuditLog_Month: each tick reads the current max boundary and +/// SPLITs new monthly boundaries until at least +/// future months are covered. The 1-month +/// default is intentionally conservative — anything less risks an +/// end-of-month race where inserts land in the unbounded tail partition; +/// anything more wastes nothing but represents premature commitment. +/// +/// +/// The 24-hour cadence is the cheapest interval that still guarantees +/// at-most-one missed boundary in steady state (even a hard failover the +/// hosted service can recover on its very next tick). Lowering this below +/// an hour would generate more metadata churn than it saves. +/// +/// +public sealed class AuditLogPartitionMaintenanceOptions +{ + /// Period of the maintenance tick in seconds (default 86 400 = 24 h). + public int IntervalSeconds { get; set; } = 86_400; + + /// + /// Minimum number of future months that pf_AuditLog_Month must + /// cover after each tick. Default 1 — i.e. as of mid-May the partition + /// for the next full month (June) must already be present. + /// + public int LookaheadMonths { get; set; } = 1; +} diff --git a/src/ScadaLink.AuditLog/Central/AuditLogPartitionMaintenanceService.cs b/src/ScadaLink.AuditLog/Central/AuditLogPartitionMaintenanceService.cs new file mode 100644 index 0000000..2aa02f8 --- /dev/null +++ b/src/ScadaLink.AuditLog/Central/AuditLogPartitionMaintenanceService.cs @@ -0,0 +1,145 @@ +using Microsoft.Extensions.DependencyInjection; +using Microsoft.Extensions.Hosting; +using Microsoft.Extensions.Logging; +using Microsoft.Extensions.Options; +using ScadaLink.Commons.Interfaces; + +namespace ScadaLink.AuditLog.Central; + +/// +/// Central (M6-T5, Bundle D) that rolls +/// pf_AuditLog_Month forward once a day. Each tick opens a fresh DI +/// scope, resolves , and calls +/// to SPLIT any +/// missing future boundaries — the partition function must always cover at +/// least +/// future months, otherwise inserts past the highest boundary accumulate in +/// a single unbounded tail partition that SwitchOutPartitionAsync +/// cannot purge cleanly. +/// +/// +/// +/// Why a hosted service, not an actor. Bundle C's +/// sits inside the central singleton +/// because it needs supervised lifecycle alongside the rest of the +/// reconciliation / ingest pipeline. Roll-forward is genuinely a once-a-day +/// chore with no cross-actor coordination, so we use the much simpler +/// hosted-service pattern: Task.Run on start, Task.Delay +/// between ticks, cancellation on stop. Reusing +/// from the central node-only DI graph +/// keeps the contract testable without any actor framework involvement. +/// +/// +/// Failure containment. The tick body wraps the maintenance call in +/// a try/catch so a transient SQL Server error never tears down the hosted +/// service — the next tick simply retries. The exception is logged with +/// the original stack trace at Error level; ops surfaces (M6 Bundle +/// E's central health collector) can subscribe to the logger to alert on +/// repeated failures. +/// +/// +/// Startup ordering. A first tick fires immediately at +/// so a fresh deployment doesn't need to wait +/// for +/// the partition function to come up to spec. This is also what the brief +/// asks for ("Run once on startup"). +/// +/// +/// DI scope per tick. is scoped +/// (alongside the rest of the EF repositories) because the implementation +/// reuses the per-scope ScadaLinkDbContext. A hosted service is a +/// singleton, so it must open and dispose a scope around each tick — the +/// same pattern uses. +/// +/// +public sealed class AuditLogPartitionMaintenanceService : IHostedService, IDisposable +{ + private readonly IServiceScopeFactory _scopeFactory; + private readonly IOptions _options; + private readonly ILogger _logger; + private CancellationTokenSource? _cts; + private Task? _loop; + + public AuditLogPartitionMaintenanceService( + IServiceScopeFactory scopeFactory, + IOptions options, + ILogger logger) + { + _scopeFactory = scopeFactory ?? throw new ArgumentNullException(nameof(scopeFactory)); + _options = options ?? throw new ArgumentNullException(nameof(options)); + _logger = logger ?? throw new ArgumentNullException(nameof(logger)); + } + + /// + public Task StartAsync(CancellationToken ct) + { + // Linked CTS lets StopAsync's cancellation AND the host's shutdown + // token both terminate the loop; either side firing aborts the + // pending Task.Delay. + _cts = CancellationTokenSource.CreateLinkedTokenSource(ct); + _loop = Task.Run(() => RunLoopAsync(_cts.Token)); + return Task.CompletedTask; + } + + private async Task RunLoopAsync(CancellationToken ct) + { + // Run once on startup so a fresh deployment isn't gated on the + // IntervalSeconds initial wait — the brief calls this out explicitly. + await SafeMaintainAsync(ct).ConfigureAwait(false); + + while (!ct.IsCancellationRequested) + { + try + { + await Task.Delay(TimeSpan.FromSeconds(_options.Value.IntervalSeconds), ct) + .ConfigureAwait(false); + } + catch (OperationCanceledException) + { + break; + } + + await SafeMaintainAsync(ct).ConfigureAwait(false); + } + } + + private async Task SafeMaintainAsync(CancellationToken ct) + { + try + { + await using var scope = _scopeFactory.CreateAsyncScope(); + var maintenance = scope.ServiceProvider.GetRequiredService(); + var added = await maintenance + .EnsureLookaheadAsync(_options.Value.LookaheadMonths, ct) + .ConfigureAwait(false); + if (added.Count > 0) + { + _logger.LogInformation( + "AuditLogPartitionMaintenance added {Count} boundaries: {Boundaries}", + added.Count, + string.Join(", ", added.Select(b => b.ToString("yyyy-MM-dd")))); + } + } + catch (Exception ex) + { + // Catch-all is deliberate: the hosted service must survive every + // class of tick failure (transient SQL, DI resolution, etc.) so + // the next tick gets a chance. The brief's contract is + // "exception logged, not propagated". + _logger.LogError(ex, "AuditLogPartitionMaintenance tick failed"); + } + } + + /// + public Task StopAsync(CancellationToken ct) + { + _cts?.Cancel(); + return _loop ?? Task.CompletedTask; + } + + /// + public void Dispose() + { + _cts?.Dispose(); + } +} diff --git a/src/ScadaLink.AuditLog/ServiceCollectionExtensions.cs b/src/ScadaLink.AuditLog/ServiceCollectionExtensions.cs index cf04abd..2216eb2 100644 --- a/src/ScadaLink.AuditLog/ServiceCollectionExtensions.cs +++ b/src/ScadaLink.AuditLog/ServiceCollectionExtensions.cs @@ -1,6 +1,7 @@ using Microsoft.Extensions.Configuration; using Microsoft.Extensions.DependencyInjection; using Microsoft.Extensions.DependencyInjection.Extensions; +using Microsoft.Extensions.Hosting; using Microsoft.Extensions.Logging; using Microsoft.Extensions.Options; using ScadaLink.AuditLog.Central; @@ -43,6 +44,9 @@ public static class ServiceCollectionExtensions /// Configuration section bound to . public const string SiteTelemetrySectionName = "AuditLog:SiteTelemetry"; + /// Configuration section bound to . + public const string PartitionMaintenanceSectionName = "AuditLog:PartitionMaintenance"; + /// /// Registers the Audit Log (#23) component services: options, the site /// SQLite writer chain (primary + ring fallback + failure-counter sink), @@ -216,4 +220,37 @@ public static class ServiceCollectionExtensions ServiceDescriptor.Singleton()); return services; } + + /// + /// Audit Log (#23) M6-T5 Bundle D — central-only registration for the + /// hosted service plus + /// its binding. Must be + /// called from the Central role's composition root (not from a site + /// composition root); the underlying IPartitionMaintenance + /// implementation is registered by AddConfigurationDatabase and + /// only exists on the central node. + /// + /// + /// + /// Separated from because AddAuditLog is + /// also invoked from site composition roots — silently starting a + /// hosted service that resolves an unregistered dependency on a site + /// would fail every tick. Keeping the central-only registration in its + /// own helper preserves the "every Add* call is safe to issue + /// from any composition root" invariant. + /// + /// + public static IServiceCollection AddAuditLogCentralMaintenance( + this IServiceCollection services, + IConfiguration config) + { + ArgumentNullException.ThrowIfNull(services); + ArgumentNullException.ThrowIfNull(config); + + services.AddOptions() + .Bind(config.GetSection(PartitionMaintenanceSectionName)); + services.AddHostedService(); + + return services; + } } diff --git a/src/ScadaLink.Commons/Interfaces/IPartitionMaintenance.cs b/src/ScadaLink.Commons/Interfaces/IPartitionMaintenance.cs new file mode 100644 index 0000000..b8b3ec5 --- /dev/null +++ b/src/ScadaLink.Commons/Interfaces/IPartitionMaintenance.cs @@ -0,0 +1,48 @@ +namespace ScadaLink.Commons.Interfaces; + +/// +/// Abstraction over the central AuditLog partition-function roll-forward +/// operation. M6-T5 introduces a daily-cadence hosted service +/// (AuditLogPartitionMaintenanceService) that calls +/// to make sure +/// pf_AuditLog_Month always has at least LookaheadMonths of +/// future boundaries available — otherwise inserts past the highest +/// boundary land in a single ever-growing tail partition that +/// SwitchOutPartitionAsync cannot purge cleanly. +/// +/// +/// +/// The interface lives in ScadaLink.Commons so the central hosted +/// service in ScadaLink.AuditLog can depend on it without taking a +/// reference on ScadaLink.ConfigurationDatabase; the EF-based +/// implementation ships in +/// ScadaLink.ConfigurationDatabase.Maintenance.AuditLogPartitionMaintenance +/// and is registered by AddConfigurationDatabase. +/// +/// +/// Both methods read sys.partition_range_values / mutate +/// pf_AuditLog_Month via raw SQL — there is no EF model for a +/// partition function. The interface deliberately exposes only the two +/// operations the hosted service needs; it is not a general partition-DDL +/// surface. +/// +/// +public interface IPartitionMaintenance +{ + /// + /// Splits new monthly boundaries on pf_AuditLog_Month so the + /// function covers at least future + /// months relative to . Idempotent — a + /// boundary that already exists is skipped rather than re-issued. + /// Returns the boundaries actually added, in chronological order. + /// + Task> EnsureLookaheadAsync(int lookaheadMonths, CancellationToken ct = default); + + /// + /// Reads the current maximum boundary value from + /// sys.partition_range_values for pf_AuditLog_Month. + /// Returns null when the partition function does not exist or + /// has no boundaries. + /// + Task GetMaxBoundaryAsync(CancellationToken ct = default); +} diff --git a/src/ScadaLink.ConfigurationDatabase/Maintenance/AuditLogPartitionMaintenance.cs b/src/ScadaLink.ConfigurationDatabase/Maintenance/AuditLogPartitionMaintenance.cs new file mode 100644 index 0000000..cdbd54b --- /dev/null +++ b/src/ScadaLink.ConfigurationDatabase/Maintenance/AuditLogPartitionMaintenance.cs @@ -0,0 +1,218 @@ +using System.Globalization; +using Microsoft.Data.SqlClient; +using Microsoft.EntityFrameworkCore; +using Microsoft.Extensions.Logging; +using Microsoft.Extensions.Logging.Abstractions; +using ScadaLink.Commons.Interfaces; + +namespace ScadaLink.ConfigurationDatabase.Maintenance; + +/// +/// EF/SQL-Server implementation of that +/// rolls forward pf_AuditLog_Month by issuing +/// ALTER PARTITION FUNCTION … SPLIT RANGE for each missing future +/// monthly boundary. +/// +/// +/// +/// The class is scoped (registered alongside the other repositories in +/// AddConfigurationDatabase) because it shares +/// — the hosted service opens a per-tick DI scope, resolves a fresh instance, +/// and lets the scope's DbContext dispose with it. The class itself +/// holds no state between calls. +/// +/// +/// Idempotency model. Each tick reads the current max boundary from +/// sys.partition_range_values and only issues SPLIT RANGE for +/// boundaries that strictly follow it — a boundary already covered is never +/// re-issued, so the "boundary already exists" failure (SQL Server msg 7708 +/// / 7711) is avoided by construction rather than caught. The pre-check is +/// cheaper than the alternative TRY/CATCH around every SPLIT call and also +/// keeps the returned added list semantically precise. +/// +/// +/// Why "first of next month". The migration seeds boundaries on the +/// first-of-month at midnight UTC; we preserve that convention so the +/// resulting partition layout is uniform. +/// rounds an arbitrary timestamp up to the next first-of-month boundary +/// (e.g. 2026-05-20 → 2026-06-01), and +/// walks one month at a time from there. +/// +/// +/// Permissions. The migration's scadalink_audit_purger role +/// already carries ALTER ON SCHEMA::dbo, which is sufficient for +/// ALTER PARTITION FUNCTION SPLIT RANGE. No additional grant is +/// required. +/// +/// +public sealed class AuditLogPartitionMaintenance : IPartitionMaintenance +{ + private const string PartitionFunctionName = "pf_AuditLog_Month"; + private const string PartitionSchemeName = "ps_AuditLog_Month"; + private const string TargetFileGroup = "PRIMARY"; + + private readonly ScadaLinkDbContext _context; + private readonly ILogger _logger; + + public AuditLogPartitionMaintenance( + ScadaLinkDbContext context, + ILogger? logger = null) + { + _context = context ?? throw new ArgumentNullException(nameof(context)); + _logger = logger ?? NullLogger.Instance; + } + + /// + public async Task GetMaxBoundaryAsync(CancellationToken ct = default) + { + // CAST the sql_variant `value` column to datetime2(7) — every boundary in + // pf_AuditLog_Month is declared as datetime2(7) by the migration, so the + // cast never loses precision. + const string sql = @" +SELECT MAX(CAST(rv.value AS datetime2(7))) +FROM sys.partition_range_values rv +INNER JOIN sys.partition_functions pf ON rv.function_id = pf.function_id +WHERE pf.name = 'pf_AuditLog_Month';"; + + var conn = _context.Database.GetDbConnection(); + var openedHere = false; + if (conn.State != System.Data.ConnectionState.Open) + { + await conn.OpenAsync(ct).ConfigureAwait(false); + openedHere = true; + } + + try + { + await using var cmd = conn.CreateCommand(); + cmd.CommandText = sql; + var raw = await cmd.ExecuteScalarAsync(ct).ConfigureAwait(false); + if (raw is null || raw is DBNull) + { + return null; + } + + // ExecuteScalarAsync materialises datetime2 as DateTime with + // DateTimeKind.Unspecified; the boundary values are stored at + // UTC midnight by convention (migration seeds with 'T00:00:00'), + // so we re-tag the kind so downstream comparisons against + // DateTime.UtcNow stay in the same kind space. + var dt = (DateTime)raw; + return DateTime.SpecifyKind(dt, DateTimeKind.Utc); + } + finally + { + if (openedHere) + { + await conn.CloseAsync().ConfigureAwait(false); + } + } + } + + /// + public async Task> EnsureLookaheadAsync( + int lookaheadMonths, + CancellationToken ct = default) + { + if (lookaheadMonths < 1) + { + throw new ArgumentOutOfRangeException( + nameof(lookaheadMonths), + lookaheadMonths, + "Lookahead must be at least one month — the partition function would otherwise be allowed to fall behind 'now'."); + } + + var nowUtc = DateTime.UtcNow; + // Horizon: the FIRST-OF-MONTH that must be the strictly-greater-than + // max boundary after this call. Example: nowUtc = 2026-05-20 and + // lookaheadMonths = 1 → horizon = 2026-07-01 (so the partition for + // June 2026 is already in place by mid-May). + var horizon = NormalizeToFirstOfMonth(nowUtc).AddMonths(lookaheadMonths); + + var max = await GetMaxBoundaryAsync(ct).ConfigureAwait(false); + if (max is null) + { + // No partition function (e.g. migrations not applied) — nothing + // we can safely SPLIT against. Log and return; the absence is a + // genuine misconfiguration that other parts of the system will + // surface louder than we could here. + _logger.LogWarning( + "EnsureLookaheadAsync: partition function {PartitionFunctionName} not found; skipping.", + PartitionFunctionName); + return Array.Empty(); + } + + // Start splitting from the FIRST month strictly after max — if max is + // already first-of-month (the common case), that's max + 1 month; + // otherwise NormalizeToFirstOfMonth rounds up. + var next = NormalizeToFirstOfMonth(max.Value.AddDays(1)); + + // Edge case: max already past horizon → no work to do. + if (next > horizon) + { + return Array.Empty(); + } + + var added = new List(); + while (next <= horizon) + { + // Boundary literal must be a deterministic, culture-invariant ISO + // string — SQL Server parses it as datetime2 via implicit conversion. + // SPLIT RANGE does NOT accept @-parameters; the value is part of the + // DDL statement, so we render it directly. The format is + // guaranteed (yyyy-MM-ddTHH:mm:ss.fffffff) so there is no injection + // surface. + var literal = next.ToString("yyyy-MM-ddTHH:mm:ss.fffffff", CultureInfo.InvariantCulture); + + // Before every SPLIT we must (re-)set the NEXT USED filegroup on + // ps_AuditLog_Month. Even though the scheme was created with + // `ALL TO ([PRIMARY])` (which auto-populates NEXT USED once), SQL + // Server consumes that hint on the FIRST split — subsequent splits + // raise msg 7707 ("partition scheme … does not have any next used + // filegroup") unless NEXT USED is explicitly re-set. Re-issuing it + // before every split is idempotent and keeps the loop simple. + var sql = $@" +ALTER PARTITION SCHEME {PartitionSchemeName} NEXT USED [{TargetFileGroup}]; +ALTER PARTITION FUNCTION {PartitionFunctionName}() SPLIT RANGE ('{literal}');"; + + try + { + await _context.Database.ExecuteSqlRawAsync(sql, ct).ConfigureAwait(false); + added.Add(next); + } + catch (SqlException ex) + { + // Belt-and-braces: even though we read max-boundary first, an + // ALTER from another process could have raced us. Logging at + // Warning rather than Error because the desired end state + // (boundary present) is satisfied by either path. + _logger.LogWarning( + ex, + "EnsureLookaheadAsync: SPLIT RANGE for boundary {Boundary:o} failed; continuing.", + next); + } + + next = NextMonthBoundary(next); + } + + return added; + } + + /// + /// Rounds an arbitrary instant UP to the next first-of-month UTC. Inputs + /// that ARE already a first-of-month at midnight are returned as-is so + /// callers can compose this freely without double-incrementing. + /// + private static DateTime NormalizeToFirstOfMonth(DateTime instant) + { + var utc = instant.Kind == DateTimeKind.Utc + ? instant + : DateTime.SpecifyKind(instant, DateTimeKind.Utc); + + var firstOfThisMonth = new DateTime(utc.Year, utc.Month, 1, 0, 0, 0, DateTimeKind.Utc); + return utc == firstOfThisMonth ? firstOfThisMonth : firstOfThisMonth.AddMonths(1); + } + + private static DateTime NextMonthBoundary(DateTime boundary) => + boundary.AddMonths(1); +} diff --git a/src/ScadaLink.ConfigurationDatabase/ServiceCollectionExtensions.cs b/src/ScadaLink.ConfigurationDatabase/ServiceCollectionExtensions.cs index bf79b29..d926f1e 100644 --- a/src/ScadaLink.ConfigurationDatabase/ServiceCollectionExtensions.cs +++ b/src/ScadaLink.ConfigurationDatabase/ServiceCollectionExtensions.cs @@ -1,8 +1,10 @@ using Microsoft.AspNetCore.DataProtection; using Microsoft.EntityFrameworkCore; using Microsoft.Extensions.DependencyInjection; +using ScadaLink.Commons.Interfaces; using ScadaLink.Commons.Interfaces.Repositories; using ScadaLink.Commons.Interfaces.Services; +using ScadaLink.ConfigurationDatabase.Maintenance; using ScadaLink.ConfigurationDatabase.Repositories; using ScadaLink.ConfigurationDatabase.Services; @@ -52,6 +54,13 @@ public static class ServiceCollectionExtensions services.AddScoped(); services.AddScoped(); + // #23 M6 Bundle D: IPartitionMaintenance drives the daily roll-forward + // of pf_AuditLog_Month from the central AuditLogPartitionMaintenanceService + // hosted service. Scoped because the implementation reuses the per-scope + // ScadaLinkDbContext for raw-SQL execution; the hosted service opens a + // fresh scope on each tick (mirrors AuditLogPurgeActor / AuditLogIngestActor). + services.AddScoped(); + services.AddDataProtection() .PersistKeysToDbContext(); diff --git a/src/ScadaLink.Host/Program.cs b/src/ScadaLink.Host/Program.cs index b1119d1..3632824 100644 --- a/src/ScadaLink.Host/Program.cs +++ b/src/ScadaLink.Host/Program.cs @@ -84,6 +84,10 @@ try // IAuditLogRepository. The site writer chain is still registered (lazy // singletons) but is never resolved on a central node. builder.Services.AddAuditLog(builder.Configuration); + // #23 M6-T5 Bundle D — central-only hosted service that rolls + // pf_AuditLog_Month forward monthly. Depends on IPartitionMaintenance + // (registered below by AddConfigurationDatabase). + builder.Services.AddAuditLogCentralMaintenance(builder.Configuration); // Site Call Audit (#22) — central node owns the SiteCallAuditActor // singleton (M3 Bundle F). The extension itself currently registers // nothing — actor Props are constructed inline in AkkaHostedService — diff --git a/tests/ScadaLink.AuditLog.Tests/Central/AuditLogPartitionMaintenanceServiceTests.cs b/tests/ScadaLink.AuditLog.Tests/Central/AuditLogPartitionMaintenanceServiceTests.cs new file mode 100644 index 0000000..4e65207 --- /dev/null +++ b/tests/ScadaLink.AuditLog.Tests/Central/AuditLogPartitionMaintenanceServiceTests.cs @@ -0,0 +1,154 @@ +using Microsoft.Extensions.DependencyInjection; +using Microsoft.Extensions.Logging; +using Microsoft.Extensions.Logging.Abstractions; +using Microsoft.Extensions.Options; +using ScadaLink.AuditLog.Central; +using ScadaLink.Commons.Interfaces; +using Xunit; + +namespace ScadaLink.AuditLog.Tests.Central; + +/// +/// Bundle D (#23 M6-T5) tests for . +/// All tests use an in-memory stub — +/// the real EF/MSSQL implementation is exercised by the +/// AuditLogPartitionMaintenanceTests integration suite in +/// ScadaLink.ConfigurationDatabase.Tests. This file is purely +/// about the hosted service's policy decisions (start/stop, exception +/// containment). +/// +public class AuditLogPartitionMaintenanceServiceTests +{ + /// + /// Recording stub — counts EnsureLookaheadAsync invocations and lets the + /// test inject an exception per invocation to drive the catch-all path. + /// + private sealed class RecordingMaintenance : IPartitionMaintenance + { + public int EnsureCallCount; + public Exception? ThrowOnce; + + public Task> EnsureLookaheadAsync(int lookaheadMonths, CancellationToken ct = default) + { + Interlocked.Increment(ref EnsureCallCount); + if (ThrowOnce is { } ex) + { + ThrowOnce = null; + throw ex; + } + return Task.FromResult>(Array.Empty()); + } + + public Task GetMaxBoundaryAsync(CancellationToken ct = default) => + Task.FromResult(DateTime.UtcNow.AddMonths(6)); + } + + /// + /// Captures logged exceptions so the catch-all assertion can prove + /// the exception was actually logged (not silently swallowed) and was + /// the exact instance the stub threw. + /// + private sealed class CapturingLogger : ILogger + { + public List<(LogLevel Level, Exception? Exception, string Message)> Entries { get; } = new(); + + public IDisposable? BeginScope(TState state) where TState : notnull => null; + + public bool IsEnabled(LogLevel logLevel) => true; + + public void Log( + LogLevel logLevel, + EventId eventId, + TState state, + Exception? exception, + Func formatter) + { + Entries.Add((logLevel, exception, formatter(state, exception))); + } + } + + private static IServiceProvider BuildProvider(IPartitionMaintenance maintenance) + { + var services = new ServiceCollection(); + // IPartitionMaintenance is registered as scoped by AddConfigurationDatabase; + // we mirror that here so the hosted service's CreateAsyncScope + + // GetRequiredService resolves the stub the test injected. + services.AddScoped(_ => maintenance); + return services.BuildServiceProvider(); + } + + [Fact] + public async Task StartStop_NoExceptions() + { + // Long interval so only the eager startup tick fires inside the test + // window — keeps assertions deterministic without relying on + // multiple cadence loops. + var opts = Options.Create(new AuditLogPartitionMaintenanceOptions + { + IntervalSeconds = 60, + LookaheadMonths = 1, + }); + var maintenance = new RecordingMaintenance(); + var sp = BuildProvider(maintenance); + + var svc = new AuditLogPartitionMaintenanceService( + sp.GetRequiredService(), + opts, + NullLogger.Instance); + + await svc.StartAsync(CancellationToken.None); + + // Spin briefly until the startup tick has fired — the loop's first + // SafeMaintainAsync runs on a background Task.Run continuation, so + // we can't synchronously rely on its completion. + var deadline = DateTime.UtcNow.AddSeconds(3); + while (Volatile.Read(ref maintenance.EnsureCallCount) < 1 && DateTime.UtcNow < deadline) + { + await Task.Delay(20); + } + + await svc.StopAsync(CancellationToken.None); + svc.Dispose(); + + Assert.True(maintenance.EnsureCallCount >= 1, $"expected at least 1 ensure call, got {maintenance.EnsureCallCount}"); + } + + [Fact] + public async Task SafeMaintain_ExceptionLogged_NotPropagated() + { + var opts = Options.Create(new AuditLogPartitionMaintenanceOptions + { + IntervalSeconds = 60, + LookaheadMonths = 1, + }); + // The injected exception fires on the FIRST EnsureLookaheadAsync call + // (the startup tick) — the hosted service must contain it and + // continue running. + var boom = new InvalidOperationException("simulated maintenance failure"); + var maintenance = new RecordingMaintenance { ThrowOnce = boom }; + var sp = BuildProvider(maintenance); + var logger = new CapturingLogger(); + + var svc = new AuditLogPartitionMaintenanceService( + sp.GetRequiredService(), + opts, + logger); + + // StartAsync must not throw even though the very first tick will fail. + await svc.StartAsync(CancellationToken.None); + + // Wait for the error to surface in the logger. + var deadline = DateTime.UtcNow.AddSeconds(3); + while (!logger.Entries.Any(e => e.Exception == boom) && DateTime.UtcNow < deadline) + { + await Task.Delay(20); + } + + await svc.StopAsync(CancellationToken.None); + svc.Dispose(); + + var errorEntry = Assert.Single(logger.Entries, e => e.Exception == boom); + Assert.Equal(LogLevel.Error, errorEntry.Level); + Assert.Equal(1, maintenance.EnsureCallCount); + } +} diff --git a/tests/ScadaLink.ConfigurationDatabase.Tests/Maintenance/AuditLogPartitionMaintenanceTests.cs b/tests/ScadaLink.ConfigurationDatabase.Tests/Maintenance/AuditLogPartitionMaintenanceTests.cs new file mode 100644 index 0000000..2d8c6c8 --- /dev/null +++ b/tests/ScadaLink.ConfigurationDatabase.Tests/Maintenance/AuditLogPartitionMaintenanceTests.cs @@ -0,0 +1,182 @@ +using Microsoft.EntityFrameworkCore; +using Microsoft.Extensions.Logging.Abstractions; +using ScadaLink.ConfigurationDatabase.Maintenance; +using ScadaLink.ConfigurationDatabase.Tests.Migrations; +using Xunit; + +namespace ScadaLink.ConfigurationDatabase.Tests.Maintenance; + +/// +/// Bundle D (#23 M6-T5) integration tests for +/// . Uses the same +/// as the AuditLog migration / repository +/// tests so the ALTER PARTITION FUNCTION DDL runs against the actual seeded +/// pf_AuditLog_Month. +/// +/// +/// The migration seeds boundaries for every month in 2026 and 2027 (Jan 2026 +/// through Dec 2027). Tests pick a lookahead relative to the current +/// max-boundary at test start (rather than a fixed-target date) so each test +/// is robust against earlier tests in the class having added boundaries to +/// the shared fixture DB. Tests run sequentially within the class via xunit's +/// per-class collection serialisation. +/// +public class AuditLogPartitionMaintenanceTests : IClassFixture +{ + private readonly MsSqlMigrationFixture _fixture; + + public AuditLogPartitionMaintenanceTests(MsSqlMigrationFixture fixture) + { + _fixture = fixture; + } + + private ScadaLinkDbContext CreateContext() => + new(new DbContextOptionsBuilder() + .UseSqlServer(_fixture.ConnectionString).Options); + + private AuditLogPartitionMaintenance NewMaintenance(ScadaLinkDbContext ctx) => + new(ctx, NullLogger.Instance); + + /// + /// Computes the lookahead-in-months required to fall strictly inside the + /// already-covered boundary range. Picks something well below the + /// distance from "now" to the current max — guaranteed not to need any + /// new SPLIT. + /// + private static int LookaheadInsideExistingRange(DateTime max) + { + var now = DateTime.UtcNow; + // (max - now) in whole months, minus a 1-month safety margin so we + // never accidentally hit the boundary horizon edge case. + var months = ((max.Year - now.Year) * 12) + max.Month - now.Month - 1; + return Math.Max(1, months); + } + + /// + /// Computes the lookahead-in-months required to add exactly + /// new boundaries past the current max. + /// + /// + /// EnsureLookaheadAsync defines horizon = + /// NormalizeToFirstOfMonth(UtcNow) + lookaheadMonths. The new + /// boundaries it issues are first-of-month values strictly greater than + /// max, up to and including horizon. So + /// lookaheadMonths = monthsBetween(NormalizeToFirstOfMonth(UtcNow), max) + extraBoundaries + /// is the exact value that lands horizon on max + extraBoundaries + /// months. + /// + private static int LookaheadForExtraBoundaries(DateTime max, int extraBoundaries) + { + var nowFirstOfMonth = FirstOfNextMonth(DateTime.UtcNow); + var monthsToMax = ((max.Year - nowFirstOfMonth.Year) * 12) + max.Month - nowFirstOfMonth.Month; + return monthsToMax + extraBoundaries; + } + + private static DateTime FirstOfNextMonth(DateTime instant) + { + var firstOfThisMonth = new DateTime(instant.Year, instant.Month, 1, 0, 0, 0, DateTimeKind.Utc); + return firstOfThisMonth.AddMonths(1); + } + + [SkippableFact] + public async Task EnsureLookahead_AlreadyHasFutureRange_NoSplit_ReturnsEmpty() + { + Skip.IfNot(_fixture.Available, _fixture.SkipReason); + + await using var ctx = CreateContext(); + var maintenance = NewMaintenance(ctx); + + var max = await maintenance.GetMaxBoundaryAsync(); + Assert.NotNull(max); + + // Pick a lookahead small enough that horizon (NormalizeToFirstOfMonth(now) + // + lookahead) lands well INSIDE the already-covered range — no SPLIT + // should fire. + var lookahead = LookaheadInsideExistingRange(max.Value); + + var added = await maintenance.EnsureLookaheadAsync(lookahead); + + Assert.Empty(added); + + // Sanity: the max boundary is unchanged after the no-op call. + var maxAfter = await maintenance.GetMaxBoundaryAsync(); + Assert.Equal(max, maxAfter); + } + + [SkippableFact] + public async Task EnsureLookahead_NeedsOneMoreBoundary_Splits_Returns1Boundary() + { + Skip.IfNot(_fixture.Available, _fixture.SkipReason); + + await using var ctx = CreateContext(); + var maintenance = NewMaintenance(ctx); + + var maxBefore = await maintenance.GetMaxBoundaryAsync(); + Assert.NotNull(maxBefore); + + var lookahead = LookaheadForExtraBoundaries(maxBefore.Value, extraBoundaries: 1); + var expectedAdded = maxBefore.Value.AddMonths(1); + + var added = await maintenance.EnsureLookaheadAsync(lookahead); + + Assert.Single(added); + Assert.Equal(expectedAdded, added[0]); + + var maxAfter = await maintenance.GetMaxBoundaryAsync(); + Assert.Equal(expectedAdded, maxAfter); + } + + [SkippableFact] + public async Task EnsureLookahead_NeedsThreeBoundaries_Splits_Returns3Boundaries() + { + Skip.IfNot(_fixture.Available, _fixture.SkipReason); + + await using var ctx = CreateContext(); + var maintenance = NewMaintenance(ctx); + + var maxBefore = await maintenance.GetMaxBoundaryAsync(); + Assert.NotNull(maxBefore); + + var lookahead = LookaheadForExtraBoundaries(maxBefore.Value, extraBoundaries: 3); + + var added = await maintenance.EnsureLookaheadAsync(lookahead); + + Assert.Equal(3, added.Count); + Assert.Equal(maxBefore.Value.AddMonths(1), added[0]); + Assert.Equal(maxBefore.Value.AddMonths(2), added[1]); + Assert.Equal(maxBefore.Value.AddMonths(3), added[2]); + + var maxAfter = await maintenance.GetMaxBoundaryAsync(); + Assert.Equal(maxBefore.Value.AddMonths(3), maxAfter); + } + + [SkippableFact] + public async Task EnsureLookahead_BoundaryAlreadyExists_NoError_Idempotent() + { + Skip.IfNot(_fixture.Available, _fixture.SkipReason); + + await using var ctx1 = CreateContext(); + var m1 = NewMaintenance(ctx1); + + var maxStart = await m1.GetMaxBoundaryAsync(); + Assert.NotNull(maxStart); + + // First call: add one boundary. + var lookahead = LookaheadForExtraBoundaries(maxStart.Value, extraBoundaries: 1); + var firstAdded = await m1.EnsureLookaheadAsync(lookahead); + Assert.Single(firstAdded); + + // Second call: the boundary just added is now part of pf_AuditLog_Month, + // so the same lookahead value should be a no-op — no exception, no + // duplicate SPLIT. + await using var ctx2 = CreateContext(); + var m2 = NewMaintenance(ctx2); + var secondAdded = await m2.EnsureLookaheadAsync(lookahead); + + Assert.Empty(secondAdded); + + // The max boundary is unchanged across the second call. + var maxAfter = await m2.GetMaxBoundaryAsync(); + Assert.Equal(firstAdded[0], maxAfter); + } +} From e93f655ce4b5589d8f0e37a48b3269ecd1a6a68d Mon Sep 17 00:00:00 2001 From: Joseph Doherty Date: Wed, 20 May 2026 19:02:01 -0400 Subject: [PATCH 09/16] feat(health): SiteAuditBacklog metric (count + age + bytes) (#23 M6) --- .../ServiceCollectionExtensions.cs | 8 ++ .../Site/SiteAuditBacklogReporter.cs | 133 +++++++++++++++++ .../Site/SqliteAuditWriter.cs | 79 ++++++++++ .../Interfaces/Services/ISiteAuditQueue.cs | 14 ++ .../Messages/Health/SiteHealthReport.cs | 10 +- .../Types/SiteAuditBacklogSnapshot.cs | 32 +++++ .../ISiteHealthCollector.cs | 10 ++ .../SiteHealthCollector.cs | 17 ++- .../SqliteAuditWriterBacklogStatsTests.cs | 136 ++++++++++++++++++ .../SiteAuditBacklogMetricTests.cs | 73 ++++++++++ .../Actors/DeploymentManagerRedeployTests.cs | 1 + 11 files changed, 511 insertions(+), 2 deletions(-) create mode 100644 src/ScadaLink.AuditLog/Site/SiteAuditBacklogReporter.cs create mode 100644 src/ScadaLink.Commons/Types/SiteAuditBacklogSnapshot.cs create mode 100644 tests/ScadaLink.AuditLog.Tests/Site/SqliteAuditWriterBacklogStatsTests.cs create mode 100644 tests/ScadaLink.HealthMonitoring.Tests/SiteAuditBacklogMetricTests.cs diff --git a/src/ScadaLink.AuditLog/ServiceCollectionExtensions.cs b/src/ScadaLink.AuditLog/ServiceCollectionExtensions.cs index 2216eb2..7bab904 100644 --- a/src/ScadaLink.AuditLog/ServiceCollectionExtensions.cs +++ b/src/ScadaLink.AuditLog/ServiceCollectionExtensions.cs @@ -218,6 +218,14 @@ public static class ServiceCollectionExtensions ServiceDescriptor.Singleton()); services.Replace( ServiceDescriptor.Singleton()); + // M6 Bundle E (T6): the site-side backlog reporter polls the + // SqliteAuditWriter every 30 s and pushes the snapshot into the + // collector so the next SiteHealthReport carries a fresh + // SiteAuditBacklog field. Registered alongside the other site-only + // metric bridges so AddAuditLog (which runs on central too) stays + // free of hosted-service registrations that would resolve a missing + // ISiteHealthCollector on central. + services.AddHostedService(); return services; } diff --git a/src/ScadaLink.AuditLog/Site/SiteAuditBacklogReporter.cs b/src/ScadaLink.AuditLog/Site/SiteAuditBacklogReporter.cs new file mode 100644 index 0000000..955832a --- /dev/null +++ b/src/ScadaLink.AuditLog/Site/SiteAuditBacklogReporter.cs @@ -0,0 +1,133 @@ +using Microsoft.Extensions.Hosting; +using Microsoft.Extensions.Logging; +using ScadaLink.Commons.Interfaces.Services; +using ScadaLink.HealthMonitoring; + +namespace ScadaLink.AuditLog.Site; + +/// +/// Audit Log (#23) M6 Bundle E (T6) — site-side hosted service that +/// periodically pulls a backlog snapshot from +/// and pushes it into so the next +/// emits a fresh +/// SiteAuditBacklog field on the site health report. +/// +/// +/// +/// Why a hosted service, not the report sender. Querying SQLite for the +/// backlog requires the queue's write lock; doing it inline in +/// would couple the collector +/// to and turn an in-memory snapshot read into +/// a synchronous I/O call on the report path. The hosted-service pattern keeps +/// the report path pure and the SQL probe off the report timing budget. +/// +/// +/// Cadence. 30 s by default — coarse enough to amortise the SQL probe +/// across many reports, fine enough that the central dashboard never lags by +/// more than one health-report interval. Tunable via +/// in a follow-up +/// if ops needs a different cadence; for M6 we hard-code the value because the +/// brief calls it out explicitly. +/// +/// +/// Failure containment. The probe call is wrapped in a try/catch so a +/// transient SQLite error never tears down the hosted service — the next tick +/// retries. Mirrors 's +/// "exception logged, not propagated" contract. +/// +/// +public sealed class SiteAuditBacklogReporter : IHostedService, IDisposable +{ + /// + /// Default poll cadence. Half a typical 60 s health-report interval keeps + /// the snapshot fresh without spinning the SQL probe more often than + /// necessary. + /// + internal static readonly TimeSpan DefaultRefreshInterval = TimeSpan.FromSeconds(30); + + private readonly ISiteAuditQueue _queue; + private readonly ISiteHealthCollector _collector; + private readonly ILogger _logger; + private readonly TimeSpan _refreshInterval; + private CancellationTokenSource? _cts; + private Task? _loop; + + public SiteAuditBacklogReporter( + ISiteAuditQueue queue, + ISiteHealthCollector collector, + ILogger logger, + TimeSpan? refreshInterval = null) + { + _queue = queue ?? throw new ArgumentNullException(nameof(queue)); + _collector = collector ?? throw new ArgumentNullException(nameof(collector)); + _logger = logger ?? throw new ArgumentNullException(nameof(logger)); + _refreshInterval = refreshInterval ?? DefaultRefreshInterval; + } + + /// + public Task StartAsync(CancellationToken ct) + { + // Linked CTS lets StopAsync's cancellation AND the host's shutdown + // token both terminate the loop; either side firing aborts the + // pending Task.Delay. + _cts = CancellationTokenSource.CreateLinkedTokenSource(ct); + _loop = Task.Run(() => RunLoopAsync(_cts.Token)); + return Task.CompletedTask; + } + + private async Task RunLoopAsync(CancellationToken ct) + { + // First tick runs immediately so the very first health report after + // process start carries a real backlog snapshot — without this the + // dashboard would show null for the first 30 s after a deploy. + await SafeProbeAsync(ct).ConfigureAwait(false); + + while (!ct.IsCancellationRequested) + { + try + { + await Task.Delay(_refreshInterval, ct).ConfigureAwait(false); + } + catch (OperationCanceledException) + { + break; + } + + await SafeProbeAsync(ct).ConfigureAwait(false); + } + } + + private async Task SafeProbeAsync(CancellationToken ct) + { + try + { + var snapshot = await _queue.GetBacklogStatsAsync(ct).ConfigureAwait(false); + _collector.UpdateSiteAuditBacklog(snapshot); + } + catch (OperationCanceledException) + { + // Shutdown — let the outer loop exit cleanly. + throw; + } + catch (Exception ex) + { + // Catch-all is deliberate: the hosted service must survive every + // class of probe failure (transient SQLite lock contention, disk + // I/O hiccup, …) so the next tick gets a chance. + _logger.LogWarning(ex, "SiteAuditBacklogReporter probe failed; next tick will retry."); + } + } + + /// + public Task StopAsync(CancellationToken ct) + { + _cts?.Cancel(); + return _loop ?? Task.CompletedTask; + } + + /// + public void Dispose() + { + _cts?.Dispose(); + } +} diff --git a/src/ScadaLink.AuditLog/Site/SqliteAuditWriter.cs b/src/ScadaLink.AuditLog/Site/SqliteAuditWriter.cs index b00f205..bf5cb8b 100644 --- a/src/ScadaLink.AuditLog/Site/SqliteAuditWriter.cs +++ b/src/ScadaLink.AuditLog/Site/SqliteAuditWriter.cs @@ -4,6 +4,7 @@ using Microsoft.Extensions.Logging; using Microsoft.Extensions.Options; using ScadaLink.Commons.Entities.Audit; using ScadaLink.Commons.Interfaces.Services; +using ScadaLink.Commons.Types; using ScadaLink.Commons.Types.Enums; namespace ScadaLink.AuditLog.Site; @@ -484,6 +485,84 @@ public class SqliteAuditWriter : IAuditWriter, ISiteAuditQueue, IAsyncDisposable } } + /// + /// M6 Bundle E (T6) health-metric surface: returns a point-in-time snapshot + /// of the site queue's pending count, the oldest pending row's + /// , and the on-disk file size. Called + /// by the site-side SiteAuditBacklogReporter hosted service on its + /// 30 s tick to refresh the SiteHealthReport.SiteAuditBacklog field. + /// + /// + /// The pending-count + oldest-row queries run inside the same write lock as + /// the hot-path INSERT batch so the snapshot is consistent against the + /// connection's view (no torn read of an in-flight transaction). The on-disk + /// size lookup happens OUTSIDE the lock — it's a stat() call on the file + /// path and doesn't touch the connection. In-memory and missing files + /// return 0 bytes (the snapshot is for ops dashboards, not a correctness + /// invariant). + /// + public Task GetBacklogStatsAsync(CancellationToken ct = default) + { + int pendingCount; + DateTime? oldestPending; + + lock (_writeLock) + { + ObjectDisposedException.ThrowIf(_disposed, this); + + // Single round-trip — COUNT(*) + MIN(OccurredAtUtc) over the same + // index range avoids a second scan. The IX_SiteAuditLog_ForwardState_Occurred + // index makes both aggregates cheap (count is a covering scan, min + // is the first key). + using var cmd = _connection.CreateCommand(); + cmd.CommandText = """ + SELECT COUNT(*), MIN(OccurredAtUtc) + FROM AuditLog + WHERE ForwardState = $pending; + """; + cmd.Parameters.AddWithValue("$pending", AuditForwardState.Pending.ToString()); + + using var reader = cmd.ExecuteReader(); + reader.Read(); + pendingCount = reader.GetInt32(0); + oldestPending = reader.IsDBNull(1) + ? null + : DateTime.Parse(reader.GetString(1), + System.Globalization.CultureInfo.InvariantCulture, + System.Globalization.DateTimeStyles.RoundtripKind); + } + + // File-size lookup outside the lock — the DatabasePath option is the + // canonical source. The connection-string-override branch (used by + // some tests) keeps the same DatabasePath value, so this works + // uniformly. In-memory / mode=memory paths return 0 because the file + // doesn't exist on disk. + long onDiskBytes = 0; + try + { + if (!string.IsNullOrEmpty(_options.DatabasePath) && + !_options.DatabasePath.StartsWith(":memory:", StringComparison.Ordinal) && + !_options.DatabasePath.Contains("mode=memory", StringComparison.OrdinalIgnoreCase) && + File.Exists(_options.DatabasePath)) + { + onDiskBytes = new FileInfo(_options.DatabasePath).Length; + } + } + catch (Exception ex) + { + // File system probe is a best-effort health-metric — never abort + // a backlog snapshot because stat() failed. Log and report 0. + _logger.LogDebug(ex, + "SqliteAuditWriter could not stat DB path {Path} for backlog snapshot.", + _options.DatabasePath); + } + + return Task.FromResult(new SiteAuditBacklogSnapshot( + PendingCount: pendingCount, + OldestPendingUtc: oldestPending, + OnDiskBytes: onDiskBytes)); + } + private static DateTime EnsureUtc(DateTime value) => value.Kind == DateTimeKind.Utc ? value diff --git a/src/ScadaLink.Commons/Interfaces/Services/ISiteAuditQueue.cs b/src/ScadaLink.Commons/Interfaces/Services/ISiteAuditQueue.cs index 32d8646..c9e0462 100644 --- a/src/ScadaLink.Commons/Interfaces/Services/ISiteAuditQueue.cs +++ b/src/ScadaLink.Commons/Interfaces/Services/ISiteAuditQueue.cs @@ -1,4 +1,5 @@ using ScadaLink.Commons.Entities.Audit; +using ScadaLink.Commons.Types; namespace ScadaLink.Commons.Interfaces.Services; @@ -70,4 +71,17 @@ public interface ISiteAuditQueue /// are left untouched (idempotent re-call). Non-existent ids are silent no-ops. /// Task MarkReconciledAsync(IReadOnlyList eventIds, CancellationToken ct = default); + + /// + /// M6 Bundle E (T6) health-metric surface: returns a point-in-time snapshot + /// of the site queue's pending count + oldest pending timestamp + on-disk + /// SQLite file size. Surfaced on + /// as + /// SiteAuditBacklog by the periodic SiteAuditBacklogReporter + /// hosted service so a stuck site→central drain is visible on the central + /// health dashboard. Safe to call concurrently with hot-path writes — + /// implementations are expected to take the same connection lock used by + /// the hot-path INSERT batch and the drain queries. + /// + Task GetBacklogStatsAsync(CancellationToken ct = default); } diff --git a/src/ScadaLink.Commons/Messages/Health/SiteHealthReport.cs b/src/ScadaLink.Commons/Messages/Health/SiteHealthReport.cs index bba4c8d..5567037 100644 --- a/src/ScadaLink.Commons/Messages/Health/SiteHealthReport.cs +++ b/src/ScadaLink.Commons/Messages/Health/SiteHealthReport.cs @@ -1,3 +1,4 @@ +using ScadaLink.Commons.Types; using ScadaLink.Commons.Types.Enums; namespace ScadaLink.Commons.Messages.Health; @@ -32,7 +33,14 @@ public record SiteHealthReport( // marker). Surfaces a misconfigured / catastrophic regex on // /monitoring/health. Defaults to 0 for back-compat with existing // producers and tests that don't construct the field. - int AuditRedactionFailure = 0); + int AuditRedactionFailure = 0, + // Audit Log (#23) M6 Bundle E (T6): point-in-time snapshot of the + // site-local SQLite audit-log queue (pending count, oldest pending row, + // on-disk bytes). Populated by the site-side SiteAuditBacklogReporter + // hosted service every 30 s. Defaults to null so existing producers / + // tests that don't refresh the snapshot stay valid; the central health + // surface treats null as "no data yet" rather than a zeroed queue. + SiteAuditBacklogSnapshot? SiteAuditBacklog = null); /// /// Broadcast wrapper used between central nodes to keep per-node diff --git a/src/ScadaLink.Commons/Types/SiteAuditBacklogSnapshot.cs b/src/ScadaLink.Commons/Types/SiteAuditBacklogSnapshot.cs new file mode 100644 index 0000000..687a743 --- /dev/null +++ b/src/ScadaLink.Commons/Types/SiteAuditBacklogSnapshot.cs @@ -0,0 +1,32 @@ +namespace ScadaLink.Commons.Types; + +/// +/// Audit Log (#23) M6 Bundle E (T6) — point-in-time snapshot of the site-local +/// SQLite audit-log queue health, surfaced on +/// as +/// SiteAuditBacklog and refreshed periodically by the +/// SiteAuditBacklogReporter hosted service. +/// +/// +/// Number of rows currently in +/// — i.e. +/// not yet acknowledged by central via either the push-telemetry or +/// reconciliation-pull paths. A persistently non-zero value with rising +/// indicates the site→central drain isn't +/// keeping up. +/// +/// +/// of +/// the oldest Pending row, or null if the queue is empty. Used by ops +/// to compute backlog age without a separate query. +/// +/// +/// Size of the SQLite file on disk in bytes, or 0 if the writer is +/// running against an in-memory database. Mirrors the 7-day retention +/// invariant (alog.md §10) — a steady file-size growth past the retention +/// window points at a stuck purge or a stuck forwarder. +/// +public sealed record SiteAuditBacklogSnapshot( + int PendingCount, + DateTime? OldestPendingUtc, + long OnDiskBytes); diff --git a/src/ScadaLink.HealthMonitoring/ISiteHealthCollector.cs b/src/ScadaLink.HealthMonitoring/ISiteHealthCollector.cs index bcd5f9e..a1ca37b 100644 --- a/src/ScadaLink.HealthMonitoring/ISiteHealthCollector.cs +++ b/src/ScadaLink.HealthMonitoring/ISiteHealthCollector.cs @@ -1,4 +1,5 @@ using ScadaLink.Commons.Messages.Health; +using ScadaLink.Commons.Types; using ScadaLink.Commons.Types.Enums; namespace ScadaLink.HealthMonitoring; @@ -28,6 +29,15 @@ public interface ISiteHealthCollector /// AddAuditLogHealthMetricsBridge(). /// void IncrementAuditRedactionFailure(); + /// + /// Audit Log (#23) M6 Bundle E (T6) — replace the latest site-local + /// audit-queue backlog snapshot (pending count, oldest pending row, + /// on-disk file bytes) used by the next call. + /// Refreshed periodically by the SiteAuditBacklogReporter hosted + /// service so each report carries a recent point-in-time view of the + /// site→central drain health. + /// + void UpdateSiteAuditBacklog(SiteAuditBacklogSnapshot snapshot); void UpdateConnectionHealth(string connectionName, ConnectionHealth health); void RemoveConnection(string connectionName); void UpdateTagResolution(string connectionName, int totalSubscribed, int successfullyResolved); diff --git a/src/ScadaLink.HealthMonitoring/SiteHealthCollector.cs b/src/ScadaLink.HealthMonitoring/SiteHealthCollector.cs index 47567c9..6f55061 100644 --- a/src/ScadaLink.HealthMonitoring/SiteHealthCollector.cs +++ b/src/ScadaLink.HealthMonitoring/SiteHealthCollector.cs @@ -1,5 +1,6 @@ using System.Collections.Concurrent; using ScadaLink.Commons.Messages.Health; +using ScadaLink.Commons.Types; using ScadaLink.Commons.Types.Enums; namespace ScadaLink.HealthMonitoring; @@ -15,6 +16,7 @@ public class SiteHealthCollector : ISiteHealthCollector private int _deadLetterCount; private int _siteAuditWriteFailures; private int _auditRedactionFailures; + private volatile SiteAuditBacklogSnapshot? _siteAuditBacklog; private readonly ConcurrentDictionary _connectionStatuses = new(); private readonly ConcurrentDictionary _tagResolutionCounts = new(); private readonly ConcurrentDictionary _connectionEndpoints = new(); @@ -89,6 +91,18 @@ public class SiteHealthCollector : ISiteHealthCollector Interlocked.Increment(ref _auditRedactionFailures); } + /// + /// Audit Log (#23) M6 Bundle E (T6) — replace the latest backlog snapshot + /// from the site SQLite writer. The field is a single reference write + /// (volatile) so the next sees the most recent + /// snapshot — there is no count to reset, the report just carries forward + /// whatever was last refreshed. + /// + public void UpdateSiteAuditBacklog(SiteAuditBacklogSnapshot snapshot) + { + _siteAuditBacklog = snapshot ?? throw new ArgumentNullException(nameof(snapshot)); + } + /// /// Update the health status for a named data connection. /// Called by DCL when connection state changes. @@ -207,6 +221,7 @@ public class SiteHealthCollector : ISiteHealthCollector ParkedMessageCount: Interlocked.CompareExchange(ref _parkedMessageCount, 0, 0), ClusterNodes: _clusterNodes?.ToList(), SiteAuditWriteFailures: siteAuditWriteFailures, - AuditRedactionFailure: auditRedactionFailures); + AuditRedactionFailure: auditRedactionFailures, + SiteAuditBacklog: _siteAuditBacklog); } } diff --git a/tests/ScadaLink.AuditLog.Tests/Site/SqliteAuditWriterBacklogStatsTests.cs b/tests/ScadaLink.AuditLog.Tests/Site/SqliteAuditWriterBacklogStatsTests.cs new file mode 100644 index 0000000..95f9570 --- /dev/null +++ b/tests/ScadaLink.AuditLog.Tests/Site/SqliteAuditWriterBacklogStatsTests.cs @@ -0,0 +1,136 @@ +using Microsoft.Extensions.Logging.Abstractions; +using Microsoft.Extensions.Options; +using ScadaLink.AuditLog.Site; +using ScadaLink.Commons.Entities.Audit; +using ScadaLink.Commons.Types.Enums; + +namespace ScadaLink.AuditLog.Tests.Site; + +/// +/// Bundle E (M6-T6) tests for . +/// Exercises the health-metric surface that SiteAuditBacklogReporter +/// polls every 30 s and pushes onto the site health report as +/// SiteAuditBacklog. +/// +public class SqliteAuditWriterBacklogStatsTests : IDisposable +{ + private readonly string _dbPath; + + public SqliteAuditWriterBacklogStatsTests() + { + // OnDiskBytes assertions only make sense against a real file — the + // shared-cache in-memory mode returns 0 for the file size, so this + // suite is opinionated about file-backed storage. Tests in + // SqliteAuditWriterWriteTests use in-memory for performance reasons. + _dbPath = Path.Combine(Path.GetTempPath(), + $"audit-backlog-stats-{Guid.NewGuid():N}.db"); + } + + public void Dispose() + { + if (File.Exists(_dbPath)) + { + try { File.Delete(_dbPath); } catch { /* test cleanup best-effort */ } + } + } + + private SqliteAuditWriter CreateWriter() + { + var options = new SqliteAuditWriterOptions { DatabasePath = _dbPath }; + return new SqliteAuditWriter( + Options.Create(options), + NullLogger.Instance); + } + + private static AuditEvent NewEvent(DateTime? occurredAtUtc = null) => new() + { + EventId = Guid.NewGuid(), + OccurredAtUtc = occurredAtUtc ?? DateTime.UtcNow, + Channel = AuditChannel.ApiOutbound, + Kind = AuditKind.ApiCall, + Status = AuditStatus.Delivered, + PayloadTruncated = false, + }; + + [Fact] + public async Task EmptyDb_Returns_Zero_Null_AndZeroBytes() + { + // No file exists yet — the writer ctor creates one but no rows are + // inserted; the snapshot should report a clean queue. OnDiskBytes is + // allowed to be zero (fresh ftruncate) OR small (page header) — the + // contract only requires non-negative; we assert >= 0 and exercise + // the pending fields strictly. + await using var writer = CreateWriter(); + + var snapshot = await writer.GetBacklogStatsAsync(); + + Assert.Equal(0, snapshot.PendingCount); + Assert.Null(snapshot.OldestPendingUtc); + Assert.True(snapshot.OnDiskBytes >= 0, + $"OnDiskBytes must be non-negative, got {snapshot.OnDiskBytes}"); + } + + [Fact] + public async Task Pending_5_Returns_5() + { + await using var writer = CreateWriter(); + + for (var i = 0; i < 5; i++) + { + await writer.WriteAsync(NewEvent()); + } + + var snapshot = await writer.GetBacklogStatsAsync(); + + Assert.Equal(5, snapshot.PendingCount); + } + + [Fact] + public async Task OldestPending_Is_Earliest_OccurredAtUtc() + { + await using var writer = CreateWriter(); + + var t1 = new DateTime(2026, 5, 20, 10, 0, 0, DateTimeKind.Utc); + var t2 = new DateTime(2026, 5, 20, 10, 1, 0, DateTimeKind.Utc); + var t3 = new DateTime(2026, 5, 20, 10, 2, 0, DateTimeKind.Utc); + + // Insert out of order so the snapshot is not "the last write" by + // accident — the OldestPendingUtc must come from a column-min, not + // an insertion-order proxy. + await writer.WriteAsync(NewEvent(t2)); + await writer.WriteAsync(NewEvent(t1)); + await writer.WriteAsync(NewEvent(t3)); + + var snapshot = await writer.GetBacklogStatsAsync(); + + Assert.Equal(3, snapshot.PendingCount); + Assert.NotNull(snapshot.OldestPendingUtc); + // The DB round-trips OccurredAtUtc through the "o" format which + // preserves Kind=Utc — assert tick-equality. + Assert.Equal(t1, snapshot.OldestPendingUtc!.Value); + } + + [Fact] + public async Task OnDiskBytes_ReturnsFileSize() + { + await using var writer = CreateWriter(); + + // Insert enough rows to grow the file past the empty schema baseline. + for (var i = 0; i < 100; i++) + { + await writer.WriteAsync(NewEvent()); + } + + var snapshot = await writer.GetBacklogStatsAsync(); + + // The exact size depends on SQLite page allocation, but a file-backed + // db with 100 inserted rows MUST be larger than the empty schema + // (a few pages, ~4 KB). The implementation should return the + // FileInfo.Length value verbatim. + Assert.True(File.Exists(_dbPath), $"DB file should exist at {_dbPath}"); + var expected = new FileInfo(_dbPath).Length; + Assert.Equal(expected, snapshot.OnDiskBytes); + Assert.True(snapshot.OnDiskBytes > 0, + $"after 100 inserts OnDiskBytes must be > 0, got {snapshot.OnDiskBytes}"); + } +} diff --git a/tests/ScadaLink.HealthMonitoring.Tests/SiteAuditBacklogMetricTests.cs b/tests/ScadaLink.HealthMonitoring.Tests/SiteAuditBacklogMetricTests.cs new file mode 100644 index 0000000..a57f773 --- /dev/null +++ b/tests/ScadaLink.HealthMonitoring.Tests/SiteAuditBacklogMetricTests.cs @@ -0,0 +1,73 @@ +using ScadaLink.Commons.Types; + +namespace ScadaLink.HealthMonitoring.Tests; + +/// +/// Bundle E (M6-T6) regression coverage. The site-side audit-log SQLite writer +/// exposes a backlog snapshot (SiteAuditBacklogSnapshot) via the +/// ISiteAuditQueue.GetBacklogStatsAsync surface. A periodic +/// SiteAuditBacklogReporter hosted service polls that snapshot and +/// pushes it into the collector via +/// so the next includes it in +/// the report payload as SiteAuditBacklog. Unlike the +/// SiteAuditWriteFailures / AuditRedactionFailure interval counters, the +/// backlog snapshot is not reset on collect — the field carries forward +/// whatever the most recent refresh pushed in. +/// +public class SiteAuditBacklogMetricTests +{ + private readonly SiteHealthCollector _collector = new(); + + [Fact] + public void Update_Then_CollectReport_IncludesBacklog() + { + var snapshot = new SiteAuditBacklogSnapshot( + PendingCount: 42, + OldestPendingUtc: new DateTime(2026, 5, 20, 10, 0, 0, DateTimeKind.Utc), + OnDiskBytes: 1234567); + + _collector.UpdateSiteAuditBacklog(snapshot); + + var report = _collector.CollectReport("site-1"); + + Assert.Equal(snapshot, report.SiteAuditBacklog); + } + + [Fact] + public void Report_Payload_Includes_SiteAuditBacklog_AsNullByDefault() + { + // No refresh has been pushed yet — the report carries null so the + // central UI can distinguish "no data yet" from "queue empty". + var report = _collector.CollectReport("site-1"); + + Assert.Null(report.SiteAuditBacklog); + } + + [Fact] + public void CollectReport_DoesNotReset_SiteAuditBacklog() + { + // Backlog snapshot is a point-in-time reading, not a per-interval + // counter — successive CollectReport calls before the next + // SiteAuditBacklogReporter tick MUST keep returning the same snapshot + // so a slow refresh cadence doesn't blank the central dashboard. + var snapshot = new SiteAuditBacklogSnapshot( + PendingCount: 7, + OldestPendingUtc: null, + OnDiskBytes: 8192); + + _collector.UpdateSiteAuditBacklog(snapshot); + + var first = _collector.CollectReport("site-1"); + var second = _collector.CollectReport("site-1"); + + Assert.Equal(snapshot, first.SiteAuditBacklog); + Assert.Equal(snapshot, second.SiteAuditBacklog); + } + + [Fact] + public void Update_With_Null_Throws_ArgumentNullException() + { + Assert.Throws( + () => _collector.UpdateSiteAuditBacklog(null!)); + } +} diff --git a/tests/ScadaLink.SiteRuntime.Tests/Actors/DeploymentManagerRedeployTests.cs b/tests/ScadaLink.SiteRuntime.Tests/Actors/DeploymentManagerRedeployTests.cs index aec5917..e514acd 100644 --- a/tests/ScadaLink.SiteRuntime.Tests/Actors/DeploymentManagerRedeployTests.cs +++ b/tests/ScadaLink.SiteRuntime.Tests/Actors/DeploymentManagerRedeployTests.cs @@ -71,6 +71,7 @@ public class DeploymentManagerRedeployTests : TestKit, IDisposable public void IncrementDeadLetter() { } public void IncrementSiteAuditWriteFailures() { } public void IncrementAuditRedactionFailure() { } + public void UpdateSiteAuditBacklog(ScadaLink.Commons.Types.SiteAuditBacklogSnapshot snapshot) { } public void UpdateConnectionHealth(string connectionName, ConnectionHealth health) { } public void RemoveConnection(string connectionName) { } public void UpdateTagResolution(string connectionName, int totalSubscribed, int successfullyResolved) { } From 42333a72ed415cc2888d576e39adbee429cb6b8b Mon Sep 17 00:00:00 2001 From: Joseph Doherty Date: Wed, 20 May 2026 19:07:44 -0400 Subject: [PATCH 10/16] feat(health): SiteAuditTelemetryStalledTracker subscribes to EventStream (#23 M6) --- .../SiteAuditTelemetryStalledTracker.cs | 158 ++++++++++++++++++ .../ServiceCollectionExtensions.cs | 11 ++ .../SiteAuditTelemetryStalledTrackerTests.cs | 116 +++++++++++++ 3 files changed, 285 insertions(+) create mode 100644 src/ScadaLink.AuditLog/Central/SiteAuditTelemetryStalledTracker.cs create mode 100644 tests/ScadaLink.AuditLog.Tests/Central/SiteAuditTelemetryStalledTrackerTests.cs diff --git a/src/ScadaLink.AuditLog/Central/SiteAuditTelemetryStalledTracker.cs b/src/ScadaLink.AuditLog/Central/SiteAuditTelemetryStalledTracker.cs new file mode 100644 index 0000000..40624e7 --- /dev/null +++ b/src/ScadaLink.AuditLog/Central/SiteAuditTelemetryStalledTracker.cs @@ -0,0 +1,158 @@ +using System.Collections.Concurrent; +using Akka.Actor; +using Akka.Event; + +namespace ScadaLink.AuditLog.Central; + +/// +/// Audit Log (#23) M6 Bundle E (T7) — central singleton that subscribes to the +/// actor system's EventStream for +/// publications and maintains a per-site latched stalled-state map readable +/// via . Consumed by the M6 Bundle E +/// aggregator so the central health +/// surface can surface per-site "reconciliation isn't draining" without +/// coupling the publisher () to the +/// health collection plumbing. +/// +/// +/// +/// Why an internal actor. Akka.NET's only +/// supports subscribers — there is no callback or +/// channel-based overload. The tracker therefore spawns a small subscriber +/// actor that forwards each event into the shared +/// on the actor's thread, and +/// readers () take a copy off that dictionary on any +/// thread. Mirrors the DeadLetterMonitorActor shape — subscribe in +/// , unsubscribe in +/// , which the tracker triggers via a Stop +/// at . +/// +/// +/// Per-site latching. The publisher () +/// only publishes on stalled-state transitions, so the dictionary is the +/// authoritative latched state. Sites that have never published are absent +/// from the snapshot — the consumer surface treats absence as +/// Stalled=false (default healthy), the same default the reconciliation +/// actor's own internal latch uses. +/// +/// +/// Singleton lifecycle. Registered as a singleton via +/// ; +/// tears the internal subscriber down at host shutdown. +/// +/// +public sealed class SiteAuditTelemetryStalledTracker : IDisposable +{ + private readonly EventStream _eventStream; + private readonly ConcurrentDictionary _state = new(); + private readonly IActorRef? _subscriber; + private bool _disposed; + + /// + /// Construct around a bare . Intended for unit + /// tests where the caller wants to publish events without standing up an + /// actor system — the tracker registers a transient subscriber actor only + /// if the supplied stream is backed by an actor system. In the bare-stream + /// mode (no actor system) the tracker still exposes the + /// surface but cannot self-subscribe; production + /// callers always go through . + /// + /// + /// Subscribing to requires an , + /// which can only be created from an . The bare- + /// stream ctor therefore can NOT itself wire the subscriber — tests that + /// want event-driven updates must use the ActorSystem ctor (or push state + /// directly via ). The tests in + /// SiteAuditTelemetryStalledTrackerTests use the ActorSystem ctor + /// via Akka.TestKit so they exercise the production subscribe path. + /// + public SiteAuditTelemetryStalledTracker(EventStream eventStream) + { + _eventStream = eventStream ?? throw new ArgumentNullException(nameof(eventStream)); + // No subscriber actor — see the remarks above. Apply() is exposed + // internally so the ActorSystem ctor's internal forwarder can update + // state without re-implementing the dictionary write. + _subscriber = null; + } + + /// + /// Production ctor: subscribes a small internal actor to the supplied + /// system's EventStream so every published + /// updates the latched + /// per-site map. tears the subscriber down. + /// + public SiteAuditTelemetryStalledTracker(ActorSystem actorSystem) + { + ArgumentNullException.ThrowIfNull(actorSystem); + _eventStream = actorSystem.EventStream; + // Anonymous subscriber actor scoped to the system; props build it + // with a callback into THIS tracker's Apply method so the actor's + // single-threaded receive serialises every dictionary write. + _subscriber = actorSystem.ActorOf( + Props.Create(() => new StalledChangedSubscriber(this)), + name: $"site-audit-stalled-tracker-{Guid.NewGuid():N}"); + // Subscribe synchronously from the ctor so the subscription is in + // place before the tracker is returned to the caller — the actor's + // own PreStart runs asynchronously and would otherwise race the + // first publish. EventStream.Subscribe is thread-safe. + _eventStream.Subscribe(_subscriber, typeof(SiteAuditTelemetryStalledChanged)); + } + + /// + /// Returns a defensive copy of the per-site latched stalled state. + /// Absent sites are interpreted as Stalled=false by consumers. + /// + public IReadOnlyDictionary Snapshot() => + new Dictionary(_state); + + /// + /// Applied by the internal subscriber actor on every + /// publication. Exposed + /// internally so tests against the bare-stream ctor can still drive the + /// tracker, but the production path always goes through the actor. + /// + internal void Apply(SiteAuditTelemetryStalledChanged evt) + { + if (evt is null) return; + _state[evt.SiteId] = evt.Stalled; + } + + public void Dispose() + { + if (_disposed) return; + _disposed = true; + if (_subscriber is not null) + { + // Unsubscribe runs in PostStop on the subscriber actor; Stop is + // fire-and-forget but the actor's PostStop hook is guaranteed to + // run before its mailbox is collected. + _subscriber.Tell(PoisonPill.Instance); + } + } + + /// + /// Internal subscriber actor — receives every + /// off the EventStream and + /// forwards it into the parent . + /// Unlike DeadLetterMonitorActor, the subscription is registered by + /// the tracker constructor BEFORE this actor begins processing messages so + /// publishes that arrive between actor creation and PreStart cannot be + /// missed. Unsubscribe still runs in . + /// + private sealed class StalledChangedSubscriber : ReceiveActor + { + private readonly SiteAuditTelemetryStalledTracker _parent; + + public StalledChangedSubscriber(SiteAuditTelemetryStalledTracker parent) + { + _parent = parent; + Receive(evt => _parent.Apply(evt)); + } + + protected override void PostStop() + { + Context.System.EventStream.Unsubscribe(Self, typeof(SiteAuditTelemetryStalledChanged)); + base.PostStop(); + } + } +} diff --git a/src/ScadaLink.AuditLog/ServiceCollectionExtensions.cs b/src/ScadaLink.AuditLog/ServiceCollectionExtensions.cs index 7bab904..5cde08b 100644 --- a/src/ScadaLink.AuditLog/ServiceCollectionExtensions.cs +++ b/src/ScadaLink.AuditLog/ServiceCollectionExtensions.cs @@ -259,6 +259,17 @@ public static class ServiceCollectionExtensions .Bind(config.GetSection(PartitionMaintenanceSectionName)); services.AddHostedService(); + // M6 Bundle E (T7): central-singleton tracker for per-site + // SiteAuditTelemetryStalledChanged events published by + // SiteAuditReconciliationActor. Singleton so the EventStream + // subscription registers exactly once on host startup; the tracker's + // ctor needs an ActorSystem which is composed by the AkkaHostedService + // — we go through a factory so the SP is the source of truth for the + // system reference rather than re-resolving it on the consumer side. + services.AddSingleton(sp => + new SiteAuditTelemetryStalledTracker( + sp.GetRequiredService())); + return services; } } diff --git a/tests/ScadaLink.AuditLog.Tests/Central/SiteAuditTelemetryStalledTrackerTests.cs b/tests/ScadaLink.AuditLog.Tests/Central/SiteAuditTelemetryStalledTrackerTests.cs new file mode 100644 index 0000000..7c375a1 --- /dev/null +++ b/tests/ScadaLink.AuditLog.Tests/Central/SiteAuditTelemetryStalledTrackerTests.cs @@ -0,0 +1,116 @@ +using Akka.Actor; +using Akka.TestKit.Xunit2; +using ScadaLink.AuditLog.Central; + +namespace ScadaLink.AuditLog.Tests.Central; + +/// +/// Bundle E (M6-T7) tests for . +/// The tracker subscribes to the actor system's EventStream for +/// publications and maintains a +/// per-site latch the central health surface can read. Since reconciliation is +/// central-driven, the "stalled" state semantically belongs to central — not +/// to the per-site +/// payload (which the site itself emits). The tracker therefore lives as a +/// central singleton, not on the site health collector. +/// +public class SiteAuditTelemetryStalledTrackerTests : TestKit +{ + /// + /// Helper: publishes a stalled-changed event on the actor system's + /// EventStream and waits a moment for the tracker's subscribe callback to + /// run. AwaitAssert avoids racing on the stream's async fan-out. + /// + private void PublishAndWait(SiteAuditTelemetryStalledTracker tracker, SiteAuditTelemetryStalledChanged evt) + { + Sys.EventStream.Publish(evt); + AwaitAssert( + () => + { + var snapshot = tracker.Snapshot(); + Assert.True(snapshot.TryGetValue(evt.SiteId, out var stalled), + $"tracker did not record event for {evt.SiteId}"); + Assert.Equal(evt.Stalled, stalled); + }, + duration: TimeSpan.FromSeconds(2), + interval: TimeSpan.FromMilliseconds(20)); + } + + [Fact] + public void Initial_Snapshot_IsEmpty() + { + using var tracker = new SiteAuditTelemetryStalledTracker(Sys); + + var snapshot = tracker.Snapshot(); + + Assert.Empty(snapshot); + } + + [Fact] + public void StalledTrue_Event_TrackerReports_Stalled() + { + using var tracker = new SiteAuditTelemetryStalledTracker(Sys); + + PublishAndWait(tracker, new SiteAuditTelemetryStalledChanged("siteA", Stalled: true)); + + var snapshot = tracker.Snapshot(); + Assert.True(snapshot["siteA"]); + } + + [Fact] + public void StalledFalse_Event_TrackerReports_NotStalled() + { + using var tracker = new SiteAuditTelemetryStalledTracker(Sys); + + // First flip the site into stalled so the false transition has a + // prior value to overwrite — mirrors how the reconciliation actor + // only publishes false after a true. + PublishAndWait(tracker, new SiteAuditTelemetryStalledChanged("siteA", Stalled: true)); + PublishAndWait(tracker, new SiteAuditTelemetryStalledChanged("siteA", Stalled: false)); + + var snapshot = tracker.Snapshot(); + Assert.False(snapshot["siteA"]); + } + + [Fact] + public void Multiple_Sites_Tracked_Independently() + { + using var tracker = new SiteAuditTelemetryStalledTracker(Sys); + + PublishAndWait(tracker, new SiteAuditTelemetryStalledChanged("siteA", Stalled: true)); + PublishAndWait(tracker, new SiteAuditTelemetryStalledChanged("siteB", Stalled: false)); + PublishAndWait(tracker, new SiteAuditTelemetryStalledChanged("siteC", Stalled: true)); + + var snapshot = tracker.Snapshot(); + Assert.Equal(3, snapshot.Count); + Assert.True(snapshot["siteA"]); + Assert.False(snapshot["siteB"]); + Assert.True(snapshot["siteC"]); + } + + [Fact] + public void Constructor_With_Null_ActorSystem_Throws() + { + Assert.Throws( + () => new SiteAuditTelemetryStalledTracker((ActorSystem)null!)); + } + + [Fact] + public void Dispose_Unsubscribes_From_EventStream() + { + var tracker = new SiteAuditTelemetryStalledTracker(Sys); + + PublishAndWait(tracker, new SiteAuditTelemetryStalledChanged("siteA", Stalled: true)); + + tracker.Dispose(); + + // After dispose any further events are ignored — the snapshot + // reflects the last known state at dispose time. + Sys.EventStream.Publish(new SiteAuditTelemetryStalledChanged("siteA", Stalled: false)); + + // Give the stream a moment in case the unsubscribe is racey; the + // assertion is that siteA stays at true. + Thread.Sleep(50); + Assert.True(tracker.Snapshot()["siteA"]); + } +} From 70ed8d4557b7f258a9bddaf39b496814ebe0f036 Mon Sep 17 00:00:00 2001 From: Joseph Doherty Date: Wed, 20 May 2026 19:11:52 -0400 Subject: [PATCH 11/16] feat(health): CentralAuditWriteFailures + AuditCentralHealthSnapshot (#23 M6) --- .../Central/AuditCentralHealthSnapshot.cs | 70 ++++++++ .../Central/AuditLogIngestActor.cs | 17 ++ .../Central/CentralAuditWriter.cs | 24 ++- .../Central/IAuditCentralHealthSnapshot.cs | 62 +++++++ .../ICentralAuditWriteFailureCounter.cs | 23 +++ .../NoOpCentralAuditWriteFailureCounter.cs | 17 ++ .../ServiceCollectionExtensions.cs | 36 ++++- .../Central/CentralAuditWriteFailuresTests.cs | 151 ++++++++++++++++++ 8 files changed, 398 insertions(+), 2 deletions(-) create mode 100644 src/ScadaLink.AuditLog/Central/AuditCentralHealthSnapshot.cs create mode 100644 src/ScadaLink.AuditLog/Central/IAuditCentralHealthSnapshot.cs create mode 100644 src/ScadaLink.AuditLog/Central/ICentralAuditWriteFailureCounter.cs create mode 100644 src/ScadaLink.AuditLog/Central/NoOpCentralAuditWriteFailureCounter.cs create mode 100644 tests/ScadaLink.AuditLog.Tests/Central/CentralAuditWriteFailuresTests.cs diff --git a/src/ScadaLink.AuditLog/Central/AuditCentralHealthSnapshot.cs b/src/ScadaLink.AuditLog/Central/AuditCentralHealthSnapshot.cs new file mode 100644 index 0000000..c44453b --- /dev/null +++ b/src/ScadaLink.AuditLog/Central/AuditCentralHealthSnapshot.cs @@ -0,0 +1,70 @@ +using ScadaLink.AuditLog.Payload; + +namespace ScadaLink.AuditLog.Central; + +/// +/// Audit Log (#23) M6 Bundle E (T8, T9) — central singleton implementation of +/// . Owns thread-safe +/// counters for +/// CentralAuditWriteFailures + AuditRedactionFailure and +/// delegates SiteAuditTelemetryStalled to the +/// . Also implements the writer +/// surfaces ( + +/// ) so a single concrete object +/// is the source of truth — DI binds those two interfaces to this same +/// singleton instance on the central composition root. +/// +/// +/// +/// Why one type for read + write. The writer interfaces are tiny +/// (Increment()) and the read surface needs visibility of those +/// counters anyway — having a single class own both means the +/// Interlocked field IS the snapshot value, no extra plumbing needed. +/// Mirrors the +/// pattern where +/// the collector both receives and exposes the metric. +/// +/// +/// Tracker dependency. +/// is a separate singleton that owns its own actor lifecycle; this snapshot +/// just reads its +/// surface on each access. Keeping +/// the tracker as a separate type avoids tangling EventStream subscription +/// state with the simple Interlocked counters here. +/// +/// +public sealed class AuditCentralHealthSnapshot + : IAuditCentralHealthSnapshot, + ICentralAuditWriteFailureCounter, + IAuditRedactionFailureCounter +{ + private int _centralAuditWriteFailures; + private int _auditRedactionFailure; + private readonly SiteAuditTelemetryStalledTracker _stalledTracker; + + public AuditCentralHealthSnapshot(SiteAuditTelemetryStalledTracker stalledTracker) + { + _stalledTracker = stalledTracker + ?? throw new ArgumentNullException(nameof(stalledTracker)); + } + + /// + public int CentralAuditWriteFailures => + Interlocked.CompareExchange(ref _centralAuditWriteFailures, 0, 0); + + /// + public int AuditRedactionFailure => + Interlocked.CompareExchange(ref _auditRedactionFailure, 0, 0); + + /// + public IReadOnlyDictionary SiteAuditTelemetryStalled => + _stalledTracker.Snapshot(); + + /// + void ICentralAuditWriteFailureCounter.Increment() => + Interlocked.Increment(ref _centralAuditWriteFailures); + + /// + void IAuditRedactionFailureCounter.Increment() => + Interlocked.Increment(ref _auditRedactionFailure); +} diff --git a/src/ScadaLink.AuditLog/Central/AuditLogIngestActor.cs b/src/ScadaLink.AuditLog/Central/AuditLogIngestActor.cs index 8e7f21b..61a6daf 100644 --- a/src/ScadaLink.AuditLog/Central/AuditLogIngestActor.cs +++ b/src/ScadaLink.AuditLog/Central/AuditLogIngestActor.cs @@ -124,6 +124,7 @@ public class AuditLogIngestActor : ReceiveActor IServiceScope? scope = null; IAuditLogRepository repository; IAuditPayloadFilter? filter = null; + ICentralAuditWriteFailureCounter? failureCounter = null; if (_injectedRepository is not null) { repository = _injectedRepository; @@ -133,6 +134,10 @@ public class AuditLogIngestActor : ReceiveActor scope = _serviceProvider!.CreateScope(); repository = scope.ServiceProvider.GetRequiredService(); filter = scope.ServiceProvider.GetService(); + // M6 Bundle E (T8): central health counter is best-effort — + // unregistered (test composition roots) means the per-row catch + // simply logs without surfacing on the health dashboard. + failureCounter = scope.ServiceProvider.GetService(); } try @@ -157,6 +162,10 @@ public class AuditLogIngestActor : ReceiveActor { // Per-row catch — one bad row never sinks the whole batch. // The row stays Pending at the site; the next drain retries. + // M6 Bundle E (T8): bump the central health counter so a + // sustained insert-throw failure surfaces on the dashboard. + try { failureCounter?.Increment(); } + catch { /* counter must never throw — defence in depth */ } _logger.LogError(ex, "Failed to persist audit event {EventId} during batch ingest; row will be retried by the site.", evt.EventId); @@ -204,6 +213,10 @@ public class AuditLogIngestActor : ReceiveActor // never throw, so we can apply it inside the per-entry try // without risking an unbounded blast radius. var filter = scope.ServiceProvider.GetService(); + // M6 Bundle E (T8): same best-effort central health counter as + // the OnIngestAsync path — null on test composition roots that + // skip the registration. + var failureCounter = scope.ServiceProvider.GetService(); foreach (var entry in cmd.Entries) { @@ -240,6 +253,10 @@ public class AuditLogIngestActor : ReceiveActor // EventId is NOT added to `accepted` so the site keeps its // row Pending and retries on the next drain. Other entries // in the batch continue with their own transactions. + // M6 Bundle E (T8): bump the central health counter so a + // sustained dual-write failure surfaces on the dashboard. + try { failureCounter?.Increment(); } + catch { /* counter must never throw — defence in depth */ } _logger.LogError( ex, "Combined telemetry dual-write failed for AuditEvent {EventId} / TrackedOperationId {TrackedOpId}; rolled back.", diff --git a/src/ScadaLink.AuditLog/Central/CentralAuditWriter.cs b/src/ScadaLink.AuditLog/Central/CentralAuditWriter.cs index ff48bea..80bfc45 100644 --- a/src/ScadaLink.AuditLog/Central/CentralAuditWriter.cs +++ b/src/ScadaLink.AuditLog/Central/CentralAuditWriter.cs @@ -42,6 +42,7 @@ public sealed class CentralAuditWriter : ICentralAuditWriter private readonly IServiceProvider _services; private readonly ILogger _logger; private readonly IAuditPayloadFilter? _filter; + private readonly ICentralAuditWriteFailureCounter _failureCounter; /// /// Bundle C (M5-T6) — the central direct-write path used by the @@ -50,15 +51,23 @@ public sealed class CentralAuditWriter : ICentralAuditWriter /// optional so the M4 test composition roots that don't pass one keep /// working (they only ever write small payloads); production DI registers /// the real filter via . + /// M6 Bundle E (T8) — adds the optional + /// so a swallowed repository + /// throw bumps the central health surface's + /// CentralAuditWriteFailures counter. Defaults to a NoOp so test + /// composition roots that don't wire the counter keep their current + /// behaviour. /// public CentralAuditWriter( IServiceProvider services, ILogger logger, - IAuditPayloadFilter? filter = null) + IAuditPayloadFilter? filter = null, + ICentralAuditWriteFailureCounter? failureCounter = null) { _services = services ?? throw new ArgumentNullException(nameof(services)); _logger = logger ?? throw new ArgumentNullException(nameof(logger)); _filter = filter; + _failureCounter = failureCounter ?? new NoOpCentralAuditWriteFailureCounter(); } /// @@ -92,6 +101,19 @@ public sealed class CentralAuditWriter : ICentralAuditWriter catch (Exception ex) { // Audit failure NEVER aborts the user-facing action — swallow and log. + // M6 Bundle E (T8): also surface the failure on the central health + // counter so a sustained audit-write outage is visible on the + // health dashboard rather than disappearing into the log file. + try + { + _failureCounter.Increment(); + } + catch + { + // Counter must NEVER throw — defence in depth. Even if a + // misbehaving custom counter does, swallowing here keeps the + // best-effort contract intact. + } _logger.LogWarning( ex, "CentralAuditWriter failed for EventId {EventId} (Kind={Kind}, Status={Status})", diff --git a/src/ScadaLink.AuditLog/Central/IAuditCentralHealthSnapshot.cs b/src/ScadaLink.AuditLog/Central/IAuditCentralHealthSnapshot.cs new file mode 100644 index 0000000..6b7fae2 --- /dev/null +++ b/src/ScadaLink.AuditLog/Central/IAuditCentralHealthSnapshot.cs @@ -0,0 +1,62 @@ +using ScadaLink.AuditLog.Payload; + +namespace ScadaLink.AuditLog.Central; + +/// +/// Audit Log (#23) M6 Bundle E read-side surface exposing the central-side +/// audit-health counters: (every +/// repository insert throw from / +/// ), +/// (every payload-filter redactor throw on the central path), and +/// (per-site latched state from the +/// ). +/// +/// +/// +/// Read-only contract. Implementations expose a point-in-time snapshot +/// — increments and tracker updates happen through the dedicated counter / +/// tracker interfaces, not through this surface. Consumers (M7+ central +/// health pages) read these properties; they never mutate. +/// +/// +/// Why a parallel surface from . +/// aggregates per-site +/// SiteHealthState reports the SITE emits. The central audit-write +/// failure / redaction-failure counters originate ON central (no site report +/// carries them), so they live on a dedicated snapshot rather than being +/// retro-fitted into a per-site state. The two surfaces will be composed at +/// the M7 dashboard layer. +/// +/// +public interface IAuditCentralHealthSnapshot +{ + /// + /// Count of central-side audit-write failures since process start. + /// Incremented by every / + /// repository insert that throws. + /// + int CentralAuditWriteFailures { get; } + + /// + /// Count of central-side payload-filter redactor over-redactions since + /// process start. Incremented by every header / body / SQL-parameter + /// redactor stage that throws (the filter falls back to the + /// <redacted: redactor error> marker and never aborts the + /// user-facing action). Sites have their own counter + /// (-backed + /// SiteHealthReport.AuditRedactionFailure) and the central + /// composition root's binding routes ALL central redactor throws + /// (CentralAuditWriter + AuditLogIngestActor paths) into this counter. + /// + int AuditRedactionFailure { get; } + + /// + /// Per-site latched stalled state: true when the + /// has observed two + /// consecutive non-draining cycles for that site, false after the + /// first draining cycle. Sites absent from the map are interpreted as + /// healthy (Stalled=false default). Snapshot is a defensive + /// copy — readers must not mutate. + /// + IReadOnlyDictionary SiteAuditTelemetryStalled { get; } +} diff --git a/src/ScadaLink.AuditLog/Central/ICentralAuditWriteFailureCounter.cs b/src/ScadaLink.AuditLog/Central/ICentralAuditWriteFailureCounter.cs new file mode 100644 index 0000000..4e34256 --- /dev/null +++ b/src/ScadaLink.AuditLog/Central/ICentralAuditWriteFailureCounter.cs @@ -0,0 +1,23 @@ +namespace ScadaLink.AuditLog.Central; + +/// +/// Audit Log (#23) M6 Bundle E (T8) counter sink invoked by central-side audit +/// writers (, ) +/// every time a repository InsertIfNotExistsAsync throws. Mirrors the +/// site-side +/// shape one-for-one — same one-method contract, same NoOp default, same +/// must-never-abort-the-user-facing-action invariant. +/// +/// +/// Audit-write failures NEVER abort the user-facing action (alog.md §13) — +/// the writer swallows the exception and surfaces the failure via this counter +/// instead. A NoOp default is the correct safe fallback while the central +/// health surface is being wired in; +/// is the production binding that routes increments into the aggregated +/// central health snapshot consumed by future M7+ pages. +/// +public interface ICentralAuditWriteFailureCounter +{ + /// Increment the central audit-write failure counter by one. + void Increment(); +} diff --git a/src/ScadaLink.AuditLog/Central/NoOpCentralAuditWriteFailureCounter.cs b/src/ScadaLink.AuditLog/Central/NoOpCentralAuditWriteFailureCounter.cs new file mode 100644 index 0000000..d4eb216 --- /dev/null +++ b/src/ScadaLink.AuditLog/Central/NoOpCentralAuditWriteFailureCounter.cs @@ -0,0 +1,17 @@ +namespace ScadaLink.AuditLog.Central; + +/// +/// Default binding used when +/// the central health surface () has +/// not been wired (test composition roots, site-only hosts that incidentally +/// resolve a ). Drops every increment on the +/// floor. Mirrors . +/// +public sealed class NoOpCentralAuditWriteFailureCounter : ICentralAuditWriteFailureCounter +{ + /// + public void Increment() + { + // intentional no-op + } +} diff --git a/src/ScadaLink.AuditLog/ServiceCollectionExtensions.cs b/src/ScadaLink.AuditLog/ServiceCollectionExtensions.cs index 5cde08b..2fe18ea 100644 --- a/src/ScadaLink.AuditLog/ServiceCollectionExtensions.cs +++ b/src/ScadaLink.AuditLog/ServiceCollectionExtensions.cs @@ -155,6 +155,13 @@ public static class ServiceCollectionExtensions services.AddSingleton( sp => sp.GetRequiredService()); + // M6 Bundle E (T8): central audit-write failure counter — NoOp default + // for site/test composition roots that don't wire the central health + // snapshot. AddAuditLogCentralMaintenance below replaces this binding + // with the AuditCentralHealthSnapshot implementation so increments + // surface on the central dashboard. + services.TryAddSingleton(); + // M4 Bundle B: central direct-write audit writer used by // NotificationOutboxActor (Bundle B) and Inbound API (Bundle C/D) to // emit AuditLog rows that originate ON central, not via site telemetry. @@ -167,10 +174,13 @@ public static class ServiceCollectionExtensions // Bundle C (M5-T6): wire the IAuditPayloadFilter into the factory so // NotificationOutboxActor + Inbound API rows are truncated + redacted // before they hit MS SQL. + // M6 Bundle E (T8): also wire the ICentralAuditWriteFailureCounter + // so swallowed repo throws bump the central health counter. services.AddSingleton(sp => new CentralAuditWriter( sp, sp.GetRequiredService>(), - sp.GetRequiredService())); + sp.GetRequiredService(), + sp.GetRequiredService())); return services; } @@ -270,6 +280,30 @@ public static class ServiceCollectionExtensions new SiteAuditTelemetryStalledTracker( sp.GetRequiredService())); + // M6 Bundle E (T8 + T9): central health snapshot — a single object + // that owns the CentralAuditWriteFailures + AuditRedactionFailure + // Interlocked counters AND surfaces them on + // IAuditCentralHealthSnapshot. The same instance is bound to BOTH + // writer-side interfaces (ICentralAuditWriteFailureCounter + + // IAuditRedactionFailureCounter) so every central-side increment + // routes into the shared counters; site nodes keep their existing + // Site bridges (registered by AddAuditLogHealthMetricsBridge) so + // the same counter type does not shadow the site-side metric. + services.AddSingleton(); + services.AddSingleton( + sp => sp.GetRequiredService()); + services.Replace(ServiceDescriptor.Singleton( + sp => sp.GetRequiredService())); + // M6 Bundle E (T9): override the NoOp IAuditRedactionFailureCounter + // (registered by AddAuditLog) with the central snapshot binding so + // payload-filter throws on CentralAuditWriter / AuditLogIngestActor + // paths surface on the central dashboard. The site composition root + // overrides this binding AGAIN via AddAuditLogHealthMetricsBridge — + // central nodes do not call that bridge, so this is the final + // binding on a central host. + services.Replace(ServiceDescriptor.Singleton( + sp => sp.GetRequiredService())); + return services; } } diff --git a/tests/ScadaLink.AuditLog.Tests/Central/CentralAuditWriteFailuresTests.cs b/tests/ScadaLink.AuditLog.Tests/Central/CentralAuditWriteFailuresTests.cs new file mode 100644 index 0000000..0383e09 --- /dev/null +++ b/tests/ScadaLink.AuditLog.Tests/Central/CentralAuditWriteFailuresTests.cs @@ -0,0 +1,151 @@ +using Akka.Actor; +using Akka.TestKit.Xunit2; +using Microsoft.Extensions.DependencyInjection; +using Microsoft.Extensions.Logging.Abstractions; +using ScadaLink.AuditLog.Central; +using ScadaLink.Commons.Entities.Audit; +using ScadaLink.Commons.Interfaces.Repositories; +using ScadaLink.Commons.Messages.Audit; +using ScadaLink.Commons.Types.Audit; +using ScadaLink.Commons.Types.Enums; + +namespace ScadaLink.AuditLog.Tests.Central; + +/// +/// Bundle E (M6-T8) regression coverage for the central-side audit-write +/// failure counter. and +/// both swallow repository throws (audit +/// must NEVER abort the user-facing action, alog.md §13) but bump the +/// so the central health +/// surface () can flag a sustained +/// outage. +/// +public class CentralAuditWriteFailuresTests : TestKit +{ + private static AuditEvent NewEvent() => new() + { + EventId = Guid.NewGuid(), + OccurredAtUtc = DateTime.UtcNow, + Channel = AuditChannel.ApiOutbound, + Kind = AuditKind.ApiCall, + Status = AuditStatus.Delivered, + }; + + /// + /// Repository stub that always throws on insert — exercises the failure + /// path in both and + /// . + /// + private sealed class ThrowingRepo : IAuditLogRepository + { + public Task InsertIfNotExistsAsync(AuditEvent evt, CancellationToken ct = default) => + throw new InvalidOperationException("simulated repo failure"); + public Task> QueryAsync( + AuditLogQueryFilter filter, AuditLogPaging paging, CancellationToken ct = default) => + Task.FromResult>(Array.Empty()); + public Task SwitchOutPartitionAsync(DateTime monthBoundary, CancellationToken ct = default) => + Task.FromResult(0L); + public Task> GetPartitionBoundariesOlderThanAsync( + DateTime threshold, CancellationToken ct = default) => + Task.FromResult>(Array.Empty()); + } + + /// + /// In-memory recording + /// every call so tests can assert on the count. + /// + private sealed class RecordingFailureCounter : ICentralAuditWriteFailureCounter + { + private int _count; + public int Count => Volatile.Read(ref _count); + public void Increment() => Interlocked.Increment(ref _count); + } + + [Fact] + public async Task Forced_Failure_Increments_Counter() + { + // Direct test: build the writer with a throwing scope and verify the + // injected counter is bumped on the swallowed insert exception. + var counter = new RecordingFailureCounter(); + var services = new ServiceCollection(); + services.AddScoped(); + var sp = services.BuildServiceProvider(); + + var writer = new CentralAuditWriter( + sp, + NullLogger.Instance, + filter: null, + failureCounter: counter); + + // WriteAsync swallows the exception and increments the counter. + await writer.WriteAsync(NewEvent()); + + Assert.Equal(1, counter.Count); + } + + [Fact] + public async Task AuditLogIngestActor_Failure_Increments_Counter() + { + // The actor's production ctor resolves both IAuditLogRepository AND + // ICentralAuditWriteFailureCounter from the scope per-message; we + // register both and verify the per-row catch bumps the counter for + // every row in the batch. + var counter = new RecordingFailureCounter(); + var services = new ServiceCollection(); + services.AddScoped(); + // Counter is a singleton — the actor's per-message scope still + // resolves the same instance via the scope's parent provider. + services.AddSingleton(counter); + var sp = services.BuildServiceProvider(); + + var actor = Sys.ActorOf(Props.Create(() => new AuditLogIngestActor( + sp, NullLogger.Instance))); + + var batch = new[] { NewEvent(), NewEvent(), NewEvent() }; + var reply = await actor.Ask( + new IngestAuditEventsCommand(batch), TimeSpan.FromSeconds(5)); + + // Every row threw → none accepted, counter bumped once per row. + Assert.Empty(reply.AcceptedEventIds); + Assert.Equal(batch.Length, counter.Count); + } + + [Fact] + public void Snapshot_Aggregates_Counters_And_StalledState() + { + // AuditCentralHealthSnapshot implements both writer surfaces; bumping + // through the writer interfaces is reflected on the read surface, and + // SiteAuditTelemetryStalled is sourced from the injected tracker. + using var tracker = new SiteAuditTelemetryStalledTracker(Sys); + var snapshot = new AuditCentralHealthSnapshot(tracker); + + Assert.Equal(0, snapshot.CentralAuditWriteFailures); + Assert.Equal(0, snapshot.AuditRedactionFailure); + Assert.Empty(snapshot.SiteAuditTelemetryStalled); + + ((ICentralAuditWriteFailureCounter)snapshot).Increment(); + ((ICentralAuditWriteFailureCounter)snapshot).Increment(); + ((ScadaLink.AuditLog.Payload.IAuditRedactionFailureCounter)snapshot).Increment(); + + // Publish a stalled-changed event so the tracker registers a site. + Sys.EventStream.Publish(new SiteAuditTelemetryStalledChanged("siteA", Stalled: true)); + AwaitAssert(() => + { + var stalledMap = snapshot.SiteAuditTelemetryStalled; + Assert.True(stalledMap.TryGetValue("siteA", out var s) && s, + "expected siteA to be stalled in snapshot"); + }, + duration: TimeSpan.FromSeconds(2), + interval: TimeSpan.FromMilliseconds(20)); + + Assert.Equal(2, snapshot.CentralAuditWriteFailures); + Assert.Equal(1, snapshot.AuditRedactionFailure); + } + + [Fact] + public void AuditCentralHealthSnapshot_Construction_Without_Tracker_Throws() + { + Assert.Throws( + () => new AuditCentralHealthSnapshot(null!)); + } +} From 2744011ce9ee8746e811fc7d08759a8df9379023 Mon Sep 17 00:00:00 2001 From: Joseph Doherty Date: Wed, 20 May 2026 19:13:19 -0400 Subject: [PATCH 12/16] feat(health): surface AuditRedactionFailure in central snapshot (#23 M6) --- .../CentralAuditRedactionFailureCounter.cs | 57 +++++++++++ .../ServiceCollectionExtensions.cs | 16 +-- ...entralAuditRedactionFailureCounterTests.cs | 99 +++++++++++++++++++ 3 files changed, 166 insertions(+), 6 deletions(-) create mode 100644 src/ScadaLink.AuditLog/Central/CentralAuditRedactionFailureCounter.cs create mode 100644 tests/ScadaLink.AuditLog.Tests/Central/CentralAuditRedactionFailureCounterTests.cs diff --git a/src/ScadaLink.AuditLog/Central/CentralAuditRedactionFailureCounter.cs b/src/ScadaLink.AuditLog/Central/CentralAuditRedactionFailureCounter.cs new file mode 100644 index 0000000..102b6d9 --- /dev/null +++ b/src/ScadaLink.AuditLog/Central/CentralAuditRedactionFailureCounter.cs @@ -0,0 +1,57 @@ +using ScadaLink.AuditLog.Payload; + +namespace ScadaLink.AuditLog.Central; + +/// +/// Audit Log (#23) M6 Bundle E (T9) — bridges +/// (incremented by +/// every time a header / body / SQL +/// parameter redactor stage throws and the filter has to over-redact the +/// offending field) into so the +/// failure surfaces on the central health surface as +/// AuditCentralHealthSnapshot.AuditRedactionFailure. +/// +/// +/// +/// Site vs central. M5 Bundle C wired the SITE-side bridge +/// (), +/// which routes increments into the site health report payload's +/// AuditRedactionFailure field. That handles redactor failures on the +/// site SQLite hot-path (FallbackAuditWriter). M6 Bundle E (T9) adds the +/// MIRROR bridge here so the same payload filter — when it runs on the +/// central / +/// paths — surfaces its failures on the +/// central dashboard rather than disappearing into a NoOp. +/// +/// +/// Registration shape. Site composition roots call +/// , +/// which overrides the binding with the site bridge. Central composition +/// roots call , +/// which overrides with this central bridge. A node never wears both hats — +/// site and central are distinct host roles — so the two bridges never +/// fight over the same binding at runtime. +/// +/// +/// Why not a thin wrapper around the snapshot directly? The snapshot +/// itself could be the bound implementation (it already implements +/// ), but a dedicated class makes +/// the central-vs-site asymmetry explicit at the DI boundary — readers of +/// +/// see "site → site bridge, central → central bridge", matching the +/// +/// shape one-for-one. +/// +/// +public sealed class CentralAuditRedactionFailureCounter : IAuditRedactionFailureCounter +{ + private readonly AuditCentralHealthSnapshot _snapshot; + + public CentralAuditRedactionFailureCounter(AuditCentralHealthSnapshot snapshot) + { + _snapshot = snapshot ?? throw new ArgumentNullException(nameof(snapshot)); + } + + /// + public void Increment() => ((IAuditRedactionFailureCounter)_snapshot).Increment(); +} diff --git a/src/ScadaLink.AuditLog/ServiceCollectionExtensions.cs b/src/ScadaLink.AuditLog/ServiceCollectionExtensions.cs index 2fe18ea..6460b1a 100644 --- a/src/ScadaLink.AuditLog/ServiceCollectionExtensions.cs +++ b/src/ScadaLink.AuditLog/ServiceCollectionExtensions.cs @@ -295,14 +295,18 @@ public static class ServiceCollectionExtensions services.Replace(ServiceDescriptor.Singleton( sp => sp.GetRequiredService())); // M6 Bundle E (T9): override the NoOp IAuditRedactionFailureCounter - // (registered by AddAuditLog) with the central snapshot binding so - // payload-filter throws on CentralAuditWriter / AuditLogIngestActor - // paths surface on the central dashboard. The site composition root + // (registered by AddAuditLog) with the CentralAuditRedactionFailureCounter + // bridge so payload-filter throws on CentralAuditWriter / + // AuditLogIngestActor paths surface on the central dashboard. The + // bridge is a thin wrapper around the AuditCentralHealthSnapshot + // singleton so all central redactor failures route into the same + // counter as CentralAuditWriteFailures. The site composition root // overrides this binding AGAIN via AddAuditLogHealthMetricsBridge — // central nodes do not call that bridge, so this is the final - // binding on a central host. - services.Replace(ServiceDescriptor.Singleton( - sp => sp.GetRequiredService())); + // binding on a central host. Mirrors the M5 Bundle C + // HealthMetricsAuditRedactionFailureCounter shape one-for-one. + services.Replace(ServiceDescriptor.Singleton()); return services; } diff --git a/tests/ScadaLink.AuditLog.Tests/Central/CentralAuditRedactionFailureCounterTests.cs b/tests/ScadaLink.AuditLog.Tests/Central/CentralAuditRedactionFailureCounterTests.cs new file mode 100644 index 0000000..7fbfebf --- /dev/null +++ b/tests/ScadaLink.AuditLog.Tests/Central/CentralAuditRedactionFailureCounterTests.cs @@ -0,0 +1,99 @@ +using Akka.Actor; +using Akka.TestKit.Xunit2; +using Microsoft.Extensions.Configuration; +using Microsoft.Extensions.DependencyInjection; +using Microsoft.Extensions.Logging; +using Microsoft.Extensions.Logging.Abstractions; +using ScadaLink.AuditLog; +using ScadaLink.AuditLog.Central; +using ScadaLink.AuditLog.Payload; + +namespace ScadaLink.AuditLog.Tests.Central; + +/// +/// Bundle E (M6-T9) coverage for the central-side payload-filter redactor +/// failure bridge. M5 wired the SITE bridge +/// (HealthMetricsAuditRedactionFailureCounter) that pushes increments +/// into the site health report; M6 mirrors that with +/// so the same payload +/// filter — when it runs on the central writer paths — surfaces failures on +/// the central . +/// +public class CentralAuditRedactionFailureCounterTests : TestKit +{ + [Fact] + public void Increment_Routes_To_Snapshot() + { + using var tracker = new SiteAuditTelemetryStalledTracker(Sys); + var snapshot = new AuditCentralHealthSnapshot(tracker); + var counter = new CentralAuditRedactionFailureCounter(snapshot); + + counter.Increment(); + counter.Increment(); + counter.Increment(); + + Assert.Equal(3, snapshot.AuditRedactionFailure); + } + + [Fact] + public void Construction_With_Null_Snapshot_Throws() + { + Assert.Throws( + () => new CentralAuditRedactionFailureCounter(null!)); + } + + [Fact] + public void AddAuditLogCentralMaintenance_Replaces_IAuditRedactionFailureCounter_With_CentralImpl() + { + // AddAuditLog registers NoOp; AddAuditLogCentralMaintenance is the + // override path. The replaced binding MUST resolve to the central + // bridge — a site host that wires AddAuditLogHealthMetricsBridge + // instead would resolve to the site bridge (covered in + // AddAuditLogTests). + var config = new ConfigurationBuilder() + .AddInMemoryCollection(new Dictionary + { + ["AuditLog:SiteWriter:DatabasePath"] = ":memory:", + }) + .Build(); + + var services = new ServiceCollection(); + services.AddSingleton(); + services.AddSingleton(typeof(ILogger<>), typeof(NullLogger<>)); + // The AuditCentralHealthSnapshot ctor takes the stalled tracker + // which itself needs an ActorSystem — register a real system + // (test-kit's Sys) so the DI graph composes. + services.AddSingleton(Sys); + services.AddAuditLog(config); + services.AddAuditLogCentralMaintenance(config); + using var provider = services.BuildServiceProvider(); + + var counter = provider.GetRequiredService(); + + Assert.IsType(counter); + } + + [Fact] + public void AddAuditLog_Default_IAuditRedactionFailureCounter_Is_NoOp() + { + // Sanity check: without AddAuditLogCentralMaintenance the default + // remains the NoOp from M5 — the central bridge only takes effect + // when the central-only registration runs. + var config = new ConfigurationBuilder() + .AddInMemoryCollection(new Dictionary + { + ["AuditLog:SiteWriter:DatabasePath"] = ":memory:", + }) + .Build(); + + var services = new ServiceCollection(); + services.AddSingleton(); + services.AddSingleton(typeof(ILogger<>), typeof(NullLogger<>)); + services.AddAuditLog(config); + using var provider = services.BuildServiceProvider(); + + var counter = provider.GetRequiredService(); + + Assert.IsType(counter); + } +} From ef49b55cf6c116d1d523ecca21bc2f5bc6cd9194 Mon Sep 17 00:00:00 2001 From: Joseph Doherty Date: Wed, 20 May 2026 19:25:28 -0400 Subject: [PATCH 13/16] fix(health): decouple AuditCentralHealthSnapshot from ActorSystem (#23 M6) The snapshot's per-site stalled latch now lives on the snapshot itself and is fed by SiteAuditTelemetryStalledTracker via ApplyStalled, removing the chain that required ActorSystem at DI composition time. The tracker is now constructed by AkkaHostedService once ActorSystem.Create returns, with a lock-guarded auxiliary-disposable list so concurrent host start/stop in tests cannot race the enumeration. --- .../Central/AuditCentralHealthSnapshot.cs | 46 +++++++++------ .../SiteAuditTelemetryStalledTracker.cs | 36 +++++++++++- .../ServiceCollectionExtensions.cs | 16 ++---- .../Actors/AkkaHostedService.cs | 57 +++++++++++++++++++ ...entralAuditRedactionFailureCounterTests.cs | 11 ++-- .../Central/CentralAuditWriteFailuresTests.cs | 23 +++++--- 6 files changed, 144 insertions(+), 45 deletions(-) diff --git a/src/ScadaLink.AuditLog/Central/AuditCentralHealthSnapshot.cs b/src/ScadaLink.AuditLog/Central/AuditCentralHealthSnapshot.cs index c44453b..e728c51 100644 --- a/src/ScadaLink.AuditLog/Central/AuditCentralHealthSnapshot.cs +++ b/src/ScadaLink.AuditLog/Central/AuditCentralHealthSnapshot.cs @@ -1,3 +1,4 @@ +using System.Collections.Concurrent; using ScadaLink.AuditLog.Payload; namespace ScadaLink.AuditLog.Central; @@ -6,10 +7,10 @@ namespace ScadaLink.AuditLog.Central; /// Audit Log (#23) M6 Bundle E (T8, T9) — central singleton implementation of /// . Owns thread-safe /// counters for -/// CentralAuditWriteFailures + AuditRedactionFailure and -/// delegates SiteAuditTelemetryStalled to the -/// . Also implements the writer -/// surfaces ( + +/// CentralAuditWriteFailures + AuditRedactionFailure and a +/// per-site latched stalled-state map fed by the +/// . Also implements the +/// writer surfaces ( + /// ) so a single concrete object /// is the source of truth — DI binds those two interfaces to this same /// singleton instance on the central composition root. @@ -25,12 +26,14 @@ namespace ScadaLink.AuditLog.Central; /// the collector both receives and exposes the metric. /// /// -/// Tracker dependency. -/// is a separate singleton that owns its own actor lifecycle; this snapshot -/// just reads its -/// surface on each access. Keeping -/// the tracker as a separate type avoids tangling EventStream subscription -/// state with the simple Interlocked counters here. +/// Stalled-state plumbing. The per-site stalled latch lives directly +/// on this snapshot. is the +/// EventStream subscriber that pushes +/// publications in via +/// . Keeping the dictionary on this type (rather +/// than reading the tracker on every access) lets the snapshot be constructed +/// without an dependency — the tracker +/// is wired up later from the Akka bootstrap, once the system is built. /// /// public sealed class AuditCentralHealthSnapshot @@ -40,13 +43,7 @@ public sealed class AuditCentralHealthSnapshot { private int _centralAuditWriteFailures; private int _auditRedactionFailure; - private readonly SiteAuditTelemetryStalledTracker _stalledTracker; - - public AuditCentralHealthSnapshot(SiteAuditTelemetryStalledTracker stalledTracker) - { - _stalledTracker = stalledTracker - ?? throw new ArgumentNullException(nameof(stalledTracker)); - } + private readonly ConcurrentDictionary _stalled = new(); /// public int CentralAuditWriteFailures => @@ -58,7 +55,20 @@ public sealed class AuditCentralHealthSnapshot /// public IReadOnlyDictionary SiteAuditTelemetryStalled => - _stalledTracker.Snapshot(); + new Dictionary(_stalled); + + /// + /// Apply a publication + /// observed by . Public + /// so the tracker (which lives in the same assembly but is constructed + /// later from the Akka host) can push without a friend reference; + /// readers should call . + /// + public void ApplyStalled(SiteAuditTelemetryStalledChanged evt) + { + if (evt is null) return; + _stalled[evt.SiteId] = evt.Stalled; + } /// void ICentralAuditWriteFailureCounter.Increment() => diff --git a/src/ScadaLink.AuditLog/Central/SiteAuditTelemetryStalledTracker.cs b/src/ScadaLink.AuditLog/Central/SiteAuditTelemetryStalledTracker.cs index 40624e7..e1ed0fd 100644 --- a/src/ScadaLink.AuditLog/Central/SiteAuditTelemetryStalledTracker.cs +++ b/src/ScadaLink.AuditLog/Central/SiteAuditTelemetryStalledTracker.cs @@ -46,6 +46,7 @@ public sealed class SiteAuditTelemetryStalledTracker : IDisposable private readonly EventStream _eventStream; private readonly ConcurrentDictionary _state = new(); private readonly IActorRef? _subscriber; + private readonly AuditCentralHealthSnapshot? _snapshot; private bool _disposed; /// @@ -67,12 +68,24 @@ public sealed class SiteAuditTelemetryStalledTracker : IDisposable /// via Akka.TestKit so they exercise the production subscribe path. /// public SiteAuditTelemetryStalledTracker(EventStream eventStream) + : this(eventStream, snapshot: null) + { + } + + /// + /// Bare-stream ctor with an optional snapshot sink — the central + /// composition root passes the singleton + /// so every dictionary update + /// also lands on the central health surface. The bare ctor still cannot + /// subscribe (no actor system), but tests that drive the tracker via + /// get the snapshot push for free. + /// + public SiteAuditTelemetryStalledTracker(EventStream eventStream, AuditCentralHealthSnapshot? snapshot) { _eventStream = eventStream ?? throw new ArgumentNullException(nameof(eventStream)); - // No subscriber actor — see the remarks above. Apply() is exposed - // internally so the ActorSystem ctor's internal forwarder can update - // state without re-implementing the dictionary write. + // No subscriber actor — see the remarks on the parameterless overload. _subscriber = null; + _snapshot = snapshot; } /// @@ -82,9 +95,21 @@ public sealed class SiteAuditTelemetryStalledTracker : IDisposable /// per-site map. tears the subscriber down. /// public SiteAuditTelemetryStalledTracker(ActorSystem actorSystem) + : this(actorSystem, snapshot: null) + { + } + + /// + /// Production ctor with a snapshot sink — every observed + /// is mirrored onto the + /// shared so the central health + /// surface sees per-site stalled state without re-reading the tracker. + /// + public SiteAuditTelemetryStalledTracker(ActorSystem actorSystem, AuditCentralHealthSnapshot? snapshot) { ArgumentNullException.ThrowIfNull(actorSystem); _eventStream = actorSystem.EventStream; + _snapshot = snapshot; // Anonymous subscriber actor scoped to the system; props build it // with a callback into THIS tracker's Apply method so the actor's // single-threaded receive serialises every dictionary write. @@ -115,6 +140,11 @@ public sealed class SiteAuditTelemetryStalledTracker : IDisposable { if (evt is null) return; _state[evt.SiteId] = evt.Stalled; + // Mirror into the central health snapshot if wired so a reader of + // IAuditCentralHealthSnapshot sees the same per-site state without + // a second lookup. Snapshot is optional (test composition roots may + // skip it) so the null-coalesce is the safe path. + _snapshot?.ApplyStalled(evt); } public void Dispose() diff --git a/src/ScadaLink.AuditLog/ServiceCollectionExtensions.cs b/src/ScadaLink.AuditLog/ServiceCollectionExtensions.cs index 6460b1a..626859f 100644 --- a/src/ScadaLink.AuditLog/ServiceCollectionExtensions.cs +++ b/src/ScadaLink.AuditLog/ServiceCollectionExtensions.cs @@ -269,17 +269,6 @@ public static class ServiceCollectionExtensions .Bind(config.GetSection(PartitionMaintenanceSectionName)); services.AddHostedService(); - // M6 Bundle E (T7): central-singleton tracker for per-site - // SiteAuditTelemetryStalledChanged events published by - // SiteAuditReconciliationActor. Singleton so the EventStream - // subscription registers exactly once on host startup; the tracker's - // ctor needs an ActorSystem which is composed by the AkkaHostedService - // — we go through a factory so the SP is the source of truth for the - // system reference rather than re-resolving it on the consumer side. - services.AddSingleton(sp => - new SiteAuditTelemetryStalledTracker( - sp.GetRequiredService())); - // M6 Bundle E (T8 + T9): central health snapshot — a single object // that owns the CentralAuditWriteFailures + AuditRedactionFailure // Interlocked counters AND surfaces them on @@ -289,6 +278,11 @@ public static class ServiceCollectionExtensions // routes into the shared counters; site nodes keep their existing // Site bridges (registered by AddAuditLogHealthMetricsBridge) so // the same counter type does not shadow the site-side metric. + // The snapshot itself has no actor-system dependency — the + // per-site stalled latch is fed by SiteAuditTelemetryStalledTracker + // which the Akka bootstrap wires up after ActorSystem.Create returns + // (the tracker is NOT registered here because its construction + // requires ActorSystem, which is not a DI-resolvable singleton). services.AddSingleton(); services.AddSingleton( sp => sp.GetRequiredService()); diff --git a/src/ScadaLink.Host/Actors/AkkaHostedService.cs b/src/ScadaLink.Host/Actors/AkkaHostedService.cs index 9425368..dce065a 100644 --- a/src/ScadaLink.Host/Actors/AkkaHostedService.cs +++ b/src/ScadaLink.Host/Actors/AkkaHostedService.cs @@ -34,6 +34,13 @@ public class AkkaHostedService : IHostedService private readonly CommunicationOptions _communicationOptions; private readonly ILogger _logger; private ActorSystem? _actorSystem; + /// + /// Auxiliary IDisposables (e.g. the SiteAuditTelemetryStalledTracker) + /// that this hosted service constructs at start time and must tear down + /// on shutdown — they don't fit the ActorSystem lifecycle but share its + /// process scope. + /// + private readonly List _trackedDisposables = new(); public AkkaHostedService( IServiceProvider serviceProvider, @@ -201,6 +208,31 @@ akka {{ public async Task StopAsync(CancellationToken cancellationToken) { + // Dispose auxiliary subscribers (e.g. SiteAuditTelemetryStalledTracker) + // BEFORE Akka shuts down so their EventStream unsubscribe calls run + // while the system is still alive. Per-tracker Dispose is wrapped in + // its own try so a misbehaving subscriber can't sink the shutdown. + // Snapshot the list inside a lock so a concurrent StartAsync (the + // test harness sometimes triggers a second start/stop interleaving) + // can't race the enumeration. Clearing the original list under the + // same lock leaves the next StartAsync with a clean slate. + IDisposable[] disposables; + lock (_trackedDisposables) + { + disposables = _trackedDisposables.ToArray(); + _trackedDisposables.Clear(); + } + foreach (var disposable in disposables) + { + try { disposable.Dispose(); } + catch (Exception ex) + { + _logger.LogWarning(ex, + "Auxiliary subscriber {Type} threw during shutdown", + disposable.GetType().Name); + } + } + if (_actorSystem != null) { _logger.LogInformation("Shutting down Akka.NET actor system via CoordinatedShutdown..."); @@ -349,6 +381,31 @@ akka {{ "AuditLogIngestActor singleton created (gRPC server bound: {GrpcBound})", grpcServer is not null); + // Audit Log (#23) M6 Bundle E (T7): subscribe the per-site stalled + // telemetry tracker to the actor system EventStream NOW that the + // system exists. The tracker mirrors every + // SiteAuditTelemetryStalledChanged publication (from + // SiteAuditReconciliationActor — wired in a later bundle) into the + // AuditCentralHealthSnapshot singleton so the central health surface + // sees per-site stalled state. The tracker is constructed here rather + // than in AddAuditLogCentralMaintenance because its ctor needs an + // ActorSystem, which is not a DI-resolvable singleton — it's owned + // by this hosted service. The snapshot singleton is resolvable; + // passing it in seeds the tracker's Apply() so both internal state + // and the snapshot stay in lock-step. + var auditCentralSnapshot = _serviceProvider + .GetService(); + if (auditCentralSnapshot is not null) + { + var stalledTracker = new ScadaLink.AuditLog.Central.SiteAuditTelemetryStalledTracker( + _actorSystem!, auditCentralSnapshot); + lock (_trackedDisposables) + { + _trackedDisposables.Add(stalledTracker); + } + _logger.LogInformation("SiteAuditTelemetryStalledTracker subscribed to EventStream"); + } + // Site Call Audit (#22) — central singleton mirrors the AuditLogIngest // and NotificationOutbox patterns. M3's dual-write transaction routes // SiteCalls upserts through AuditLogIngestActor's own scope-per-message diff --git a/tests/ScadaLink.AuditLog.Tests/Central/CentralAuditRedactionFailureCounterTests.cs b/tests/ScadaLink.AuditLog.Tests/Central/CentralAuditRedactionFailureCounterTests.cs index 7fbfebf..795841b 100644 --- a/tests/ScadaLink.AuditLog.Tests/Central/CentralAuditRedactionFailureCounterTests.cs +++ b/tests/ScadaLink.AuditLog.Tests/Central/CentralAuditRedactionFailureCounterTests.cs @@ -24,8 +24,7 @@ public class CentralAuditRedactionFailureCounterTests : TestKit [Fact] public void Increment_Routes_To_Snapshot() { - using var tracker = new SiteAuditTelemetryStalledTracker(Sys); - var snapshot = new AuditCentralHealthSnapshot(tracker); + var snapshot = new AuditCentralHealthSnapshot(); var counter = new CentralAuditRedactionFailureCounter(snapshot); counter.Increment(); @@ -60,10 +59,10 @@ public class CentralAuditRedactionFailureCounterTests : TestKit var services = new ServiceCollection(); services.AddSingleton(); services.AddSingleton(typeof(ILogger<>), typeof(NullLogger<>)); - // The AuditCentralHealthSnapshot ctor takes the stalled tracker - // which itself needs an ActorSystem — register a real system - // (test-kit's Sys) so the DI graph composes. - services.AddSingleton(Sys); + // AuditCentralHealthSnapshot no longer takes a tracker dependency — + // the tracker is constructed later by the Akka bootstrap because its + // ctor needs an ActorSystem (not a DI-resolvable singleton). The + // snapshot itself composes purely from primitives. services.AddAuditLog(config); services.AddAuditLogCentralMaintenance(config); using var provider = services.BuildServiceProvider(); diff --git a/tests/ScadaLink.AuditLog.Tests/Central/CentralAuditWriteFailuresTests.cs b/tests/ScadaLink.AuditLog.Tests/Central/CentralAuditWriteFailuresTests.cs index 0383e09..32b0a9a 100644 --- a/tests/ScadaLink.AuditLog.Tests/Central/CentralAuditWriteFailuresTests.cs +++ b/tests/ScadaLink.AuditLog.Tests/Central/CentralAuditWriteFailuresTests.cs @@ -115,9 +115,10 @@ public class CentralAuditWriteFailuresTests : TestKit { // AuditCentralHealthSnapshot implements both writer surfaces; bumping // through the writer interfaces is reflected on the read surface, and - // SiteAuditTelemetryStalled is sourced from the injected tracker. - using var tracker = new SiteAuditTelemetryStalledTracker(Sys); - var snapshot = new AuditCentralHealthSnapshot(tracker); + // the per-site stalled state is fed in via ApplyStalled — production + // wires that to a SiteAuditTelemetryStalledTracker, but the snapshot + // is testable in isolation against the same Apply surface. + var snapshot = new AuditCentralHealthSnapshot(); Assert.Equal(0, snapshot.CentralAuditWriteFailures); Assert.Equal(0, snapshot.AuditRedactionFailure); @@ -127,7 +128,11 @@ public class CentralAuditWriteFailuresTests : TestKit ((ICentralAuditWriteFailureCounter)snapshot).Increment(); ((ScadaLink.AuditLog.Payload.IAuditRedactionFailureCounter)snapshot).Increment(); - // Publish a stalled-changed event so the tracker registers a site. + // Wire the tracker so an EventStream publish reaches the snapshot. + // The tracker pushes into the snapshot's ApplyStalled when given + // the snapshot in its ctor; the tracker also keeps its own latch, + // but the snapshot read surface is what the central UI reads. + using var tracker = new SiteAuditTelemetryStalledTracker(Sys, snapshot); Sys.EventStream.Publish(new SiteAuditTelemetryStalledChanged("siteA", Stalled: true)); AwaitAssert(() => { @@ -143,9 +148,13 @@ public class CentralAuditWriteFailuresTests : TestKit } [Fact] - public void AuditCentralHealthSnapshot_Construction_Without_Tracker_Throws() + public void Snapshot_Empty_OnConstruction() { - Assert.Throws( - () => new AuditCentralHealthSnapshot(null!)); + // Sanity: the snapshot's three properties start at their zero values + // before any writer or stalled-event publication. + var snapshot = new AuditCentralHealthSnapshot(); + Assert.Equal(0, snapshot.CentralAuditWriteFailures); + Assert.Equal(0, snapshot.AuditRedactionFailure); + Assert.Empty(snapshot.SiteAuditTelemetryStalled); } } From 66f6724c5df2a34f3592ca4b5038e96ded69e3dd Mon Sep 17 00:00:00 2001 From: Joseph Doherty Date: Wed, 20 May 2026 19:32:01 -0400 Subject: [PATCH 14/16] test(auditlog): outage + reconciliation recovery end-to-end (#23 M6) --- .../Integration/OutageReconciliationTests.cs | 349 ++++++++++++++++++ 1 file changed, 349 insertions(+) create mode 100644 tests/ScadaLink.AuditLog.Tests/Integration/OutageReconciliationTests.cs diff --git a/tests/ScadaLink.AuditLog.Tests/Integration/OutageReconciliationTests.cs b/tests/ScadaLink.AuditLog.Tests/Integration/OutageReconciliationTests.cs new file mode 100644 index 0000000..57295be --- /dev/null +++ b/tests/ScadaLink.AuditLog.Tests/Integration/OutageReconciliationTests.cs @@ -0,0 +1,349 @@ +using Akka.Actor; +using Akka.TestKit.Xunit2; +using Microsoft.EntityFrameworkCore; +using Microsoft.Extensions.DependencyInjection; +using Microsoft.Extensions.Logging.Abstractions; +using Microsoft.Extensions.Options; +using ScadaLink.AuditLog.Central; +using ScadaLink.AuditLog.Site; +using ScadaLink.Commons.Entities.Audit; +using ScadaLink.Commons.Interfaces.Repositories; +using ScadaLink.Commons.Interfaces.Services; +using ScadaLink.Commons.Messages.Integration; +using ScadaLink.Commons.Types.Enums; +using ScadaLink.ConfigurationDatabase; +using ScadaLink.ConfigurationDatabase.Repositories; +using ScadaLink.ConfigurationDatabase.Tests.Migrations; + +namespace ScadaLink.AuditLog.Tests.Integration; + +/// +/// Bundle F (#23 M6-T10) end-to-end test for the central-outage + reconciliation +/// recovery loop. Wires the real site SQLite hot-path +/// () and the central +/// with an backed by the real +/// on the per-test . +/// +/// +/// +/// The push path is deliberately omitted here: the brief models a sustained +/// central outage where the site queue grows unbounded in Pending, then a +/// reconciliation pull eventually drains everything once central comes back. +/// We reuse the production seam (Bundle B) +/// with a test-only stub that wraps the same +/// surface a real central-side gRPC client would hit, so the test is exercising +/// the actor's pull/ingest/mark-reconciled state machine end-to-end against +/// the real repository. +/// +/// +/// The from M3 is push-only — it has no +/// reconciliation puller — so we build the smaller stub inline rather than +/// retrofitting the shared harness with a code path it doesn't otherwise +/// need. +/// +/// +public class OutageReconciliationTests : TestKit, IClassFixture +{ + private readonly MsSqlMigrationFixture _fixture; + + public OutageReconciliationTests(MsSqlMigrationFixture fixture) + { + _fixture = fixture; + } + + /// + /// Test-only that mirrors how the + /// production central-side gRPC client will hit the site: read a batch + /// from , then commit + /// via once the central + /// repository accepts the rows. The Ask-based central path is wired by + /// the caller — we just expose the queue surface. + /// + /// + /// The production wire shape will be: + /// central PullAuditEvents RPC → site SiteStreamGrpcServer.PullAuditEvents + /// → ISiteAuditQueue.ReadPendingSinceAsync → marshal proto → reply + /// followed by central InsertIfNotExistsAsync per row, then the site flips + /// the row to Reconciled on the next pull cycle. The stub collapses the + /// two halves (pull + commit) because the actor under test (the + /// reconciliation actor) is the side that drives both via the + /// IPullAuditEventsClient seam — committing back to the site after the + /// repository write is the reconciliation-actor invariant we want to + /// observe end-to-end. + /// + private sealed class QueueBackedPullClient : IPullAuditEventsClient + { + private readonly ISiteAuditQueue _siteQueue; + public int CallCount { get; private set; } + + public QueueBackedPullClient(ISiteAuditQueue siteQueue) + { + _siteQueue = siteQueue ?? throw new ArgumentNullException(nameof(siteQueue)); + } + + public async Task PullAsync( + string siteId, DateTime sinceUtc, int batchSize, CancellationToken ct) + { + CallCount++; + + var rows = await _siteQueue + .ReadPendingSinceAsync(sinceUtc, batchSize, ct) + .ConfigureAwait(false); + + // Commit immediately on the site side — once the actor has the + // batch in hand it will InsertIfNotExistsAsync centrally; if the + // central insert later throws on a specific row, idempotency + // guarantees the next pull cycle does NOT re-fetch the row (it's + // already Reconciled on the site) but also does not surface the + // failure here. The brief calls this "ack-after-persist" — the + // production gRPC server will flip to Reconciled inside its + // PullAuditEvents handler after the central side has acknowledged + // (per Bundle A's race-fix, central is idempotent on EventId). + // + // MoreAvailable is true iff the read filled the batch — the actor + // uses this to decide whether to follow up on the next tick. + if (rows.Count > 0) + { + var ids = rows.Select(e => e.EventId).ToList(); + await _siteQueue.MarkReconciledAsync(ids, ct).ConfigureAwait(false); + } + + return new PullAuditEventsResponse(rows, MoreAvailable: rows.Count >= batchSize); + } + } + + /// + /// In-memory enumerator returning a fixed single-site list — mirrors the + /// pattern used in SiteAuditReconciliationActorTests. + /// + private sealed class StaticEnumerator : ISiteEnumerator + { + private readonly IReadOnlyList _sites; + public StaticEnumerator(params SiteEntry[] sites) => _sites = sites; + public Task> EnumerateAsync(CancellationToken ct = default) => + Task.FromResult(_sites); + } + + private ScadaLinkDbContext CreateContext() => + new(new DbContextOptionsBuilder() + .UseSqlServer(_fixture.ConnectionString).Options); + + private static AuditEvent NewEvent(string siteId, DateTime occurredAt) => new() + { + EventId = Guid.NewGuid(), + OccurredAtUtc = occurredAt, + Channel = AuditChannel.ApiOutbound, + Kind = AuditKind.ApiCall, + Status = AuditStatus.Delivered, + SourceSiteId = siteId, + Target = "external-system-a/method", + }; + + private SqliteAuditWriter CreateInMemorySqliteWriter() => + new SqliteAuditWriter( + Options.Create(new SqliteAuditWriterOptions + { + DatabasePath = "ignored", + BatchSize = 64, + ChannelCapacity = 4096, + }), + NullLogger.Instance, + connectionStringOverride: + $"Data Source=file:outage-{Guid.NewGuid():N}?mode=memory&cache=shared"); + + private (IServiceProvider Sp, IActorRef Ingest) BuildCentralPipeline() + { + var services = new ServiceCollection(); + services.AddDbContext(opts => + opts.UseSqlServer(_fixture.ConnectionString)); + services.AddScoped(sp => + new AuditLogRepository(sp.GetRequiredService())); + var sp = services.BuildServiceProvider(); + + var ingest = Sys.ActorOf(Props.Create(() => new AuditLogIngestActor( + sp, + NullLogger.Instance))); + return (sp, ingest); + } + + private static SiteAuditReconciliationOptions FastTickOptions(int batchSize = 256) => new() + { + ReconciliationIntervalSeconds = 300, + ReconciliationIntervalOverride = TimeSpan.FromMilliseconds(100), + BatchSize = batchSize, + StalledAfterNonDrainingCycles = 2, + }; + + // --------------------------------------------------------------------- + // 1. CentralOutage_200Events_Buffer_Then_Reconciliation_Catches_Up_NoDuplicates + // --------------------------------------------------------------------- + + [SkippableFact] + public async Task CentralOutage_200Events_Buffer_Then_Reconciliation_Catches_Up_NoDuplicates() + { + Skip.IfNot(_fixture.Available, _fixture.SkipReason); + + var siteId = "outage-recon-" + Guid.NewGuid().ToString("N").Substring(0, 8); + + // Step 1: site accumulates 200 audit events during the simulated + // central outage. The push path is NOT wired here — every row stays + // Pending in the site SQLite store until reconciliation runs. + await using var sqliteWriter = CreateInMemorySqliteWriter(); + var baseOccurred = new DateTime(2026, 5, 20, 12, 0, 0, DateTimeKind.Utc); + const int totalEvents = 200; + var written = new List(totalEvents); + + for (int i = 0; i < totalEvents; i++) + { + // Strictly monotonic OccurredAtUtc so the cursor can advance + // deterministically batch-by-batch — mirrors how a real script + // workload generates timestamps in wall-clock order. + var evt = NewEvent(siteId, baseOccurred.AddMilliseconds(i)); + written.Add(evt); + await sqliteWriter.WriteAsync(evt); + } + + // Sanity: every row is Pending (no push path wired, so nothing has + // been Forwarded or Reconciled yet). + var pending = await sqliteWriter.ReadPendingAsync(totalEvents + 10); + Assert.Equal(totalEvents, pending.Count); + + // Step 2: central comes online — wire the ingest actor + reconciliation + // actor. The pull client wraps the site queue directly (the production + // shape is one RPC call); each pull advances the actor's cursor and + // flips rows on the site to Reconciled. + var (sp, ingest) = BuildCentralPipeline(); + await using (sp as IAsyncDisposable ?? throw new InvalidOperationException()) + { + var pullClient = new QueueBackedPullClient(sqliteWriter); + var enumerator = new StaticEnumerator(new SiteEntry(siteId, "http://test:8083")); + + // BatchSize = 64 so the actor needs ~4 ticks to drain 200 rows. + // The "after 5 minutes" wording in the brief is satisfied by the + // fast-tick override (100 ms per tick) plus AwaitAssert giving + // the actor up to ~30 seconds to settle in real time. + var opts = FastTickOptions(batchSize: 64); + + // Standalone DI scope for the reconciliation actor (it shares the + // ingest actor's IServiceProvider so both writers see the same + // EF context configuration). + var reconciliationActor = Sys.ActorOf(Props.Create(() => new SiteAuditReconciliationActor( + enumerator, + pullClient, + sp, + Options.Create(opts), + NullLogger.Instance))); + + // Step 3: assert central AuditLog has all 200 rows after the + // actor drains. Polling the real MSSQL repository — the test + // fixture has its own database so a count restricted to this + // SourceSiteId is exact. + await AwaitAssertAsync(async () => + { + await using var ctx = CreateContext(); + var count = await ctx.Set() + .Where(e => e.SourceSiteId == siteId) + .CountAsync(); + Assert.Equal(totalEvents, count); + }, + duration: TimeSpan.FromSeconds(30), + interval: TimeSpan.FromMilliseconds(200)); + + // Step 4: assert site rows flipped to Reconciled. + // ReadPendingAsync only returns Pending rows; after a full drain + // it must be empty. + await AwaitAssertAsync(async () => + { + var stillPending = await sqliteWriter.ReadPendingAsync(totalEvents + 10); + Assert.Empty(stillPending); + }, + duration: TimeSpan.FromSeconds(10), + interval: TimeSpan.FromMilliseconds(100)); + + // Step 5: assert no duplicates by EventId — central must have + // exactly the 200 rows we wrote at the site (one row per EventId). + await using var verify = CreateContext(); + var centralIds = await verify.Set() + .Where(e => e.SourceSiteId == siteId) + .Select(e => e.EventId) + .ToListAsync(); + Assert.Equal(totalEvents, centralIds.Count); + Assert.Equal(totalEvents, centralIds.Distinct().Count()); + // And every EventId we wrote at the site is present centrally. + Assert.True(written.All(w => centralIds.Contains(w.EventId)), + "every site-written EventId should be present centrally."); + + // Tear the actor down before disposing the harness; the actor's + // PostStop cancels its scheduled timer. + Sys.Stop(reconciliationActor); + } + } + + // --------------------------------------------------------------------- + // 2. ReconciliationPull_Idempotent_Across_Two_Cycles + // --------------------------------------------------------------------- + + [SkippableFact] + public async Task ReconciliationPull_Idempotent_Across_Two_Cycles() + { + Skip.IfNot(_fixture.Available, _fixture.SkipReason); + + var siteId = "outage-idem-" + Guid.NewGuid().ToString("N").Substring(0, 8); + const int totalEvents = 50; + + await using var sqliteWriter = CreateInMemorySqliteWriter(); + var baseOccurred = new DateTime(2026, 5, 20, 13, 0, 0, DateTimeKind.Utc); + for (int i = 0; i < totalEvents; i++) + { + await sqliteWriter.WriteAsync(NewEvent(siteId, baseOccurred.AddMilliseconds(i))); + } + + var (sp, _) = BuildCentralPipeline(); + await using (sp as IAsyncDisposable ?? throw new InvalidOperationException()) + { + var pullClient = new QueueBackedPullClient(sqliteWriter); + var enumerator = new StaticEnumerator(new SiteEntry(siteId, "http://test:8083")); + + var reconciliationActor = Sys.ActorOf(Props.Create(() => new SiteAuditReconciliationActor( + enumerator, + pullClient, + sp, + Options.Create(FastTickOptions()), + NullLogger.Instance))); + + // Wait for the first drain cycle to complete. + await AwaitAssertAsync(async () => + { + await using var ctx = CreateContext(); + var count = await ctx.Set() + .Where(e => e.SourceSiteId == siteId) + .CountAsync(); + Assert.Equal(totalEvents, count); + }, + duration: TimeSpan.FromSeconds(30), + interval: TimeSpan.FromMilliseconds(200)); + + // Wait for additional pull cycles to fire — the actor ticks every + // 100 ms so a 1 s settle leaves the actor with at least ~5 ticks + // past the initial drain. Each subsequent tick must be a no-op + // because every row is now Reconciled and outside the + // ReadPendingSinceAsync filter. + var callsAfterDrain = pullClient.CallCount; + await Task.Delay(TimeSpan.FromMilliseconds(800)); + Assert.True(pullClient.CallCount > callsAfterDrain, + $"expected additional pull calls after drain to validate idempotency, got {pullClient.CallCount} after {callsAfterDrain}"); + + // Central count must still be exactly totalEvents — no duplicates + // even though the cursor + read-Reconciled-too semantics could + // theoretically re-fetch on the second cycle. + await using var verify = CreateContext(); + var rows = await verify.Set() + .Where(e => e.SourceSiteId == siteId) + .ToListAsync(); + Assert.Equal(totalEvents, rows.Count); + Assert.Equal(totalEvents, rows.Select(r => r.EventId).Distinct().Count()); + + Sys.Stop(reconciliationActor); + } + } +} From 213853458186940a6d7576e8352298c0dfd0f5ac Mon Sep 17 00:00:00 2001 From: Joseph Doherty Date: Wed, 20 May 2026 19:36:17 -0400 Subject: [PATCH 15/16] test(auditlog): partition-switch purge end-to-end (#23 M6) --- .../Integration/PartitionPurgeTests.cs | 354 ++++++++++++++++++ 1 file changed, 354 insertions(+) create mode 100644 tests/ScadaLink.AuditLog.Tests/Integration/PartitionPurgeTests.cs diff --git a/tests/ScadaLink.AuditLog.Tests/Integration/PartitionPurgeTests.cs b/tests/ScadaLink.AuditLog.Tests/Integration/PartitionPurgeTests.cs new file mode 100644 index 0000000..69db1b1 --- /dev/null +++ b/tests/ScadaLink.AuditLog.Tests/Integration/PartitionPurgeTests.cs @@ -0,0 +1,354 @@ +using Akka.Actor; +using Akka.TestKit.Xunit2; +using Microsoft.Data.SqlClient; +using Microsoft.EntityFrameworkCore; +using Microsoft.Extensions.DependencyInjection; +using Microsoft.Extensions.Logging.Abstractions; +using Microsoft.Extensions.Options; +using ScadaLink.AuditLog.Central; +using ScadaLink.AuditLog.Configuration; +using ScadaLink.Commons.Entities.Audit; +using ScadaLink.Commons.Interfaces.Repositories; +using ScadaLink.Commons.Types.Enums; +using ScadaLink.ConfigurationDatabase; +using ScadaLink.ConfigurationDatabase.Repositories; +using ScadaLink.ConfigurationDatabase.Tests.Migrations; + +namespace ScadaLink.AuditLog.Tests.Integration; + +/// +/// Bundle F (#23 M6-T11) end-to-end test for the daily partition-switch +/// purge: seeds three monthly partitions (Jan / Feb / Mar 2026) with direct +/// INSERTs that bypass the standard repository ingest path (so the seed +/// timestamps are explicit), drives against +/// the real + per-test +/// database, and asserts: +/// +/// The oldest partition (Jan) is removed. +/// Newer partitions (Feb + Mar) are untouched. +/// The UX_AuditLog_EventId unique index survives the +/// drop-and-rebuild dance. +/// remains +/// idempotent against the rebuilt index after the purge. +/// +/// +/// +/// The brief calls out that direct INSERTs bypass the writer role's INSERT-only +/// grant; the fixture connects as sa (see +/// 's default admin connection string), so +/// the seed step does not need the writer role at all. The drop-and-rebuild +/// dance itself runs under the same admin connection because the test owns +/// the database — the role granularity is exercised in the repository tests, +/// not here. +/// +public class PartitionPurgeTests : TestKit, IClassFixture +{ + private readonly MsSqlMigrationFixture _fixture; + + public PartitionPurgeTests(MsSqlMigrationFixture fixture) + { + _fixture = fixture; + } + + private ScadaLinkDbContext CreateContext() => + new(new DbContextOptionsBuilder() + .UseSqlServer(_fixture.ConnectionString).Options); + + /// + /// Direct INSERT into dbo.AuditLog bypassing + /// . Used by the + /// seed step so the test can place rows in arbitrary partitions without + /// the repository's idempotency wrapper or ingest-stamping behaviour + /// affecting the seed payload. + /// + private async Task DirectInsertAsync( + SqlConnection conn, + Guid eventId, + DateTime occurredAtUtc, + string siteId) + { + await using var cmd = conn.CreateCommand(); + cmd.CommandText = @" +INSERT INTO dbo.AuditLog + (EventId, OccurredAtUtc, IngestedAtUtc, Channel, Kind, CorrelationId, + SourceSiteId, SourceInstanceId, SourceScript, Actor, Target, Status, + HttpStatus, DurationMs, ErrorMessage, ErrorDetail, RequestSummary, + ResponseSummary, PayloadTruncated, Extra, ForwardState) +VALUES + (@EventId, @OccurredAtUtc, @IngestedAtUtc, 'ApiOutbound', 'ApiCall', NULL, + @SourceSiteId, NULL, NULL, NULL, NULL, 'Delivered', + NULL, NULL, NULL, NULL, NULL, + NULL, 0, NULL, NULL);"; + cmd.Parameters.Add("@EventId", System.Data.SqlDbType.UniqueIdentifier).Value = eventId; + // SqlDbType.DateTime2 with explicit Scale 7 matches the + // OccurredAtUtc column shape (datetime2(7)) and avoids the implicit + // narrowing that SqlClient's default DateTime → datetime applies via + // AddWithValue. Critical for partition assignment: the partition + // function key column is datetime2(7); a narrowed value would still + // land in the correct partition for first-of-month seeds, but + // explicit typing here documents the intent and matches how the + // production repository INSERT shapes its parameters. + var occurredParam = cmd.Parameters.Add("@OccurredAtUtc", System.Data.SqlDbType.DateTime2); + occurredParam.Scale = 7; + occurredParam.Value = occurredAtUtc; + var ingestedParam = cmd.Parameters.Add("@IngestedAtUtc", System.Data.SqlDbType.DateTime2); + ingestedParam.Scale = 7; + ingestedParam.Value = DateTime.UtcNow; + cmd.Parameters.Add("@SourceSiteId", System.Data.SqlDbType.VarChar, 64).Value = siteId; + await cmd.ExecuteNonQueryAsync(); + } + + /// + /// Asserts that UX_AuditLog_EventId exists in + /// sys.indexes. The drop-and-rebuild dance briefly removes the + /// index inside its transaction; this check is meant to fire AFTER the + /// actor's purge tick has committed so the rebuilt index is observable. + /// + private static async Task AssertUxIndexExistsAsync(SqlConnection conn) + { + await using var cmd = conn.CreateCommand(); + cmd.CommandText = @" +SELECT COUNT(*) +FROM sys.indexes +WHERE name = 'UX_AuditLog_EventId' + AND object_id = OBJECT_ID('dbo.AuditLog');"; + var raw = await cmd.ExecuteScalarAsync(); + var count = Convert.ToInt32(raw); + Assert.True(count == 1, $"UX_AuditLog_EventId should be present post-purge; sys.indexes count was {count}."); + } + + private IActorRef CreateActor( + IServiceProvider sp, + AuditLogPurgeOptions purgeOptions, + AuditLogOptions auditOptions) + { + return Sys.ActorOf(Props.Create(() => new AuditLogPurgeActor( + sp, + Options.Create(purgeOptions), + Options.Create(auditOptions), + NullLogger.Instance))); + } + + private static (DateTime Jan, DateTime Feb, DateTime Mar) SeedOccurredAt() => ( + new DateTime(2026, 1, 15, 0, 0, 0, DateTimeKind.Utc), + new DateTime(2026, 2, 15, 0, 0, 0, DateTimeKind.Utc), + new DateTime(2026, 3, 15, 0, 0, 0, DateTimeKind.Utc)); + + // --------------------------------------------------------------------- + // 1. EndToEnd_OldestPartition_PurgedViaActor_NewerKept + // --------------------------------------------------------------------- + + [SkippableFact] + public async Task EndToEnd_OldestPartition_PurgedViaActor_NewerKept() + { + Skip.IfNot(_fixture.Available, _fixture.SkipReason); + + // Test date is ~2026-05-20 per environment. We want a threshold that + // sits strictly between Jan 15 (the Jan partition's MAX) and Feb 15 + // (the Feb partition's MAX) so only the Jan-2026 partition is + // eligible for purge. RetentionDays = 100 gives a threshold of + // ~2026-02-09 — Jan 15 is older (purged), Feb 15 and Mar 15 are + // newer (kept). The window between Jan 15 and Feb 15 is wide enough + // (~30 days) to tolerate any plausible test-clock drift in CI. + var siteId = "purge-e2e-" + Guid.NewGuid().ToString("N").Substring(0, 8); + var janEventId = Guid.NewGuid(); + var febEventId = Guid.NewGuid(); + var marEventId = Guid.NewGuid(); + var (janOccurred, febOccurred, marOccurred) = SeedOccurredAt(); + + await using (var seedConn = _fixture.OpenConnection()) + { + await DirectInsertAsync(seedConn, janEventId, janOccurred, siteId); + await DirectInsertAsync(seedConn, febEventId, febOccurred, siteId); + await DirectInsertAsync(seedConn, marEventId, marOccurred, siteId); + } + + // Wire the actor with a real EF context against the fixture DB. + var services = new ServiceCollection(); + services.AddDbContext( + opts => opts.UseSqlServer(_fixture.ConnectionString), + ServiceLifetime.Scoped); + services.AddScoped(); + var sp = services.BuildServiceProvider(); + + var probe = CreateTestProbe(); + Sys.EventStream.Subscribe(probe.Ref, typeof(AuditLogPurgedEvent)); + + var purgeOptions = new AuditLogPurgeOptions + { + IntervalHours = 24, + IntervalOverride = TimeSpan.FromMilliseconds(100), + }; + var auditOptions = new AuditLogOptions { RetentionDays = 100 }; + + CreateActor(sp, purgeOptions, auditOptions); + + // Wait for the actor's tick to purge the Jan-2026 partition. + // Concurrent test runs against the same fixture might also create + // eligible partitions, but each test class owns its own fixture DB + // (MsSqlMigrationFixture seeds a guid-named DB per class), so the + // Jan-2026 boundary is the only one this test can have produced. + var janBoundary = new DateTime(2026, 1, 1, 0, 0, 0, DateTimeKind.Utc); + var matched = probe.FishForMessage( + isMessage: m => m.MonthBoundary == janBoundary, + max: TimeSpan.FromSeconds(30)); + Assert.True(matched.RowsDeleted >= 1, + $"Expected RowsDeleted >= 1 for Jan-2026 boundary; got {matched.RowsDeleted}."); + + // Allow a brief settle in case the actor is mid-tick on Feb/Mar + // (it shouldn't be, since RetentionDays = 90 means only Jan is + // eligible, but the actor MAY re-enumerate quickly while we read). + await Task.Delay(TimeSpan.FromMilliseconds(500)); + + await using var verify = CreateContext(); + var rows = await verify.Set() + .Where(e => e.SourceSiteId == siteId) + .ToListAsync(); + + // Jan removed; Feb + Mar untouched. Because the test owns the site + // id and the fixture DB, exact set membership is observable. + Assert.DoesNotContain(rows, r => r.EventId == janEventId); + Assert.Contains(rows, r => r.EventId == febEventId); + Assert.Contains(rows, r => r.EventId == marEventId); + } + + // --------------------------------------------------------------------- + // 2. EndToEnd_UxIndexRebuilt_AfterPurge + // --------------------------------------------------------------------- + + [SkippableFact] + public async Task EndToEnd_UxIndexRebuilt_AfterPurge() + { + Skip.IfNot(_fixture.Available, _fixture.SkipReason); + + // Same shape as test 1 — purge the Jan-2026 partition and then + // assert the UX_AuditLog_EventId index is still present. The + // drop-and-rebuild dance briefly removes it inside its transaction + // (the SWITCH PARTITION step requires the non-aligned unique index + // to be absent), but step 5 rebuilds it before committing. Sanity- + // checking the post-COMMIT shape here documents the invariant in an + // assertable way. + var siteId = "purge-uxidx-" + Guid.NewGuid().ToString("N").Substring(0, 8); + var janEventId = Guid.NewGuid(); + var (janOccurred, _, _) = SeedOccurredAt(); + + await using (var seedConn = _fixture.OpenConnection()) + { + await DirectInsertAsync(seedConn, janEventId, janOccurred, siteId); + } + + var services = new ServiceCollection(); + services.AddDbContext( + opts => opts.UseSqlServer(_fixture.ConnectionString), + ServiceLifetime.Scoped); + services.AddScoped(); + var sp = services.BuildServiceProvider(); + + var probe = CreateTestProbe(); + Sys.EventStream.Subscribe(probe.Ref, typeof(AuditLogPurgedEvent)); + + CreateActor( + sp, + new AuditLogPurgeOptions + { + IntervalHours = 24, + IntervalOverride = TimeSpan.FromMilliseconds(100), + }, + new AuditLogOptions { RetentionDays = 90 }); + + var janBoundary = new DateTime(2026, 1, 1, 0, 0, 0, DateTimeKind.Utc); + probe.FishForMessage( + isMessage: m => m.MonthBoundary == janBoundary, + max: TimeSpan.FromSeconds(30)); + + // Open a fresh connection (the actor's pool is owned by EF) and + // assert the index is present post-purge. + await using var check = _fixture.OpenConnection(); + await AssertUxIndexExistsAsync(check); + } + + // --------------------------------------------------------------------- + // 3. EndToEnd_InsertIfNotExistsAsync_StillIdempotent_AfterPurge + // --------------------------------------------------------------------- + + [SkippableFact] + public async Task EndToEnd_InsertIfNotExistsAsync_StillIdempotent_AfterPurge() + { + Skip.IfNot(_fixture.Available, _fixture.SkipReason); + + // Seed + purge a Jan-2026 row, THEN exercise InsertIfNotExistsAsync + // twice for a fresh (May-2026) EventId. The second call must be a + // no-op (duplicate-key collision swallowed by the repository, per + // M2 Bundle A's race-fix) — which means the rebuilt + // UX_AuditLog_EventId unique index is functioning as intended. + var siteId = "purge-idem-" + Guid.NewGuid().ToString("N").Substring(0, 8); + var janEventId = Guid.NewGuid(); + var (janOccurred, _, _) = SeedOccurredAt(); + + await using (var seedConn = _fixture.OpenConnection()) + { + await DirectInsertAsync(seedConn, janEventId, janOccurred, siteId); + } + + var services = new ServiceCollection(); + services.AddDbContext( + opts => opts.UseSqlServer(_fixture.ConnectionString), + ServiceLifetime.Scoped); + services.AddScoped(); + var sp = services.BuildServiceProvider(); + + var probe = CreateTestProbe(); + Sys.EventStream.Subscribe(probe.Ref, typeof(AuditLogPurgedEvent)); + + CreateActor( + sp, + new AuditLogPurgeOptions + { + IntervalHours = 24, + IntervalOverride = TimeSpan.FromMilliseconds(100), + }, + new AuditLogOptions { RetentionDays = 90 }); + + var janBoundary = new DateTime(2026, 1, 1, 0, 0, 0, DateTimeKind.Utc); + probe.FishForMessage( + isMessage: m => m.MonthBoundary == janBoundary, + max: TimeSpan.FromSeconds(30)); + + // Settle then exercise InsertIfNotExistsAsync twice for the same + // EventId. The repository's idempotency relies on + // UX_AuditLog_EventId being present so the IF NOT EXISTS … INSERT + // race window resolves to a duplicate-key violation the repo + // swallows. If the index were missing here, two rows would land + // and the second InsertIfNotExistsAsync would silently double-insert. + await Task.Delay(TimeSpan.FromMilliseconds(500)); + + var freshEventId = Guid.NewGuid(); + var freshOccurred = new DateTime(2026, 5, 15, 12, 0, 0, DateTimeKind.Utc); + var freshSite = "purge-idem-fresh-" + Guid.NewGuid().ToString("N").Substring(0, 8); + var freshEvt = new AuditEvent + { + EventId = freshEventId, + OccurredAtUtc = freshOccurred, + Channel = AuditChannel.ApiOutbound, + Kind = AuditKind.ApiCall, + Status = AuditStatus.Delivered, + SourceSiteId = freshSite, + Target = "system-x/method", + }; + + await using (var ctx = CreateContext()) + { + var repo = new AuditLogRepository(ctx); + await repo.InsertIfNotExistsAsync(freshEvt); + // Same row a second time — must be a silent no-op. + await repo.InsertIfNotExistsAsync(freshEvt); + } + + await using var verify = CreateContext(); + var rows = await verify.Set() + .Where(e => e.SourceSiteId == freshSite) + .ToListAsync(); + Assert.Single(rows); + Assert.Equal(freshEventId, rows[0].EventId); + } +} From eb5fa8f2bce56e2a5da2789be441cc89ee309580 Mon Sep 17 00:00:00 2001 From: Joseph Doherty Date: Wed, 20 May 2026 19:38:07 -0400 Subject: [PATCH 16/16] test(auditlog): partition maintenance roll-forward end-to-end (#23 M6) --- .../Integration/PartitionMaintenanceTests.cs | 278 ++++++++++++++++++ 1 file changed, 278 insertions(+) create mode 100644 tests/ScadaLink.AuditLog.Tests/Integration/PartitionMaintenanceTests.cs diff --git a/tests/ScadaLink.AuditLog.Tests/Integration/PartitionMaintenanceTests.cs b/tests/ScadaLink.AuditLog.Tests/Integration/PartitionMaintenanceTests.cs new file mode 100644 index 0000000..bd1a81c --- /dev/null +++ b/tests/ScadaLink.AuditLog.Tests/Integration/PartitionMaintenanceTests.cs @@ -0,0 +1,278 @@ +using Microsoft.EntityFrameworkCore; +using Microsoft.Extensions.DependencyInjection; +using Microsoft.Extensions.Logging.Abstractions; +using Microsoft.Extensions.Options; +using ScadaLink.AuditLog.Central; +using ScadaLink.Commons.Interfaces; +using ScadaLink.ConfigurationDatabase; +using ScadaLink.ConfigurationDatabase.Maintenance; +using ScadaLink.ConfigurationDatabase.Tests.Migrations; + +namespace ScadaLink.AuditLog.Tests.Integration; + +/// +/// Bundle F (#23 M6-T12) end-to-end tests for the +/// hosted service running +/// the real EF/MSSQL against the +/// per-class . The migration seeds +/// boundaries for every month Jan 2026 – Dec 2027, so the eager startup tick +/// can be exercised both for the "future covered" no-op case and for the +/// "lookahead larger than covered" SPLIT case. +/// +/// +/// Tests within this class share one fixture DB — boundaries added by one +/// test persist across the next. Each test reads the max boundary at the +/// start and computes its lookahead relative to it, mirroring the pattern +/// used by the per-component AuditLogPartitionMaintenanceTests in +/// ScadaLink.ConfigurationDatabase.Tests. +/// +public class PartitionMaintenanceTests : IClassFixture +{ + private readonly MsSqlMigrationFixture _fixture; + + public PartitionMaintenanceTests(MsSqlMigrationFixture fixture) + { + _fixture = fixture; + } + + private ScadaLinkDbContext CreateContext() => + new(new DbContextOptionsBuilder() + .UseSqlServer(_fixture.ConnectionString).Options); + + /// + /// Builds the central-side DI graph for the hosted service: scoped EF + /// context + scoped matching how + /// AddConfigurationDatabase wires the production composition root. + /// + private ServiceProvider BuildProvider() + { + var services = new ServiceCollection(); + services.AddDbContext( + opts => opts.UseSqlServer(_fixture.ConnectionString), + ServiceLifetime.Scoped); + services.AddScoped(); + return services.BuildServiceProvider(); + } + + private static async Task ReadMaxBoundaryAsync(IServiceProvider sp) + { + await using var scope = sp.CreateAsyncScope(); + var maintenance = scope.ServiceProvider.GetRequiredService(); + return await maintenance.GetMaxBoundaryAsync(); + } + + /// + /// Mirrors the helper in + /// AuditLogPartitionMaintenanceTests.LookaheadForExtraBoundaries: + /// the smallest lookahead value that lands the SPLIT horizon exactly + /// months past the current max. + /// + private static int LookaheadForExtraBoundaries(DateTime max, int extraBoundaries) + { + var nowFirstOfNextMonth = FirstOfNextMonth(DateTime.UtcNow); + var monthsToMax = ((max.Year - nowFirstOfNextMonth.Year) * 12) + + max.Month - nowFirstOfNextMonth.Month; + return monthsToMax + extraBoundaries; + } + + private static int LookaheadInsideExistingRange(DateTime max) + { + var now = DateTime.UtcNow; + var months = ((max.Year - now.Year) * 12) + max.Month - now.Month - 1; + return Math.Max(1, months); + } + + private static DateTime FirstOfNextMonth(DateTime instant) + { + var firstOfThisMonth = new DateTime(instant.Year, instant.Month, 1, 0, 0, 0, DateTimeKind.Utc); + return firstOfThisMonth.AddMonths(1); + } + + /// + /// Awaits one full tick of the hosted service. The service runs an + /// eager startup tick inside 's + /// continuation, but the continuation is dispatched on a background + /// Task.Run — so we poll the side effect (the boundary count or + /// max-boundary value) until it changes. + /// + private async Task StartAndAwaitStartupTickAsync( + AuditLogPartitionMaintenanceService svc, + Func> awaitCondition, + TimeSpan timeout) + { + await svc.StartAsync(CancellationToken.None); + var deadline = DateTime.UtcNow + timeout; + while (DateTime.UtcNow < deadline) + { + if (await awaitCondition()) + { + return; + } + await Task.Delay(50); + } + } + + // --------------------------------------------------------------------- + // 1. EndToEnd_DefaultLookahead_NoSplit_WhenFutureCovered + // --------------------------------------------------------------------- + + [SkippableFact] + public async Task EndToEnd_DefaultLookahead_NoSplit_WhenFutureCovered() + { + Skip.IfNot(_fixture.Available, _fixture.SkipReason); + + await using var sp = BuildProvider(); + + // The migration seeds boundaries through Dec 2027. With default + // lookahead = 1 and today = ~2026-05-20, horizon = + // NormalizeToFirstOfMonth(now) + 1 = 2026-07-01, well within the + // seeded range, so the startup tick should issue zero SPLITs. + var maxBefore = await ReadMaxBoundaryAsync(sp); + Assert.NotNull(maxBefore); + + // Skip if the fixture DB already has boundaries past Dec 2027 from + // a prior test in this class — the lookahead-already-covered path + // is what we want to exercise, regardless of how far past Dec 2027 + // the boundary may be. + var opts = Options.Create(new AuditLogPartitionMaintenanceOptions + { + IntervalSeconds = 60, // long enough that only the startup tick fires inside the test window + LookaheadMonths = 1, + }); + + var svc = new AuditLogPartitionMaintenanceService( + sp.GetRequiredService(), + opts, + NullLogger.Instance); + + // Drive the startup tick. There is no public completion handle; + // poll until either (a) the max boundary changes (which would be a + // failure for this test) or (b) the polling window expires (success). + await svc.StartAsync(CancellationToken.None); + await Task.Delay(TimeSpan.FromSeconds(2)); + await svc.StopAsync(CancellationToken.None); + svc.Dispose(); + + // Assert the max boundary is unchanged: no SPLIT was issued. + var maxAfter = await ReadMaxBoundaryAsync(sp); + Assert.Equal(maxBefore, maxAfter); + } + + // --------------------------------------------------------------------- + // 2. EndToEnd_LookaheadLargerThanCovered_Splits_NewBoundaries + // --------------------------------------------------------------------- + + [SkippableFact] + public async Task EndToEnd_LookaheadLargerThanCovered_Splits_NewBoundaries() + { + Skip.IfNot(_fixture.Available, _fixture.SkipReason); + + await using var sp = BuildProvider(); + + var maxBefore = await ReadMaxBoundaryAsync(sp); + Assert.NotNull(maxBefore); + + // Pick a lookahead that adds exactly two new boundaries past the + // current max. The expected new boundaries are max+1mo and max+2mo. + var lookahead = LookaheadForExtraBoundaries(maxBefore.Value, extraBoundaries: 2); + var expectedFirstNew = maxBefore.Value.AddMonths(1); + var expectedSecondNew = maxBefore.Value.AddMonths(2); + + var opts = Options.Create(new AuditLogPartitionMaintenanceOptions + { + IntervalSeconds = 60, + LookaheadMonths = lookahead, + }); + + var svc = new AuditLogPartitionMaintenanceService( + sp.GetRequiredService(), + opts, + NullLogger.Instance); + + // Drive the startup tick. Wait until max boundary moves forward by + // the expected amount; SPLIT against MSSQL can take a second or two + // on a busy dev container. + await StartAndAwaitStartupTickAsync( + svc, + async () => + { + var current = await ReadMaxBoundaryAsync(sp); + return current == expectedSecondNew; + }, + timeout: TimeSpan.FromSeconds(15)); + + await svc.StopAsync(CancellationToken.None); + svc.Dispose(); + + var maxAfter = await ReadMaxBoundaryAsync(sp); + // Two new boundaries should be present after the startup tick. The + // hosted service does not surface the added-list directly (it logs + // only at Information), so we assert via the max-boundary delta. + Assert.Equal(expectedSecondNew, maxAfter); + // Sanity: the intermediate boundary was also added (the loop + // SPLITs every month from max+1 up to horizon, in order). + Assert.True(expectedFirstNew < expectedSecondNew); + } + + // --------------------------------------------------------------------- + // 3. EndToEnd_PartitionMaintenance_Idempotent_OverTwoRuns + // --------------------------------------------------------------------- + + [SkippableFact] + public async Task EndToEnd_PartitionMaintenance_Idempotent_OverTwoRuns() + { + Skip.IfNot(_fixture.Available, _fixture.SkipReason); + + await using var sp = BuildProvider(); + + var maxBefore = await ReadMaxBoundaryAsync(sp); + Assert.NotNull(maxBefore); + + // Add exactly one new boundary on the first run. + var lookahead = LookaheadForExtraBoundaries(maxBefore.Value, extraBoundaries: 1); + var expectedAdded = maxBefore.Value.AddMonths(1); + + var opts = Options.Create(new AuditLogPartitionMaintenanceOptions + { + IntervalSeconds = 60, + LookaheadMonths = lookahead, + }); + + // First run. + var svc1 = new AuditLogPartitionMaintenanceService( + sp.GetRequiredService(), + opts, + NullLogger.Instance); + await StartAndAwaitStartupTickAsync( + svc1, + async () => + { + var current = await ReadMaxBoundaryAsync(sp); + return current == expectedAdded; + }, + timeout: TimeSpan.FromSeconds(15)); + await svc1.StopAsync(CancellationToken.None); + svc1.Dispose(); + + var maxAfterFirst = await ReadMaxBoundaryAsync(sp); + Assert.Equal(expectedAdded, maxAfterFirst); + + // Second run with the SAME lookahead value. Because the boundary + // is already covered, the EnsureLookaheadAsync call must be a + // no-op — max boundary is unchanged AND no exception is thrown. + var svc2 = new AuditLogPartitionMaintenanceService( + sp.GetRequiredService(), + opts, + NullLogger.Instance); + await svc2.StartAsync(CancellationToken.None); + // Wait long enough that the startup tick would have fired and + // logged any boundary addition; the boundary state must remain + // unchanged after the wait. + await Task.Delay(TimeSpan.FromSeconds(2)); + await svc2.StopAsync(CancellationToken.None); + svc2.Dispose(); + + var maxAfterSecond = await ReadMaxBoundaryAsync(sp); + Assert.Equal(maxAfterFirst, maxAfterSecond); + } +}