feat(auditlog): combined telemetry dual-write transaction (#23 M3)

This commit is contained in:
Joseph Doherty
2026-05-20 14:33:14 -04:00
parent 2b54290c7f
commit 0a97fff906
6 changed files with 777 additions and 0 deletions

View File

@@ -4,6 +4,7 @@ using Microsoft.Extensions.Logging;
using ScadaLink.Commons.Entities.Audit;
using ScadaLink.Commons.Interfaces.Repositories;
using ScadaLink.Commons.Messages.Audit;
using ScadaLink.ConfigurationDatabase;
namespace ScadaLink.AuditLog.Central;
@@ -61,6 +62,11 @@ public class AuditLogIngestActor : ReceiveActor
_logger = logger;
ReceiveAsync<IngestAuditEventsCommand>(OnIngestAsync);
// The single-repository test ctor cannot service the M3 dual-write —
// it has no SiteCalls repo and no DbContext. The handler still
// registers (so callers don't dead-letter) but replies empty so the
// test surface stays explicit about what this ctor supports.
ReceiveAsync<IngestCachedTelemetryCommand>(OnCachedTelemetryWithoutDualWriteAsync);
}
/// <summary>
@@ -81,6 +87,7 @@ public class AuditLogIngestActor : ReceiveActor
_logger = logger;
ReceiveAsync<IngestAuditEventsCommand>(OnIngestAsync);
ReceiveAsync<IngestCachedTelemetryCommand>(OnCachedTelemetryAsync);
}
/// <summary>
@@ -150,4 +157,98 @@ public class AuditLogIngestActor : ReceiveActor
replyTo.Tell(new IngestAuditEventsReply(accepted));
}
/// <summary>
/// M3 dual-write handler. For every <see cref="CachedTelemetryEntry"/> the
/// actor opens a fresh MS SQL transaction, inserts the AuditLog row
/// idempotently AND upserts the SiteCalls row monotonically. Both succeed
/// or both roll back, so the audit and operational mirrors never drift
/// mid-row. The IngestedAtUtc stamp is unified between the two rows so a
/// downstream join lines up cleanly.
/// </summary>
/// <remarks>
/// Per-entry isolation — one entry's failed transaction does NOT abort
/// other entries in the batch (each gets its own
/// <see cref="Microsoft.EntityFrameworkCore.RelationalDatabaseFacadeExtensions.BeginTransactionAsync"/>
/// scope and a try/catch around it). Audit-write failure NEVER aborts the
/// user-facing action — the site keeps the row Pending and retries on the
/// next drain.
/// </remarks>
private async Task OnCachedTelemetryAsync(IngestCachedTelemetryCommand cmd)
{
var replyTo = Sender;
var accepted = new List<Guid>(cmd.Entries.Count);
try
{
await using var scope = _serviceProvider!.CreateAsyncScope();
var auditRepo = scope.ServiceProvider.GetRequiredService<IAuditLogRepository>();
var siteCallRepo = scope.ServiceProvider.GetRequiredService<ISiteCallAuditRepository>();
var dbContext = scope.ServiceProvider.GetRequiredService<ScadaLinkDbContext>();
foreach (var entry in cmd.Entries)
{
try
{
await using var tx = await dbContext.Database
.BeginTransactionAsync()
.ConfigureAwait(false);
// Stamp IngestedAtUtc on both rows from a single
// central-side instant so a join on the two tables sees
// matching timestamps (debugging convenience, not a
// correctness invariant).
var ingestedAt = DateTime.UtcNow;
var auditStamped = entry.Audit with { IngestedAtUtc = ingestedAt };
var siteCallStamped = entry.SiteCall with { IngestedAtUtc = ingestedAt };
await auditRepo.InsertIfNotExistsAsync(auditStamped)
.ConfigureAwait(false);
await siteCallRepo.UpsertAsync(siteCallStamped)
.ConfigureAwait(false);
await tx.CommitAsync().ConfigureAwait(false);
accepted.Add(entry.Audit.EventId);
}
catch (Exception ex)
{
// Both rows rolled back via the disposing transaction. The
// EventId is NOT added to `accepted` so the site keeps its
// row Pending and retries on the next drain. Other entries
// in the batch continue with their own transactions.
_logger.LogError(
ex,
"Combined telemetry dual-write failed for AuditEvent {EventId} / TrackedOperationId {TrackedOpId}; rolled back.",
entry.Audit.EventId,
entry.SiteCall.TrackedOperationId);
}
}
}
catch (Exception ex)
{
// Resolving the scope itself threw (e.g. DI mis-wiring). Log and
// reply with whatever we managed to accept (likely empty) — the
// central singleton MUST stay alive.
_logger.LogError(
ex,
"Combined telemetry batch ingest failed before per-entry processing.");
}
replyTo.Tell(new IngestCachedTelemetryReply(accepted));
}
/// <summary>
/// Fallback handler installed on the single-repository test ctor — that
/// ctor has no DbContext and no <see cref="ISiteCallAuditRepository"/>, so
/// it cannot service the dual-write. Logs a warning and replies with an
/// empty ack so callers fall through to their retry path.
/// </summary>
private Task OnCachedTelemetryWithoutDualWriteAsync(IngestCachedTelemetryCommand cmd)
{
_logger.LogWarning(
"AuditLogIngestActor received IngestCachedTelemetryCommand on the single-repository ctor; dual-write requires the IServiceProvider ctor. Replying with empty ack ({Count} entries).",
cmd.Entries.Count);
Sender.Tell(new IngestCachedTelemetryReply(Array.Empty<Guid>()));
return Task.CompletedTask;
}
}

View File

@@ -0,0 +1,30 @@
using ScadaLink.Commons.Entities.Audit;
namespace ScadaLink.Commons.Messages.Audit;
/// <summary>
/// Akka message sent to the central <c>AuditLogIngestActor</c> (Audit Log #23 M3
/// Bundle D dual-write transaction) carrying a batch of combined audit +
/// site-call telemetry packets decoded by the <c>SiteStreamGrpcServer</c> from a
/// site's <c>IngestCachedTelemetry</c> gRPC RPC. For each entry the actor writes
/// the <see cref="AuditEvent"/> row AND the <see cref="SiteCall"/> upsert inside
/// a single MS SQL transaction — both succeed or both roll back, so the audit
/// and operational mirrors never drift mid-row.
/// </summary>
/// <remarks>
/// Lives in <c>ScadaLink.Commons</c> for the same reason as
/// <c>IngestAuditEventsCommand</c>: the gRPC server in
/// <c>ScadaLink.Communication</c> constructs it and <c>ScadaLink.AuditLog</c>
/// already references Communication. Putting the message in Commons avoids a
/// project-reference cycle.
/// </remarks>
public sealed record IngestCachedTelemetryCommand(IReadOnlyList<CachedTelemetryEntry> Entries);
/// <summary>
/// One lifecycle event of a cached call: the <see cref="AuditEvent"/> to insert
/// (idempotent on <see cref="AuditEvent.EventId"/>) plus the
/// <see cref="SiteCall"/> to upsert (monotonic on
/// <see cref="SiteCall.TrackedOperationId"/>). The two rows are paired so the
/// central dual-write transaction can commit them atomically.
/// </summary>
public sealed record CachedTelemetryEntry(AuditEvent Audit, SiteCall SiteCall);

View File

@@ -0,0 +1,10 @@
namespace ScadaLink.Commons.Messages.Audit;
/// <summary>
/// Reply from the central <c>AuditLogIngestActor</c> for an
/// <see cref="IngestCachedTelemetryCommand"/>. <see cref="AcceptedEventIds"/>
/// lists every entry whose dual-write transaction (AuditLog INSERT + SiteCalls
/// UPSERT) committed; entries whose transaction rolled back are absent so the
/// site can leave the row Pending and retry on the next drain.
/// </summary>
public sealed record IngestCachedTelemetryReply(IReadOnlyList<Guid> AcceptedEventIds);

View File

@@ -6,6 +6,7 @@ using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
using ScadaLink.Commons.Entities.Audit;
using ScadaLink.Commons.Messages.Audit;
using ScadaLink.Commons.Types;
using ScadaLink.Commons.Types.Enums;
using GrpcStatus = Grpc.Core.Status;
@@ -298,9 +299,132 @@ public class SiteStreamGrpcServer : SiteStreamService.SiteStreamServiceBase
return ack;
}
/// <summary>
/// Audit Log (#23) M3 site→central combined-telemetry push RPC. Decodes a
/// batch of <see cref="CachedTelemetryPacket"/> entries into matched
/// (AuditEvent, SiteCall) pairs, Asks the central <c>AuditLogIngestActor</c>
/// proxy to persist them in dual-write transactions, and echoes the
/// AuditEvent EventIds that committed back so the site can flip its local
/// rows to <c>Forwarded</c>.
/// </summary>
/// <remarks>
/// Same wiring-incomplete fallback as <see cref="IngestAuditEvents"/>: when
/// the actor proxy has not been set the RPC replies with an empty ack so
/// sites treat the outcome as transient and retry, never a hard fault.
/// </remarks>
public override async Task<IngestAck> IngestCachedTelemetry(
CachedTelemetryBatch request,
ServerCallContext context)
{
if (request.Packets.Count == 0)
{
return new IngestAck();
}
var actor = _auditIngestActor;
if (actor is null)
{
_logger.LogWarning(
"IngestCachedTelemetry received {Count} packets before SetAuditIngestActor was called; returning empty ack.",
request.Packets.Count);
return new IngestAck();
}
var entries = new List<CachedTelemetryEntry>(request.Packets.Count);
foreach (var packet in request.Packets)
{
var auditEvent = MapAuditEventFromDto(packet.AuditEvent);
var siteCall = MapSiteCallFromDto(packet.Operational);
entries.Add(new CachedTelemetryEntry(auditEvent, siteCall));
}
var cmd = new IngestCachedTelemetryCommand(entries);
IngestCachedTelemetryReply reply;
try
{
reply = await actor.Ask<IngestCachedTelemetryReply>(
cmd, AuditIngestAskTimeout, context.CancellationToken);
}
catch (Exception ex)
{
_logger.LogError(ex,
"AuditLogIngestActor Ask failed for combined telemetry batch of {Count} packets; returning empty ack.",
request.Packets.Count);
return new IngestAck();
}
var ack = new IngestAck();
foreach (var id in reply.AcceptedEventIds)
{
ack.AcceptedEventIds.Add(id.ToString());
}
return ack;
}
private static string? NullIfEmpty(string? value) =>
string.IsNullOrEmpty(value) ? null : value;
/// <summary>
/// Inlined audit-event DTO→entity translation, kept in sync with the
/// <see cref="IngestAuditEvents"/> handler above. Extracted to a private
/// helper so the M3 dual-write RPC can reuse it without duplicating yet
/// another copy. The shape still mirrors
/// <c>AuditEventMapper.FromDto</c> in <c>ScadaLink.AuditLog.Telemetry</c>;
/// the two must evolve together (the project-reference cycle that
/// prevents calling the AuditLog mapper directly is documented on
/// <see cref="IngestAuditEvents"/>).
/// </summary>
private static AuditEvent MapAuditEventFromDto(AuditEventDto dto) =>
new()
{
EventId = Guid.Parse(dto.EventId),
OccurredAtUtc = DateTime.SpecifyKind(dto.OccurredAtUtc.ToDateTime(), DateTimeKind.Utc),
IngestedAtUtc = null,
Channel = Enum.Parse<AuditChannel>(dto.Channel),
Kind = Enum.Parse<AuditKind>(dto.Kind),
CorrelationId = NullIfEmpty(dto.CorrelationId) is { } cid ? Guid.Parse(cid) : null,
SourceSiteId = NullIfEmpty(dto.SourceSiteId),
SourceInstanceId = NullIfEmpty(dto.SourceInstanceId),
SourceScript = NullIfEmpty(dto.SourceScript),
Actor = NullIfEmpty(dto.Actor),
Target = NullIfEmpty(dto.Target),
Status = Enum.Parse<AuditStatus>(dto.Status),
HttpStatus = dto.HttpStatus,
DurationMs = dto.DurationMs,
ErrorMessage = NullIfEmpty(dto.ErrorMessage),
ErrorDetail = NullIfEmpty(dto.ErrorDetail),
RequestSummary = NullIfEmpty(dto.RequestSummary),
ResponseSummary = NullIfEmpty(dto.ResponseSummary),
PayloadTruncated = dto.PayloadTruncated,
Extra = NullIfEmpty(dto.Extra),
ForwardState = null,
};
/// <summary>
/// Translates a <see cref="SiteCallOperationalDto"/> into the persistence
/// entity. <see cref="SiteCall.IngestedAtUtc"/> is stamped here as a
/// placeholder; the central ingest actor overwrites it inside the
/// dual-write transaction so the AuditLog and SiteCalls rows share one
/// instant.
/// </summary>
private static SiteCall MapSiteCallFromDto(SiteCallOperationalDto dto) => new()
{
TrackedOperationId = TrackedOperationId.Parse(dto.TrackedOperationId),
Channel = dto.Channel,
Target = dto.Target,
SourceSite = dto.SourceSite,
Status = dto.Status,
RetryCount = dto.RetryCount,
LastError = string.IsNullOrEmpty(dto.LastError) ? null : dto.LastError,
HttpStatus = dto.HttpStatus,
CreatedAtUtc = DateTime.SpecifyKind(dto.CreatedAtUtc.ToDateTime(), DateTimeKind.Utc),
UpdatedAtUtc = DateTime.SpecifyKind(dto.UpdatedAtUtc.ToDateTime(), DateTimeKind.Utc),
TerminalAtUtc = dto.TerminalAtUtc is null
? null
: DateTime.SpecifyKind(dto.TerminalAtUtc.ToDateTime(), DateTimeKind.Utc),
IngestedAtUtc = DateTime.UtcNow, // overwritten by AuditLogIngestActor
};
/// <summary>
/// Tracks a single active stream so cleanup only removes its own entry.
/// </summary>