feat(auditlog): combined telemetry dual-write transaction (#23 M3)
This commit is contained in:
@@ -4,6 +4,7 @@ using Microsoft.Extensions.Logging;
|
||||
using ScadaLink.Commons.Entities.Audit;
|
||||
using ScadaLink.Commons.Interfaces.Repositories;
|
||||
using ScadaLink.Commons.Messages.Audit;
|
||||
using ScadaLink.ConfigurationDatabase;
|
||||
|
||||
namespace ScadaLink.AuditLog.Central;
|
||||
|
||||
@@ -61,6 +62,11 @@ public class AuditLogIngestActor : ReceiveActor
|
||||
_logger = logger;
|
||||
|
||||
ReceiveAsync<IngestAuditEventsCommand>(OnIngestAsync);
|
||||
// The single-repository test ctor cannot service the M3 dual-write —
|
||||
// it has no SiteCalls repo and no DbContext. The handler still
|
||||
// registers (so callers don't dead-letter) but replies empty so the
|
||||
// test surface stays explicit about what this ctor supports.
|
||||
ReceiveAsync<IngestCachedTelemetryCommand>(OnCachedTelemetryWithoutDualWriteAsync);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
@@ -81,6 +87,7 @@ public class AuditLogIngestActor : ReceiveActor
|
||||
_logger = logger;
|
||||
|
||||
ReceiveAsync<IngestAuditEventsCommand>(OnIngestAsync);
|
||||
ReceiveAsync<IngestCachedTelemetryCommand>(OnCachedTelemetryAsync);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
@@ -150,4 +157,98 @@ public class AuditLogIngestActor : ReceiveActor
|
||||
|
||||
replyTo.Tell(new IngestAuditEventsReply(accepted));
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// M3 dual-write handler. For every <see cref="CachedTelemetryEntry"/> the
|
||||
/// actor opens a fresh MS SQL transaction, inserts the AuditLog row
|
||||
/// idempotently AND upserts the SiteCalls row monotonically. Both succeed
|
||||
/// or both roll back, so the audit and operational mirrors never drift
|
||||
/// mid-row. The IngestedAtUtc stamp is unified between the two rows so a
|
||||
/// downstream join lines up cleanly.
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// Per-entry isolation — one entry's failed transaction does NOT abort
|
||||
/// other entries in the batch (each gets its own
|
||||
/// <see cref="Microsoft.EntityFrameworkCore.RelationalDatabaseFacadeExtensions.BeginTransactionAsync"/>
|
||||
/// scope and a try/catch around it). Audit-write failure NEVER aborts the
|
||||
/// user-facing action — the site keeps the row Pending and retries on the
|
||||
/// next drain.
|
||||
/// </remarks>
|
||||
private async Task OnCachedTelemetryAsync(IngestCachedTelemetryCommand cmd)
|
||||
{
|
||||
var replyTo = Sender;
|
||||
var accepted = new List<Guid>(cmd.Entries.Count);
|
||||
|
||||
try
|
||||
{
|
||||
await using var scope = _serviceProvider!.CreateAsyncScope();
|
||||
var auditRepo = scope.ServiceProvider.GetRequiredService<IAuditLogRepository>();
|
||||
var siteCallRepo = scope.ServiceProvider.GetRequiredService<ISiteCallAuditRepository>();
|
||||
var dbContext = scope.ServiceProvider.GetRequiredService<ScadaLinkDbContext>();
|
||||
|
||||
foreach (var entry in cmd.Entries)
|
||||
{
|
||||
try
|
||||
{
|
||||
await using var tx = await dbContext.Database
|
||||
.BeginTransactionAsync()
|
||||
.ConfigureAwait(false);
|
||||
|
||||
// Stamp IngestedAtUtc on both rows from a single
|
||||
// central-side instant so a join on the two tables sees
|
||||
// matching timestamps (debugging convenience, not a
|
||||
// correctness invariant).
|
||||
var ingestedAt = DateTime.UtcNow;
|
||||
var auditStamped = entry.Audit with { IngestedAtUtc = ingestedAt };
|
||||
var siteCallStamped = entry.SiteCall with { IngestedAtUtc = ingestedAt };
|
||||
|
||||
await auditRepo.InsertIfNotExistsAsync(auditStamped)
|
||||
.ConfigureAwait(false);
|
||||
await siteCallRepo.UpsertAsync(siteCallStamped)
|
||||
.ConfigureAwait(false);
|
||||
|
||||
await tx.CommitAsync().ConfigureAwait(false);
|
||||
accepted.Add(entry.Audit.EventId);
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
// Both rows rolled back via the disposing transaction. The
|
||||
// EventId is NOT added to `accepted` so the site keeps its
|
||||
// row Pending and retries on the next drain. Other entries
|
||||
// in the batch continue with their own transactions.
|
||||
_logger.LogError(
|
||||
ex,
|
||||
"Combined telemetry dual-write failed for AuditEvent {EventId} / TrackedOperationId {TrackedOpId}; rolled back.",
|
||||
entry.Audit.EventId,
|
||||
entry.SiteCall.TrackedOperationId);
|
||||
}
|
||||
}
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
// Resolving the scope itself threw (e.g. DI mis-wiring). Log and
|
||||
// reply with whatever we managed to accept (likely empty) — the
|
||||
// central singleton MUST stay alive.
|
||||
_logger.LogError(
|
||||
ex,
|
||||
"Combined telemetry batch ingest failed before per-entry processing.");
|
||||
}
|
||||
|
||||
replyTo.Tell(new IngestCachedTelemetryReply(accepted));
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Fallback handler installed on the single-repository test ctor — that
|
||||
/// ctor has no DbContext and no <see cref="ISiteCallAuditRepository"/>, so
|
||||
/// it cannot service the dual-write. Logs a warning and replies with an
|
||||
/// empty ack so callers fall through to their retry path.
|
||||
/// </summary>
|
||||
private Task OnCachedTelemetryWithoutDualWriteAsync(IngestCachedTelemetryCommand cmd)
|
||||
{
|
||||
_logger.LogWarning(
|
||||
"AuditLogIngestActor received IngestCachedTelemetryCommand on the single-repository ctor; dual-write requires the IServiceProvider ctor. Replying with empty ack ({Count} entries).",
|
||||
cmd.Entries.Count);
|
||||
Sender.Tell(new IngestCachedTelemetryReply(Array.Empty<Guid>()));
|
||||
return Task.CompletedTask;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -0,0 +1,30 @@
|
||||
using ScadaLink.Commons.Entities.Audit;
|
||||
|
||||
namespace ScadaLink.Commons.Messages.Audit;
|
||||
|
||||
/// <summary>
|
||||
/// Akka message sent to the central <c>AuditLogIngestActor</c> (Audit Log #23 M3
|
||||
/// Bundle D dual-write transaction) carrying a batch of combined audit +
|
||||
/// site-call telemetry packets decoded by the <c>SiteStreamGrpcServer</c> from a
|
||||
/// site's <c>IngestCachedTelemetry</c> gRPC RPC. For each entry the actor writes
|
||||
/// the <see cref="AuditEvent"/> row AND the <see cref="SiteCall"/> upsert inside
|
||||
/// a single MS SQL transaction — both succeed or both roll back, so the audit
|
||||
/// and operational mirrors never drift mid-row.
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// Lives in <c>ScadaLink.Commons</c> for the same reason as
|
||||
/// <c>IngestAuditEventsCommand</c>: the gRPC server in
|
||||
/// <c>ScadaLink.Communication</c> constructs it and <c>ScadaLink.AuditLog</c>
|
||||
/// already references Communication. Putting the message in Commons avoids a
|
||||
/// project-reference cycle.
|
||||
/// </remarks>
|
||||
public sealed record IngestCachedTelemetryCommand(IReadOnlyList<CachedTelemetryEntry> Entries);
|
||||
|
||||
/// <summary>
|
||||
/// One lifecycle event of a cached call: the <see cref="AuditEvent"/> to insert
|
||||
/// (idempotent on <see cref="AuditEvent.EventId"/>) plus the
|
||||
/// <see cref="SiteCall"/> to upsert (monotonic on
|
||||
/// <see cref="SiteCall.TrackedOperationId"/>). The two rows are paired so the
|
||||
/// central dual-write transaction can commit them atomically.
|
||||
/// </summary>
|
||||
public sealed record CachedTelemetryEntry(AuditEvent Audit, SiteCall SiteCall);
|
||||
@@ -0,0 +1,10 @@
|
||||
namespace ScadaLink.Commons.Messages.Audit;
|
||||
|
||||
/// <summary>
|
||||
/// Reply from the central <c>AuditLogIngestActor</c> for an
|
||||
/// <see cref="IngestCachedTelemetryCommand"/>. <see cref="AcceptedEventIds"/>
|
||||
/// lists every entry whose dual-write transaction (AuditLog INSERT + SiteCalls
|
||||
/// UPSERT) committed; entries whose transaction rolled back are absent so the
|
||||
/// site can leave the row Pending and retry on the next drain.
|
||||
/// </summary>
|
||||
public sealed record IngestCachedTelemetryReply(IReadOnlyList<Guid> AcceptedEventIds);
|
||||
@@ -6,6 +6,7 @@ using Microsoft.Extensions.Logging;
|
||||
using Microsoft.Extensions.Options;
|
||||
using ScadaLink.Commons.Entities.Audit;
|
||||
using ScadaLink.Commons.Messages.Audit;
|
||||
using ScadaLink.Commons.Types;
|
||||
using ScadaLink.Commons.Types.Enums;
|
||||
using GrpcStatus = Grpc.Core.Status;
|
||||
|
||||
@@ -298,9 +299,132 @@ public class SiteStreamGrpcServer : SiteStreamService.SiteStreamServiceBase
|
||||
return ack;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Audit Log (#23) M3 site→central combined-telemetry push RPC. Decodes a
|
||||
/// batch of <see cref="CachedTelemetryPacket"/> entries into matched
|
||||
/// (AuditEvent, SiteCall) pairs, Asks the central <c>AuditLogIngestActor</c>
|
||||
/// proxy to persist them in dual-write transactions, and echoes the
|
||||
/// AuditEvent EventIds that committed back so the site can flip its local
|
||||
/// rows to <c>Forwarded</c>.
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// Same wiring-incomplete fallback as <see cref="IngestAuditEvents"/>: when
|
||||
/// the actor proxy has not been set the RPC replies with an empty ack so
|
||||
/// sites treat the outcome as transient and retry, never a hard fault.
|
||||
/// </remarks>
|
||||
public override async Task<IngestAck> IngestCachedTelemetry(
|
||||
CachedTelemetryBatch request,
|
||||
ServerCallContext context)
|
||||
{
|
||||
if (request.Packets.Count == 0)
|
||||
{
|
||||
return new IngestAck();
|
||||
}
|
||||
|
||||
var actor = _auditIngestActor;
|
||||
if (actor is null)
|
||||
{
|
||||
_logger.LogWarning(
|
||||
"IngestCachedTelemetry received {Count} packets before SetAuditIngestActor was called; returning empty ack.",
|
||||
request.Packets.Count);
|
||||
return new IngestAck();
|
||||
}
|
||||
|
||||
var entries = new List<CachedTelemetryEntry>(request.Packets.Count);
|
||||
foreach (var packet in request.Packets)
|
||||
{
|
||||
var auditEvent = MapAuditEventFromDto(packet.AuditEvent);
|
||||
var siteCall = MapSiteCallFromDto(packet.Operational);
|
||||
entries.Add(new CachedTelemetryEntry(auditEvent, siteCall));
|
||||
}
|
||||
|
||||
var cmd = new IngestCachedTelemetryCommand(entries);
|
||||
IngestCachedTelemetryReply reply;
|
||||
try
|
||||
{
|
||||
reply = await actor.Ask<IngestCachedTelemetryReply>(
|
||||
cmd, AuditIngestAskTimeout, context.CancellationToken);
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
_logger.LogError(ex,
|
||||
"AuditLogIngestActor Ask failed for combined telemetry batch of {Count} packets; returning empty ack.",
|
||||
request.Packets.Count);
|
||||
return new IngestAck();
|
||||
}
|
||||
|
||||
var ack = new IngestAck();
|
||||
foreach (var id in reply.AcceptedEventIds)
|
||||
{
|
||||
ack.AcceptedEventIds.Add(id.ToString());
|
||||
}
|
||||
return ack;
|
||||
}
|
||||
|
||||
private static string? NullIfEmpty(string? value) =>
|
||||
string.IsNullOrEmpty(value) ? null : value;
|
||||
|
||||
/// <summary>
|
||||
/// Inlined audit-event DTO→entity translation, kept in sync with the
|
||||
/// <see cref="IngestAuditEvents"/> handler above. Extracted to a private
|
||||
/// helper so the M3 dual-write RPC can reuse it without duplicating yet
|
||||
/// another copy. The shape still mirrors
|
||||
/// <c>AuditEventMapper.FromDto</c> in <c>ScadaLink.AuditLog.Telemetry</c>;
|
||||
/// the two must evolve together (the project-reference cycle that
|
||||
/// prevents calling the AuditLog mapper directly is documented on
|
||||
/// <see cref="IngestAuditEvents"/>).
|
||||
/// </summary>
|
||||
private static AuditEvent MapAuditEventFromDto(AuditEventDto dto) =>
|
||||
new()
|
||||
{
|
||||
EventId = Guid.Parse(dto.EventId),
|
||||
OccurredAtUtc = DateTime.SpecifyKind(dto.OccurredAtUtc.ToDateTime(), DateTimeKind.Utc),
|
||||
IngestedAtUtc = null,
|
||||
Channel = Enum.Parse<AuditChannel>(dto.Channel),
|
||||
Kind = Enum.Parse<AuditKind>(dto.Kind),
|
||||
CorrelationId = NullIfEmpty(dto.CorrelationId) is { } cid ? Guid.Parse(cid) : null,
|
||||
SourceSiteId = NullIfEmpty(dto.SourceSiteId),
|
||||
SourceInstanceId = NullIfEmpty(dto.SourceInstanceId),
|
||||
SourceScript = NullIfEmpty(dto.SourceScript),
|
||||
Actor = NullIfEmpty(dto.Actor),
|
||||
Target = NullIfEmpty(dto.Target),
|
||||
Status = Enum.Parse<AuditStatus>(dto.Status),
|
||||
HttpStatus = dto.HttpStatus,
|
||||
DurationMs = dto.DurationMs,
|
||||
ErrorMessage = NullIfEmpty(dto.ErrorMessage),
|
||||
ErrorDetail = NullIfEmpty(dto.ErrorDetail),
|
||||
RequestSummary = NullIfEmpty(dto.RequestSummary),
|
||||
ResponseSummary = NullIfEmpty(dto.ResponseSummary),
|
||||
PayloadTruncated = dto.PayloadTruncated,
|
||||
Extra = NullIfEmpty(dto.Extra),
|
||||
ForwardState = null,
|
||||
};
|
||||
|
||||
/// <summary>
|
||||
/// Translates a <see cref="SiteCallOperationalDto"/> into the persistence
|
||||
/// entity. <see cref="SiteCall.IngestedAtUtc"/> is stamped here as a
|
||||
/// placeholder; the central ingest actor overwrites it inside the
|
||||
/// dual-write transaction so the AuditLog and SiteCalls rows share one
|
||||
/// instant.
|
||||
/// </summary>
|
||||
private static SiteCall MapSiteCallFromDto(SiteCallOperationalDto dto) => new()
|
||||
{
|
||||
TrackedOperationId = TrackedOperationId.Parse(dto.TrackedOperationId),
|
||||
Channel = dto.Channel,
|
||||
Target = dto.Target,
|
||||
SourceSite = dto.SourceSite,
|
||||
Status = dto.Status,
|
||||
RetryCount = dto.RetryCount,
|
||||
LastError = string.IsNullOrEmpty(dto.LastError) ? null : dto.LastError,
|
||||
HttpStatus = dto.HttpStatus,
|
||||
CreatedAtUtc = DateTime.SpecifyKind(dto.CreatedAtUtc.ToDateTime(), DateTimeKind.Utc),
|
||||
UpdatedAtUtc = DateTime.SpecifyKind(dto.UpdatedAtUtc.ToDateTime(), DateTimeKind.Utc),
|
||||
TerminalAtUtc = dto.TerminalAtUtc is null
|
||||
? null
|
||||
: DateTime.SpecifyKind(dto.TerminalAtUtc.ToDateTime(), DateTimeKind.Utc),
|
||||
IngestedAtUtc = DateTime.UtcNow, // overwritten by AuditLogIngestActor
|
||||
};
|
||||
|
||||
/// <summary>
|
||||
/// Tracks a single active stream so cleanup only removes its own entry.
|
||||
/// </summary>
|
||||
|
||||
Reference in New Issue
Block a user