test(auditlog): outage + reconciliation recovery end-to-end (#23 M6)
This commit is contained in:
@@ -0,0 +1,349 @@
|
||||
using Akka.Actor;
|
||||
using Akka.TestKit.Xunit2;
|
||||
using Microsoft.EntityFrameworkCore;
|
||||
using Microsoft.Extensions.DependencyInjection;
|
||||
using Microsoft.Extensions.Logging.Abstractions;
|
||||
using Microsoft.Extensions.Options;
|
||||
using ScadaLink.AuditLog.Central;
|
||||
using ScadaLink.AuditLog.Site;
|
||||
using ScadaLink.Commons.Entities.Audit;
|
||||
using ScadaLink.Commons.Interfaces.Repositories;
|
||||
using ScadaLink.Commons.Interfaces.Services;
|
||||
using ScadaLink.Commons.Messages.Integration;
|
||||
using ScadaLink.Commons.Types.Enums;
|
||||
using ScadaLink.ConfigurationDatabase;
|
||||
using ScadaLink.ConfigurationDatabase.Repositories;
|
||||
using ScadaLink.ConfigurationDatabase.Tests.Migrations;
|
||||
|
||||
namespace ScadaLink.AuditLog.Tests.Integration;
|
||||
|
||||
/// <summary>
|
||||
/// Bundle F (#23 M6-T10) end-to-end test for the central-outage + reconciliation
|
||||
/// recovery loop. Wires the real site SQLite hot-path
|
||||
/// (<see cref="SqliteAuditWriter"/>) and the central <see cref="SiteAuditReconciliationActor"/>
|
||||
/// with an <see cref="AuditLogIngestActor"/> backed by the real
|
||||
/// <see cref="AuditLogRepository"/> on the per-test <see cref="MsSqlMigrationFixture"/>.
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// <para>
|
||||
/// The push path is deliberately omitted here: the brief models a sustained
|
||||
/// central outage where the site queue grows unbounded in Pending, then a
|
||||
/// reconciliation pull eventually drains everything once central comes back.
|
||||
/// We reuse the production <see cref="IPullAuditEventsClient"/> seam (Bundle B)
|
||||
/// with a test-only stub that wraps the same <see cref="ISiteAuditQueue.ReadPendingSinceAsync"/>
|
||||
/// surface a real central-side gRPC client would hit, so the test is exercising
|
||||
/// the actor's pull/ingest/mark-reconciled state machine end-to-end against
|
||||
/// the real repository.
|
||||
/// </para>
|
||||
/// <para>
|
||||
/// The <see cref="CombinedTelemetryHarness"/> from M3 is push-only — it has no
|
||||
/// reconciliation puller — so we build the smaller stub inline rather than
|
||||
/// retrofitting the shared harness with a code path it doesn't otherwise
|
||||
/// need.
|
||||
/// </para>
|
||||
/// </remarks>
|
||||
public class OutageReconciliationTests : TestKit, IClassFixture<MsSqlMigrationFixture>
|
||||
{
|
||||
private readonly MsSqlMigrationFixture _fixture;
|
||||
|
||||
public OutageReconciliationTests(MsSqlMigrationFixture fixture)
|
||||
{
|
||||
_fixture = fixture;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Test-only <see cref="IPullAuditEventsClient"/> that mirrors how the
|
||||
/// production central-side gRPC client will hit the site: read a batch
|
||||
/// from <see cref="ISiteAuditQueue.ReadPendingSinceAsync"/>, then commit
|
||||
/// via <see cref="ISiteAuditQueue.MarkReconciledAsync"/> once the central
|
||||
/// repository accepts the rows. The Ask-based central path is wired by
|
||||
/// the caller — we just expose the queue surface.
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// The production wire shape will be:
|
||||
/// central PullAuditEvents RPC → site SiteStreamGrpcServer.PullAuditEvents
|
||||
/// → ISiteAuditQueue.ReadPendingSinceAsync → marshal proto → reply
|
||||
/// followed by central InsertIfNotExistsAsync per row, then the site flips
|
||||
/// the row to Reconciled on the next pull cycle. The stub collapses the
|
||||
/// two halves (pull + commit) because the actor under test (the
|
||||
/// reconciliation actor) is the side that drives both via the
|
||||
/// IPullAuditEventsClient seam — committing back to the site after the
|
||||
/// repository write is the reconciliation-actor invariant we want to
|
||||
/// observe end-to-end.
|
||||
/// </remarks>
|
||||
private sealed class QueueBackedPullClient : IPullAuditEventsClient
|
||||
{
|
||||
private readonly ISiteAuditQueue _siteQueue;
|
||||
public int CallCount { get; private set; }
|
||||
|
||||
public QueueBackedPullClient(ISiteAuditQueue siteQueue)
|
||||
{
|
||||
_siteQueue = siteQueue ?? throw new ArgumentNullException(nameof(siteQueue));
|
||||
}
|
||||
|
||||
public async Task<PullAuditEventsResponse> PullAsync(
|
||||
string siteId, DateTime sinceUtc, int batchSize, CancellationToken ct)
|
||||
{
|
||||
CallCount++;
|
||||
|
||||
var rows = await _siteQueue
|
||||
.ReadPendingSinceAsync(sinceUtc, batchSize, ct)
|
||||
.ConfigureAwait(false);
|
||||
|
||||
// Commit immediately on the site side — once the actor has the
|
||||
// batch in hand it will InsertIfNotExistsAsync centrally; if the
|
||||
// central insert later throws on a specific row, idempotency
|
||||
// guarantees the next pull cycle does NOT re-fetch the row (it's
|
||||
// already Reconciled on the site) but also does not surface the
|
||||
// failure here. The brief calls this "ack-after-persist" — the
|
||||
// production gRPC server will flip to Reconciled inside its
|
||||
// PullAuditEvents handler after the central side has acknowledged
|
||||
// (per Bundle A's race-fix, central is idempotent on EventId).
|
||||
//
|
||||
// MoreAvailable is true iff the read filled the batch — the actor
|
||||
// uses this to decide whether to follow up on the next tick.
|
||||
if (rows.Count > 0)
|
||||
{
|
||||
var ids = rows.Select(e => e.EventId).ToList();
|
||||
await _siteQueue.MarkReconciledAsync(ids, ct).ConfigureAwait(false);
|
||||
}
|
||||
|
||||
return new PullAuditEventsResponse(rows, MoreAvailable: rows.Count >= batchSize);
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// In-memory enumerator returning a fixed single-site list — mirrors the
|
||||
/// pattern used in <c>SiteAuditReconciliationActorTests</c>.
|
||||
/// </summary>
|
||||
private sealed class StaticEnumerator : ISiteEnumerator
|
||||
{
|
||||
private readonly IReadOnlyList<SiteEntry> _sites;
|
||||
public StaticEnumerator(params SiteEntry[] sites) => _sites = sites;
|
||||
public Task<IReadOnlyList<SiteEntry>> EnumerateAsync(CancellationToken ct = default) =>
|
||||
Task.FromResult(_sites);
|
||||
}
|
||||
|
||||
private ScadaLinkDbContext CreateContext() =>
|
||||
new(new DbContextOptionsBuilder<ScadaLinkDbContext>()
|
||||
.UseSqlServer(_fixture.ConnectionString).Options);
|
||||
|
||||
private static AuditEvent NewEvent(string siteId, DateTime occurredAt) => new()
|
||||
{
|
||||
EventId = Guid.NewGuid(),
|
||||
OccurredAtUtc = occurredAt,
|
||||
Channel = AuditChannel.ApiOutbound,
|
||||
Kind = AuditKind.ApiCall,
|
||||
Status = AuditStatus.Delivered,
|
||||
SourceSiteId = siteId,
|
||||
Target = "external-system-a/method",
|
||||
};
|
||||
|
||||
private SqliteAuditWriter CreateInMemorySqliteWriter() =>
|
||||
new SqliteAuditWriter(
|
||||
Options.Create(new SqliteAuditWriterOptions
|
||||
{
|
||||
DatabasePath = "ignored",
|
||||
BatchSize = 64,
|
||||
ChannelCapacity = 4096,
|
||||
}),
|
||||
NullLogger<SqliteAuditWriter>.Instance,
|
||||
connectionStringOverride:
|
||||
$"Data Source=file:outage-{Guid.NewGuid():N}?mode=memory&cache=shared");
|
||||
|
||||
private (IServiceProvider Sp, IActorRef Ingest) BuildCentralPipeline()
|
||||
{
|
||||
var services = new ServiceCollection();
|
||||
services.AddDbContext<ScadaLinkDbContext>(opts =>
|
||||
opts.UseSqlServer(_fixture.ConnectionString));
|
||||
services.AddScoped<IAuditLogRepository>(sp =>
|
||||
new AuditLogRepository(sp.GetRequiredService<ScadaLinkDbContext>()));
|
||||
var sp = services.BuildServiceProvider();
|
||||
|
||||
var ingest = Sys.ActorOf(Props.Create(() => new AuditLogIngestActor(
|
||||
sp,
|
||||
NullLogger<AuditLogIngestActor>.Instance)));
|
||||
return (sp, ingest);
|
||||
}
|
||||
|
||||
private static SiteAuditReconciliationOptions FastTickOptions(int batchSize = 256) => new()
|
||||
{
|
||||
ReconciliationIntervalSeconds = 300,
|
||||
ReconciliationIntervalOverride = TimeSpan.FromMilliseconds(100),
|
||||
BatchSize = batchSize,
|
||||
StalledAfterNonDrainingCycles = 2,
|
||||
};
|
||||
|
||||
// ---------------------------------------------------------------------
|
||||
// 1. CentralOutage_200Events_Buffer_Then_Reconciliation_Catches_Up_NoDuplicates
|
||||
// ---------------------------------------------------------------------
|
||||
|
||||
[SkippableFact]
|
||||
public async Task CentralOutage_200Events_Buffer_Then_Reconciliation_Catches_Up_NoDuplicates()
|
||||
{
|
||||
Skip.IfNot(_fixture.Available, _fixture.SkipReason);
|
||||
|
||||
var siteId = "outage-recon-" + Guid.NewGuid().ToString("N").Substring(0, 8);
|
||||
|
||||
// Step 1: site accumulates 200 audit events during the simulated
|
||||
// central outage. The push path is NOT wired here — every row stays
|
||||
// Pending in the site SQLite store until reconciliation runs.
|
||||
await using var sqliteWriter = CreateInMemorySqliteWriter();
|
||||
var baseOccurred = new DateTime(2026, 5, 20, 12, 0, 0, DateTimeKind.Utc);
|
||||
const int totalEvents = 200;
|
||||
var written = new List<AuditEvent>(totalEvents);
|
||||
|
||||
for (int i = 0; i < totalEvents; i++)
|
||||
{
|
||||
// Strictly monotonic OccurredAtUtc so the cursor can advance
|
||||
// deterministically batch-by-batch — mirrors how a real script
|
||||
// workload generates timestamps in wall-clock order.
|
||||
var evt = NewEvent(siteId, baseOccurred.AddMilliseconds(i));
|
||||
written.Add(evt);
|
||||
await sqliteWriter.WriteAsync(evt);
|
||||
}
|
||||
|
||||
// Sanity: every row is Pending (no push path wired, so nothing has
|
||||
// been Forwarded or Reconciled yet).
|
||||
var pending = await sqliteWriter.ReadPendingAsync(totalEvents + 10);
|
||||
Assert.Equal(totalEvents, pending.Count);
|
||||
|
||||
// Step 2: central comes online — wire the ingest actor + reconciliation
|
||||
// actor. The pull client wraps the site queue directly (the production
|
||||
// shape is one RPC call); each pull advances the actor's cursor and
|
||||
// flips rows on the site to Reconciled.
|
||||
var (sp, ingest) = BuildCentralPipeline();
|
||||
await using (sp as IAsyncDisposable ?? throw new InvalidOperationException())
|
||||
{
|
||||
var pullClient = new QueueBackedPullClient(sqliteWriter);
|
||||
var enumerator = new StaticEnumerator(new SiteEntry(siteId, "http://test:8083"));
|
||||
|
||||
// BatchSize = 64 so the actor needs ~4 ticks to drain 200 rows.
|
||||
// The "after 5 minutes" wording in the brief is satisfied by the
|
||||
// fast-tick override (100 ms per tick) plus AwaitAssert giving
|
||||
// the actor up to ~30 seconds to settle in real time.
|
||||
var opts = FastTickOptions(batchSize: 64);
|
||||
|
||||
// Standalone DI scope for the reconciliation actor (it shares the
|
||||
// ingest actor's IServiceProvider so both writers see the same
|
||||
// EF context configuration).
|
||||
var reconciliationActor = Sys.ActorOf(Props.Create(() => new SiteAuditReconciliationActor(
|
||||
enumerator,
|
||||
pullClient,
|
||||
sp,
|
||||
Options.Create(opts),
|
||||
NullLogger<SiteAuditReconciliationActor>.Instance)));
|
||||
|
||||
// Step 3: assert central AuditLog has all 200 rows after the
|
||||
// actor drains. Polling the real MSSQL repository — the test
|
||||
// fixture has its own database so a count restricted to this
|
||||
// SourceSiteId is exact.
|
||||
await AwaitAssertAsync(async () =>
|
||||
{
|
||||
await using var ctx = CreateContext();
|
||||
var count = await ctx.Set<AuditEvent>()
|
||||
.Where(e => e.SourceSiteId == siteId)
|
||||
.CountAsync();
|
||||
Assert.Equal(totalEvents, count);
|
||||
},
|
||||
duration: TimeSpan.FromSeconds(30),
|
||||
interval: TimeSpan.FromMilliseconds(200));
|
||||
|
||||
// Step 4: assert site rows flipped to Reconciled.
|
||||
// ReadPendingAsync only returns Pending rows; after a full drain
|
||||
// it must be empty.
|
||||
await AwaitAssertAsync(async () =>
|
||||
{
|
||||
var stillPending = await sqliteWriter.ReadPendingAsync(totalEvents + 10);
|
||||
Assert.Empty(stillPending);
|
||||
},
|
||||
duration: TimeSpan.FromSeconds(10),
|
||||
interval: TimeSpan.FromMilliseconds(100));
|
||||
|
||||
// Step 5: assert no duplicates by EventId — central must have
|
||||
// exactly the 200 rows we wrote at the site (one row per EventId).
|
||||
await using var verify = CreateContext();
|
||||
var centralIds = await verify.Set<AuditEvent>()
|
||||
.Where(e => e.SourceSiteId == siteId)
|
||||
.Select(e => e.EventId)
|
||||
.ToListAsync();
|
||||
Assert.Equal(totalEvents, centralIds.Count);
|
||||
Assert.Equal(totalEvents, centralIds.Distinct().Count());
|
||||
// And every EventId we wrote at the site is present centrally.
|
||||
Assert.True(written.All(w => centralIds.Contains(w.EventId)),
|
||||
"every site-written EventId should be present centrally.");
|
||||
|
||||
// Tear the actor down before disposing the harness; the actor's
|
||||
// PostStop cancels its scheduled timer.
|
||||
Sys.Stop(reconciliationActor);
|
||||
}
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------
|
||||
// 2. ReconciliationPull_Idempotent_Across_Two_Cycles
|
||||
// ---------------------------------------------------------------------
|
||||
|
||||
[SkippableFact]
|
||||
public async Task ReconciliationPull_Idempotent_Across_Two_Cycles()
|
||||
{
|
||||
Skip.IfNot(_fixture.Available, _fixture.SkipReason);
|
||||
|
||||
var siteId = "outage-idem-" + Guid.NewGuid().ToString("N").Substring(0, 8);
|
||||
const int totalEvents = 50;
|
||||
|
||||
await using var sqliteWriter = CreateInMemorySqliteWriter();
|
||||
var baseOccurred = new DateTime(2026, 5, 20, 13, 0, 0, DateTimeKind.Utc);
|
||||
for (int i = 0; i < totalEvents; i++)
|
||||
{
|
||||
await sqliteWriter.WriteAsync(NewEvent(siteId, baseOccurred.AddMilliseconds(i)));
|
||||
}
|
||||
|
||||
var (sp, _) = BuildCentralPipeline();
|
||||
await using (sp as IAsyncDisposable ?? throw new InvalidOperationException())
|
||||
{
|
||||
var pullClient = new QueueBackedPullClient(sqliteWriter);
|
||||
var enumerator = new StaticEnumerator(new SiteEntry(siteId, "http://test:8083"));
|
||||
|
||||
var reconciliationActor = Sys.ActorOf(Props.Create(() => new SiteAuditReconciliationActor(
|
||||
enumerator,
|
||||
pullClient,
|
||||
sp,
|
||||
Options.Create(FastTickOptions()),
|
||||
NullLogger<SiteAuditReconciliationActor>.Instance)));
|
||||
|
||||
// Wait for the first drain cycle to complete.
|
||||
await AwaitAssertAsync(async () =>
|
||||
{
|
||||
await using var ctx = CreateContext();
|
||||
var count = await ctx.Set<AuditEvent>()
|
||||
.Where(e => e.SourceSiteId == siteId)
|
||||
.CountAsync();
|
||||
Assert.Equal(totalEvents, count);
|
||||
},
|
||||
duration: TimeSpan.FromSeconds(30),
|
||||
interval: TimeSpan.FromMilliseconds(200));
|
||||
|
||||
// Wait for additional pull cycles to fire — the actor ticks every
|
||||
// 100 ms so a 1 s settle leaves the actor with at least ~5 ticks
|
||||
// past the initial drain. Each subsequent tick must be a no-op
|
||||
// because every row is now Reconciled and outside the
|
||||
// ReadPendingSinceAsync filter.
|
||||
var callsAfterDrain = pullClient.CallCount;
|
||||
await Task.Delay(TimeSpan.FromMilliseconds(800));
|
||||
Assert.True(pullClient.CallCount > callsAfterDrain,
|
||||
$"expected additional pull calls after drain to validate idempotency, got {pullClient.CallCount} after {callsAfterDrain}");
|
||||
|
||||
// Central count must still be exactly totalEvents — no duplicates
|
||||
// even though the cursor + read-Reconciled-too semantics could
|
||||
// theoretically re-fetch on the second cycle.
|
||||
await using var verify = CreateContext();
|
||||
var rows = await verify.Set<AuditEvent>()
|
||||
.Where(e => e.SourceSiteId == siteId)
|
||||
.ToListAsync();
|
||||
Assert.Equal(totalEvents, rows.Count);
|
||||
Assert.Equal(totalEvents, rows.Select(r => r.EventId).Distinct().Count());
|
||||
|
||||
Sys.Stop(reconciliationActor);
|
||||
}
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user