fix(store-and-forward): resolve S&F delivery + replication wiring (3 Critical findings)

Resolves StoreAndForward-001, ExternalSystemGateway-001, NotificationService-001
— one systemic gap where buffered messages were persisted but never delivered,
and the active node never replicated its buffer to the standby.

Delivery handlers (ExternalSystemGateway-001 / NotificationService-001):
- AkkaHostedService registers delivery handlers for the ExternalSystem,
  CachedDbWrite and Notification categories after StoreAndForwardService starts;
  each resolves its scoped consumer in a fresh DI scope.
- ExternalSystemClient, DatabaseGateway and NotificationDeliveryService each
  gain a DeliverBufferedAsync method: re-resolve the target and re-attempt
  delivery, returning true/false/throwing per the transient-vs-permanent contract.
- EnqueueAsync gains an attemptImmediateDelivery flag; CachedCallAsync and
  NotificationDeliveryService.SendAsync pass false (they already attempted
  delivery themselves) so registering a handler does not dispatch twice.

Replication (StoreAndForward-001):
- ReplicationService is injected into StoreAndForwardService; a new BufferAsync
  helper replicates every enqueue, and successful-retry removes and parks are
  replicated too. Fire-and-forget, no-op when replication is disabled.

Tests: StoreAndForwardReplicationTests (Add/Remove/Park observed),
attemptImmediateDelivery behaviour, and DeliverBufferedAsync paths for each
consumer. Full solution builds; StoreAndForward/ExternalSystemGateway/
NotificationService suites green.
This commit is contained in:
Joseph Doherty
2026-05-16 18:58:11 -04:00
parent a9bd7ee37c
commit 61253e3269
15 changed files with 538 additions and 37 deletions

View File

@@ -0,0 +1,108 @@
using Microsoft.Data.Sqlite;
using Microsoft.Extensions.Logging.Abstractions;
using ScadaLink.Commons.Types.Enums;
namespace ScadaLink.StoreAndForward.Tests;
/// <summary>
/// StoreAndForward-001: the active node must forward every buffer operation
/// (add / remove / park) to the standby via the ReplicationService, so a
/// failover does not lose the buffer.
/// </summary>
public class StoreAndForwardReplicationTests : IAsyncLifetime, IDisposable
{
private readonly SqliteConnection _keepAlive;
private readonly StoreAndForwardStorage _storage;
private readonly StoreAndForwardService _service;
private readonly List<ReplicationOperation> _replicated = new();
public StoreAndForwardReplicationTests()
{
var connStr = $"Data Source=ReplTests_{Guid.NewGuid():N};Mode=Memory;Cache=Shared";
_keepAlive = new SqliteConnection(connStr);
_keepAlive.Open();
_storage = new StoreAndForwardStorage(connStr, NullLogger<StoreAndForwardStorage>.Instance);
var options = new StoreAndForwardOptions
{
DefaultRetryInterval = TimeSpan.Zero,
DefaultMaxRetries = 1,
RetryTimerInterval = TimeSpan.FromMinutes(10),
ReplicationEnabled = true,
};
var replication = new ReplicationService(options, NullLogger<ReplicationService>.Instance);
replication.SetReplicationHandler(op =>
{
lock (_replicated) _replicated.Add(op);
return Task.CompletedTask;
});
_service = new StoreAndForwardService(
_storage, options, NullLogger<StoreAndForwardService>.Instance, replication);
}
public async Task InitializeAsync() => await _storage.InitializeAsync();
public Task DisposeAsync() => Task.CompletedTask;
public void Dispose() => _keepAlive.Dispose();
/// <summary>Replication is fire-and-forget (Task.Run); poll until the expected ops arrive.</summary>
private async Task<List<ReplicationOperation>> WaitForReplicationAsync(int count)
{
for (var i = 0; i < 100; i++)
{
lock (_replicated)
if (_replicated.Count >= count) return _replicated.ToList();
await Task.Delay(20);
}
lock (_replicated) return _replicated.ToList();
}
[Fact]
public async Task BufferingAMessage_ReplicatesAnAddOperation()
{
// No handler registered → message is buffered → an Add is replicated.
var result = await _service.EnqueueAsync(
StoreAndForwardCategory.ExternalSystem, "api", """{}""");
Assert.True(result.WasBuffered);
var ops = await WaitForReplicationAsync(1);
Assert.Contains(ops, o =>
o.OperationType == ReplicationOperationType.Add && o.MessageId == result.MessageId);
}
[Fact]
public async Task SuccessfulRetry_ReplicatesARemoveOperation()
{
var calls = 0;
_service.RegisterDeliveryHandler(StoreAndForwardCategory.ExternalSystem,
_ => ++calls == 1
? throw new HttpRequestException("transient")
: Task.FromResult(true));
var result = await _service.EnqueueAsync(
StoreAndForwardCategory.ExternalSystem, "api", """{}""");
await _service.RetryPendingMessagesAsync();
var ops = await WaitForReplicationAsync(2);
Assert.Contains(ops, o => o.OperationType == ReplicationOperationType.Add);
Assert.Contains(ops, o =>
o.OperationType == ReplicationOperationType.Remove && o.MessageId == result.MessageId);
}
[Fact]
public async Task ParkedMessage_ReplicatesAParkOperation()
{
_service.RegisterDeliveryHandler(StoreAndForwardCategory.ExternalSystem,
_ => throw new HttpRequestException("always fails"));
var result = await _service.EnqueueAsync(
StoreAndForwardCategory.ExternalSystem, "api", """{}""", maxRetries: 1);
await _service.RetryPendingMessagesAsync();
var ops = await WaitForReplicationAsync(2);
Assert.Contains(ops, o =>
o.OperationType == ReplicationOperationType.Park && o.MessageId == result.MessageId);
}
}