feat(site): replicate config by id + standby fetch (kills the intra-site frame trap)

This commit is contained in:
Joseph Doherty
2026-06-26 14:13:58 -04:00
parent 631ce5bfce
commit 5c2db9fe70
6 changed files with 349 additions and 26 deletions
@@ -0,0 +1,226 @@
using System.Collections.Concurrent;
using Akka.Actor;
using Akka.TestKit.Xunit2;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Logging.Abstractions;
using ZB.MOM.WW.ScadaBridge.SiteRuntime.Actors;
using ZB.MOM.WW.ScadaBridge.SiteRuntime.Deployment;
using ZB.MOM.WW.ScadaBridge.SiteRuntime.Messages;
using ZB.MOM.WW.ScadaBridge.SiteRuntime.Persistence;
using ZB.MOM.WW.ScadaBridge.StoreAndForward;
namespace ZB.MOM.WW.ScadaBridge.SiteRuntime.Tests.Actors;
/// <summary>
/// Tests for <see cref="SiteReplicationActor"/>'s notify-and-fetch config replication:
/// the active node now replicates an id-only <see cref="ReplicateConfigDeploy"/> (no inline
/// config JSON — killing the intra-site 128 KB frame trap), and the standby fetches the
/// config from central over HTTP and writes it with the older-write guard.
/// </summary>
public class SiteReplicationActorTests : TestKit, IDisposable
{
// Cluster provider is required because SiteReplicationActor calls Cluster.Get in its ctor
// and subscribes to cluster events in PreStart. We use the in-memory TestTransport (not
// dot-netty) so no real socket is bound and no DNS lookup happens — the actor only needs
// the cluster extension to load; these tests never form a real two-node cluster.
// NOTE(review): maximum-payload-bytes is 128000b here; presumably chosen to mirror the
// 128 KB frame limit mentioned in the class summary — confirm against production config.
private const string ClusterConfig = @"
akka {
actor { provider = cluster }
remote {
enabled-transports = [""akka.remote.test""]
test {
transport-class = ""Akka.Remote.Transport.TestTransport, Akka.Remote""
applied-adapters = []
registry-key = site-repl-test
local-address = ""test://site-repl@localhost:1""
maximum-payload-bytes = 128000b
scheme-identifier = test
}
}
cluster { roles = [""site-test""] }
loglevel = WARNING
}";
// Role string handed to every actor under test; it is the same value listed under
// cluster.roles in ClusterConfig above.
private const string SiteRole = "site-test";
// Site storage backed by the temp database file at _dbFile (created and initialised in
// the constructor).
private readonly SiteStorageService _storage;
// Store-and-forward storage backed by the temp database file at _sfDbFile.
private readonly StoreAndForwardStorage _sfStorage;
// Replication service built with default StoreAndForwardOptions and a null logger.
private readonly ReplicationService _replicationService;
// Per-test-instance temp file paths (GUID-suffixed); deleted again in Dispose.
private readonly string _dbFile;
private readonly string _sfDbFile;
/// <summary>
/// Per-test setup: starts the TestKit actor system from <see cref="ClusterConfig"/> and
/// builds the storage/replication collaborators over two fresh, GUID-named database files
/// in the temp directory. Both storages are initialised synchronously because an xUnit
/// constructor cannot await.
/// </summary>
public SiteReplicationActorTests() : base(ClusterConfig, "site-repl")
{
    var tempDir = Path.GetTempPath();
    _dbFile = Path.Combine(tempDir, $"site-repl-test-{Guid.NewGuid():N}.db");
    _sfDbFile = Path.Combine(tempDir, $"site-repl-sf-{Guid.NewGuid():N}.db");

    // Site storage first, fully initialised before the store-and-forward storage is built.
    var siteStore = new SiteStorageService(
        $"Data Source={_dbFile}",
        NullLogger<SiteStorageService>.Instance);
    siteStore.InitializeAsync().GetAwaiter().GetResult();
    _storage = siteStore;

    var forwardStore = new StoreAndForwardStorage(
        $"Data Source={_sfDbFile}",
        NullLogger<StoreAndForwardStorage>.Instance);
    forwardStore.InitializeAsync().GetAwaiter().GetResult();
    _sfStorage = forwardStore;

    var options = new StoreAndForwardOptions();
    _replicationService = new ReplicationService(
        options, NullLogger<ReplicationService>.Instance);
}
/// <summary>
/// Per-test teardown: shuts the TestKit actor system down, then removes the temp database
/// files created by the constructor. File cleanup runs in a <c>finally</c> so a failing
/// <c>Shutdown()</c> cannot leak the files.
/// </summary>
void IDisposable.Dispose()
{
    try
    {
        Shutdown();
    }
    finally
    {
        DeleteDatabaseFiles(_dbFile);
        DeleteDatabaseFiles(_sfDbFile);
    }
}

/// <summary>
/// Best-effort removal of a database file and the sidecar files SQLite may place next to it.
/// </summary>
/// <param name="dbFile">Path of the main database file.</param>
private static void DeleteDatabaseFiles(string dbFile)
{
    // "-wal"/"-shm"/"-journal" are SQLite's sidecar suffixes. Whether the storage services
    // actually produce them is not visible from this file; File.Delete is a no-op for a
    // missing file, so the extra attempts are harmless either way.
    foreach (var suffix in new[] { "", "-wal", "-shm", "-journal" })
    {
        try
        {
            File.Delete(dbFile + suffix);
        }
        catch (Exception ex) when (ex is IOException or UnauthorizedAccessException)
        {
            // Cleanup only: a file that is still locked or protected must not fail the test
            // run, and Dispose must not throw.
        }
    }
}
/// <summary>
/// Spawns a <see cref="SiteReplicationActor"/> in the test actor system, wired to this
/// fixture's storages and replication service, with the supplied config fetcher.
/// </summary>
/// <param name="fetcher">Fetcher the actor under test is constructed with.</param>
/// <returns>Reference to the newly created actor.</returns>
private IActorRef CreateReplicationActor(IDeploymentConfigFetcher fetcher)
{
    var props = Props.Create(() => new SiteReplicationActor(
        _storage,
        _sfStorage,
        _replicationService,
        SiteRole,
        NullLogger<SiteReplicationActor>.Instance,
        fetcher));

    return ActorOf(props);
}
/// <summary>
/// An id-only <see cref="ApplyConfigDeploy"/> makes the actor call the fetcher with the
/// message's coordinates and persist the fetched config together with the message's
/// deployment id, revision hash and enabled flag.
/// </summary>
[Fact]
public async Task ApplyConfigDeploy_StandbyFetchesConfigAndGuardedWrites()
{
    // Arrange: a fetcher that always yields the same config JSON.
    const string configJson = "{\"instanceUniqueName\":\"Pump1\"}";
    var fetcher = new FakeConfigFetcher(_ => Task.FromResult(configJson));
    var standby = CreateReplicationActor(fetcher);
    var notification = new ApplyConfigDeploy(
        "Pump1", "dep-100", "sha256:abc", true,
        "http://central:9000", "tok-xyz");

    // Act
    standby.Tell(notification);

    // Assert: the write happens asynchronously, so poll storage until the row shows up.
    await AwaitAssertAsync(async () =>
    {
        var deployed = await _storage.GetAllDeployedConfigsAsync();
        var stored = Assert.Single(deployed, c => c.InstanceUniqueName == "Pump1");
        Assert.Equal(configJson, stored.ConfigJson);
        Assert.Equal("dep-100", stored.DeploymentId);
        Assert.Equal("sha256:abc", stored.RevisionHash);
        Assert.True(stored.IsEnabled);
    }, TimeSpan.FromSeconds(5));

    // Exactly one fetch, made with the coordinates carried by the message.
    var (baseUrl, deploymentId, token) = Assert.Single(fetcher.Calls);
    Assert.Equal("http://central:9000", baseUrl);
    Assert.Equal("dep-100", deploymentId);
    Assert.Equal("tok-xyz", token);
}
/// <summary>
/// When the fetch fails with a superseded <see cref="DeploymentConfigFetchException"/>
/// (the 404 / expired case), the actor must attempt the fetch, stay alive, and write
/// nothing for that instance.
/// </summary>
[Fact]
public async Task ApplyConfigDeploy_Superseded404_SkipsWriteAndActorSurvives()
{
    // Arrange: every fetch fails with a fresh "superseded" exception.
    Func<string, Task<string>> failAsSuperseded = _ =>
        Task.FromException<string>(
            new DeploymentConfigFetchException("expired", isSuperseded: true));
    var fetcher = new FakeConfigFetcher(failAsSuperseded);
    var standby = CreateReplicationActor(fetcher);
    var staleNotification = new ApplyConfigDeploy(
        "GonePump", "dep-stale", "sha256:gone", true,
        "http://central:9000", "tok-stale");

    // Act
    standby.Tell(staleNotification);

    // Assert: the fetch was attempted once...
    await AwaitAssertAsync(() =>
    {
        Assert.Single(fetcher.Calls);
        return Task.CompletedTask;
    }, TimeSpan.FromSeconds(5));

    // ...no Terminated reaches the watcher within the window, i.e. the actor survived...
    Watch(standby);
    ExpectNoMsg(TimeSpan.FromMilliseconds(500));

    // ...and storage holds no row for the superseded instance.
    var deployed = await _storage.GetAllDeployedConfigsAsync();
    Assert.DoesNotContain(deployed, c => c.InstanceUniqueName == "GonePump");
}
/// <summary>
/// An <see cref="ApplyConfigDeploy"/> with empty fetch coordinates must be ignored
/// quietly: the fetcher is never called and nothing is written.
/// </summary>
[Fact]
public async Task ApplyConfigDeploy_EmptyFetchCoords_SkipsFetchAndWrite()
{
    // Arrange: the direct DeployInstanceCommand wire path (retired in Task 14) replicates
    // with empty coords, which is what this message simulates.
    var fetcher = new FakeConfigFetcher(_ => Task.FromResult("never"));
    var standby = CreateReplicationActor(fetcher);
    var coordlessNotification = new ApplyConfigDeploy(
        "NoCoordsPump", "dep-direct", "sha256:nc", true,
        CentralFetchBaseUrl: "", FetchToken: "");

    // Act
    standby.Tell(coordlessNotification);

    // Assert: leave room for any (erroneous) async continuation, then prove nothing ran.
    Watch(standby);
    ExpectNoMsg(TimeSpan.FromMilliseconds(500));
    Assert.Empty(fetcher.Calls);

    var deployed = await _storage.GetAllDeployedConfigsAsync();
    Assert.DoesNotContain(deployed, c => c.InstanceUniqueName == "NoCoordsPump");
}
/// <summary>
/// A <see cref="ReplicateConfigDeploy"/> sent to the actor must go out to the peer as an
/// <see cref="ApplyConfigDeploy"/> carrying the same identifiers and fetch coordinates.
/// </summary>
[Fact]
public void ReplicateConfigDeploy_MapsToIdOnlyApplyConfigDeploy_ForPeer()
{
    // Arrange: an actor subclass whose peer send is redirected to a test probe.
    var peerProbe = CreateTestProbe();
    var fetcher = new FakeConfigFetcher(_ => Task.FromResult("unused"));
    var props = Props.Create(() => new ProbeForwardingReplicationActor(
        _storage, _sfStorage, _replicationService, SiteRole,
        NullLogger<SiteReplicationActor>.Instance, fetcher, peerProbe.Ref));
    var active = ActorOf(props);
    var replicate = new ReplicateConfigDeploy(
        "Pump2", "dep-200", "sha256:def", false,
        "http://central:9000", "tok-abc");

    // Act
    active.Tell(replicate);

    // Assert: the peer sees the mapped message with every field carried over.
    var forwarded = peerProbe.ExpectMsg<ApplyConfigDeploy>(TimeSpan.FromSeconds(3));
    Assert.Equal("Pump2", forwarded.InstanceName);
    Assert.Equal("dep-200", forwarded.DeploymentId);
    Assert.Equal("sha256:def", forwarded.RevisionHash);
    Assert.False(forwarded.IsEnabled);
    Assert.Equal("http://central:9000", forwarded.CentralFetchBaseUrl);
    Assert.Equal("tok-abc", forwarded.FetchToken);
}
/// <summary>
/// Test-only <see cref="SiteReplicationActor"/> that redirects
/// <see cref="SiteReplicationActor.SendToPeer"/> to a probe, so the outbound message can be
/// asserted without a real two-node cluster (a single-node TestKit has no peer address, so
/// the real send is dropped).
/// </summary>
private sealed class ProbeForwardingReplicationActor : SiteReplicationActor
{
    private readonly IActorRef _probe;

    public ProbeForwardingReplicationActor(
        SiteStorageService storage,
        StoreAndForwardStorage sfStorage,
        ReplicationService replicationService,
        string siteRole,
        ILogger<SiteReplicationActor> logger,
        IDeploymentConfigFetcher configFetcher,
        IActorRef peerProbe)
        : base(storage, sfStorage, replicationService, siteRole, logger, configFetcher)
    {
        _probe = peerProbe;
    }

    /// <summary>Hands the would-be peer message to the probe, with this actor as sender.</summary>
    protected override void SendToPeer(object message)
    {
        _probe.Tell(message, Self);
    }
}
/// <summary>
/// In-test fake <see cref="IDeploymentConfigFetcher"/>. Each fetch is recorded in
/// <see cref="Calls"/> (a concurrent queue, so recording is safe from any thread) and then
/// answered by the delegate supplied at construction, which receives the deployment id and
/// returns a task that yields config JSON or faults.
/// </summary>
private sealed class FakeConfigFetcher : IDeploymentConfigFetcher
{
    private readonly Func<string, Task<string>> _respond;

    public FakeConfigFetcher(Func<string, Task<string>> behavior)
    {
        _respond = behavior;
    }

    /// <summary>Coordinates of every fetch made so far, in call order.</summary>
    public ConcurrentQueue<(string BaseUrl, string DeploymentId, string Token)> Calls { get; } = new();

    /// <summary>
    /// Records the call, yields once so completion is asynchronous, then returns whatever
    /// the configured delegate produces for <paramref name="deploymentId"/>. The
    /// cancellation token is accepted for interface compatibility and not observed.
    /// </summary>
    public async Task<string> FetchAsync(
        string centralFetchBaseUrl, string deploymentId, string token, CancellationToken ct)
    {
        var call = (centralFetchBaseUrl, deploymentId, token);
        Calls.Enqueue(call);

        await Task.Yield();

        var pending = _respond(deploymentId);
        return await pending;
    }
}
}