Add LMDB oplog migration path with dual-write cutover support
All checks were successful
NuGet Package Publish / nuget (push) Successful in 1m16s

Introduce LMDB oplog store, migration flags, telemetry/backfill tooling, and parity tests to enable staged Surreal-to-LMDB rollout with rollback coverage.
This commit is contained in:
Joseph Doherty
2026-02-22 17:44:57 -05:00
parent 3b9ff69adc
commit cce24fa8f3
16 changed files with 3601 additions and 6 deletions

View File

@@ -10,6 +10,7 @@ using ZB.MOM.WW.CBDDC.Core.Storage;
using ZB.MOM.WW.CBDDC.Core.Sync;
using ZB.MOM.WW.CBDDC.Network;
using ZB.MOM.WW.CBDDC.Network.Security;
using ZB.MOM.WW.CBDDC.Persistence.Lmdb;
using ZB.MOM.WW.CBDDC.Persistence.Surreal;
namespace ZB.MOM.WW.CBDDC.E2E.Tests;
@@ -240,6 +241,92 @@ public class ClusterCrudSyncE2ETests
}, 60, "Node B did not catch up missed reconnect mutations.", () => BuildDiagnostics(nodeA, nodeB));
}
/// <summary>
/// Ensures that a peer which was stopped while several writes happened converges to the latest document
/// state after it is started again, with both nodes configured for the LMDB oplog, dual-write, and
/// LMDB-preferred reads.
/// </summary>
[Fact]
public async Task PeerReconnect_ShouldCatchUpMissedChanges_WithLmdbPreferredReads()
{
    const string loopbackHost = "127.0.0.1";
    const string userId = "reconnect-lmdb-user";

    // Builds the single known-peer entry each node uses to reach its counterpart.
    KnownPeerConfiguration PeerAt(string peerId, int peerPort)
    {
        return new KnownPeerConfiguration
        {
            NodeId = peerId,
            Host = loopbackHost,
            Port = peerPort
        };
    }

    var clusterToken = Guid.NewGuid().ToString("N");
    int nodeAPort = GetAvailableTcpPort();

    // Keep asking for a port until node B gets one that differs from node A's.
    int nodeBPort;
    do
    {
        nodeBPort = GetAvailableTcpPort();
    } while (nodeBPort == nodeAPort);

    await using var nodeA = TestPeerNode.Create(
        "node-a",
        nodeAPort,
        clusterToken,
        [PeerAt("node-b", nodeBPort)],
        useLmdbOplog: true,
        dualWriteOplog: true,
        preferLmdbReads: true);

    await using var nodeB = TestPeerNode.Create(
        "node-b",
        nodeBPort,
        clusterToken,
        [PeerAt("node-a", nodeAPort)],
        useLmdbOplog: true,
        dualWriteOplog: true,
        preferLmdbReads: true);

    await nodeA.StartAsync();
    await nodeB.StartAsync();

    // Take node B down so the writes below happen while it is stopped.
    await nodeB.StopAsync();

    // Successive revisions of the same user, applied on node A in this exact order.
    (string Name, int Age, string City)[] offlineRevisions =
    [
        ("Offline Create", 20, "Rome"),
        ("Offline Update", 21, "Milan"),
        ("Offline Final", 22, "Turin")
    ];

    foreach (var revision in offlineRevisions)
    {
        await nodeA.UpsertUserAsync(new User
        {
            Id = userId,
            Name = revision.Name,
            Age = revision.Age,
            Address = new Address { City = revision.City }
        });
    }

    await nodeB.StartAsync();

    // Node B must end up with the last revision only.
    await AssertEventuallyAsync(
        () => nodeB.ReadUser(userId) is { Name: "Offline Final", Age: 22, Address.City: "Turin" },
        60,
        "Node B did not catch up missed reconnect mutations with LMDB preferred reads.",
        () => BuildDiagnostics(nodeA, nodeB));
}
/// <summary>
/// Verifies a burst of rapid multi-node mutations converges to a deterministic final state.
/// </summary>
@@ -572,6 +659,9 @@ public class ClusterCrudSyncE2ETests
/// <param name="workDirOverride">An optional working directory override for test artifacts.</param>
/// <param name="preserveWorkDirOnDispose">A value indicating whether to preserve the working directory on dispose.</param>
/// <param name="useFaultInjectedCheckpointStore">A value indicating whether to inject a checkpoint persistence that fails once.</param>
/// <param name="useLmdbOplog">A value indicating whether to enable the LMDB oplog migration path.</param>
/// <param name="dualWriteOplog">A value indicating whether oplog writes should be mirrored to both Surreal and LMDB.</param>
/// <param name="preferLmdbReads">A value indicating whether reads should prefer LMDB.</param>
/// <returns>A configured <see cref="TestPeerNode" /> instance.</returns>
public static TestPeerNode Create(
string nodeId,
@@ -580,7 +670,10 @@ public class ClusterCrudSyncE2ETests
IReadOnlyList<KnownPeerConfiguration> knownPeers,
string? workDirOverride = null,
bool preserveWorkDirOnDispose = false,
bool useFaultInjectedCheckpointStore = false)
bool useFaultInjectedCheckpointStore = false,
bool useLmdbOplog = false,
bool dualWriteOplog = true,
bool preferLmdbReads = false)
{
string workDir = workDirOverride ?? Path.Combine(Path.GetTempPath(), $"cbddc-e2e-{nodeId}-{Guid.NewGuid():N}");
Directory.CreateDirectory(workDir);
@@ -620,13 +713,47 @@ public class ClusterCrudSyncE2ETests
if (useFaultInjectedCheckpointStore)
{
services.AddSingleton<ISurrealCdcCheckpointPersistence, CrashAfterFirstAdvanceCheckpointPersistence>();
coreBuilder.AddCBDDCSurrealEmbedded<FaultInjectedSampleDocumentStore>(surrealOptionsFactory)
.AddCBDDCNetwork<StaticPeerNodeConfigurationProvider>(false);
var registration = coreBuilder.AddCBDDCSurrealEmbedded<FaultInjectedSampleDocumentStore>(surrealOptionsFactory);
if (useLmdbOplog)
registration.AddCBDDCLmdbOplog(
_ => new LmdbOplogOptions
{
EnvironmentPath = Path.Combine(workDir, "oplog-lmdb"),
MapSizeBytes = 128L * 1024 * 1024,
MaxDatabases = 16,
PruneBatchSize = 256
},
flags =>
{
flags.UseLmdbOplog = true;
flags.DualWriteOplog = dualWriteOplog;
flags.PreferLmdbReads = preferLmdbReads;
flags.ReconciliationInterval = TimeSpan.Zero;
});
registration.AddCBDDCNetwork<StaticPeerNodeConfigurationProvider>(false);
}
else
{
coreBuilder.AddCBDDCSurrealEmbedded<SampleDocumentStore>(surrealOptionsFactory)
.AddCBDDCNetwork<StaticPeerNodeConfigurationProvider>(false);
var registration = coreBuilder.AddCBDDCSurrealEmbedded<SampleDocumentStore>(surrealOptionsFactory);
if (useLmdbOplog)
registration.AddCBDDCLmdbOplog(
_ => new LmdbOplogOptions
{
EnvironmentPath = Path.Combine(workDir, "oplog-lmdb"),
MapSizeBytes = 128L * 1024 * 1024,
MaxDatabases = 16,
PruneBatchSize = 256
},
flags =>
{
flags.UseLmdbOplog = true;
flags.DualWriteOplog = dualWriteOplog;
flags.PreferLmdbReads = preferLmdbReads;
flags.ReconciliationInterval = TimeSpan.Zero;
});
registration.AddCBDDCNetwork<StaticPeerNodeConfigurationProvider>(false);
}
// Deterministic tests: sync uses explicit known peers, so disable UDP discovery.