Replace BLite with Surreal embedded persistence
All checks were successful
NuGet Package Publish / nuget (push) Successful in 1m21s

This commit is contained in:
Joseph Doherty
2026-02-22 05:21:53 -05:00
parent 7ebc2cb567
commit 9c2a77dc3c
56 changed files with 6613 additions and 3177 deletions

View File

@@ -7,9 +7,10 @@ using Microsoft.Extensions.Logging;
using ZB.MOM.WW.CBDDC.Core;
using ZB.MOM.WW.CBDDC.Core.Network;
using ZB.MOM.WW.CBDDC.Core.Storage;
using ZB.MOM.WW.CBDDC.Core.Sync;
using ZB.MOM.WW.CBDDC.Network;
using ZB.MOM.WW.CBDDC.Network.Security;
using ZB.MOM.WW.CBDDC.Persistence.BLite;
using ZB.MOM.WW.CBDDC.Persistence.Surreal;
namespace ZB.MOM.WW.CBDDC.E2E.Tests;
@@ -26,7 +27,7 @@ public class ClusterCrudSyncE2ETests
int nodeBPort = GetAvailableTcpPort();
while (nodeBPort == nodeAPort) nodeBPort = GetAvailableTcpPort();
await using var nodeA = TestPeerNode.Create(
var nodeA = TestPeerNode.Create(
"node-a",
nodeAPort,
clusterToken,
@@ -160,6 +161,287 @@ public class ClusterCrudSyncE2ETests
() => BuildDiagnostics(nodeA, nodeB));
}
/// <summary>
/// Verifies a reconnecting peer catches up mutations that happened while it was offline.
/// </summary>
[Fact]
public async Task PeerReconnect_ShouldCatchUpMissedChanges()
{
    var clusterToken = Guid.NewGuid().ToString("N");
    int nodeAPort = GetAvailableTcpPort();
    int nodeBPort = GetAvailableTcpPort();
    while (nodeBPort == nodeAPort) nodeBPort = GetAvailableTcpPort();

    // FIX: nodeA was a plain local and was never disposed, leaking its temp work
    // directory and network resources. `await using` matches nodeB's lifetime handling.
    await using var nodeA = TestPeerNode.Create(
        "node-a",
        nodeAPort,
        clusterToken,
        [
            new KnownPeerConfiguration
            {
                NodeId = "node-b",
                Host = "127.0.0.1",
                Port = nodeBPort
            }
        ]);
    await using var nodeB = TestPeerNode.Create(
        "node-b",
        nodeBPort,
        clusterToken,
        [
            new KnownPeerConfiguration
            {
                NodeId = "node-a",
                Host = "127.0.0.1",
                Port = nodeAPort
            }
        ]);

    await nodeA.StartAsync();
    await nodeB.StartAsync();

    // Take node B offline so the following mutations occur while it cannot replicate.
    await nodeB.StopAsync();

    const string userId = "reconnect-user";
    await nodeA.UpsertUserAsync(new User
    {
        Id = userId,
        Name = "Offline Create",
        Age = 20,
        Address = new Address { City = "Rome" }
    });
    await nodeA.UpsertUserAsync(new User
    {
        Id = userId,
        Name = "Offline Update",
        Age = 21,
        Address = new Address { City = "Milan" }
    });
    await nodeA.UpsertUserAsync(new User
    {
        Id = userId,
        Name = "Offline Final",
        Age = 22,
        Address = new Address { City = "Turin" }
    });

    // Reconnect node B; it must converge to the last write only, not the interim states.
    await nodeB.StartAsync();

    await AssertEventuallyAsync(() =>
    {
        var replicated = nodeB.ReadUser(userId);
        return replicated is not null &&
               replicated.Name == "Offline Final" &&
               replicated.Age == 22 &&
               replicated.Address?.City == "Turin";
    }, 60, "Node B did not catch up missed reconnect mutations.", () => BuildDiagnostics(nodeA, nodeB));
}
/// <summary>
/// Verifies a burst of rapid multi-node mutations converges to a deterministic final state.
/// </summary>
[Fact]
public async Task MultiChangeBurst_ShouldConvergeDeterministically()
{
    var clusterToken = Guid.NewGuid().ToString("N");
    int nodeAPort = GetAvailableTcpPort();
    int nodeBPort = GetAvailableTcpPort();
    while (nodeBPort == nodeAPort) nodeBPort = GetAvailableTcpPort();

    await using var nodeA = TestPeerNode.Create(
        "node-a",
        nodeAPort,
        clusterToken,
        [
            new KnownPeerConfiguration
            {
                NodeId = "node-b",
                Host = "127.0.0.1",
                Port = nodeBPort
            }
        ]);
    await using var nodeB = TestPeerNode.Create(
        "node-b",
        nodeBPort,
        clusterToken,
        [
            new KnownPeerConfiguration
            {
                NodeId = "node-a",
                Host = "127.0.0.1",
                Port = nodeAPort
            }
        ]);

    await nodeA.StartAsync();
    await nodeB.StartAsync();

    const int burstCount = 8;

    // Interleave writes on both peers to maximize concurrent replication traffic.
    for (var index = 0; index < burstCount; index++)
    {
        await nodeA.UpsertUserAsync(new User
        {
            Id = $"burst-a-{index:D2}",
            Name = $"A-{index}",
            Age = 30 + index,
            Address = new Address { City = $"CityA-{index}" }
        });
        await nodeB.UpsertUserAsync(new User
        {
            Id = $"burst-b-{index:D2}",
            Name = $"B-{index}",
            Age = 40 + index,
            Address = new Address { City = $"CityB-{index}" }
        });
    }

    // Gate 1: both peers eventually hold every document from both sides of the burst.
    await AssertEventuallyAsync(
        () => nodeA.Context.Users.FindAll().Count() == burstCount * 2 &&
              nodeB.Context.Users.FindAll().Count() == burstCount * 2,
        60,
        "Burst convergence did not reach expected document counts.",
        () => BuildDiagnostics(nodeA, nodeB));

    // Gate 2: cross-replicated content matches what each peer originally wrote.
    await AssertEventuallyAsync(() =>
    {
        return Enumerable.Range(0, burstCount).All(index =>
        {
            var aOnB = nodeB.ReadUser($"burst-a-{index:D2}");
            var bOnA = nodeA.ReadUser($"burst-b-{index:D2}");
            return aOnB is not null && bOnA is not null &&
                   aOnB.Name == $"A-{index}" && bOnA.Name == $"B-{index}";
        });
    }, 60, "Burst convergence content mismatch.", () => BuildDiagnostics(nodeA, nodeB));
}
/// <summary>
/// Verifies recovery safety when a process crashes after oplog commit but before checkpoint advance.
/// </summary>
[Fact]
public async Task CrashBetweenOplogAndCheckpoint_ShouldReplaySafelyOnRestart()
{
var clusterToken = Guid.NewGuid().ToString("N");
int nodeAPort = GetAvailableTcpPort();
int nodeBPort = GetAvailableTcpPort();
while (nodeBPort == nodeAPort) nodeBPort = GetAvailableTcpPort();
// Node A's work dir is created here (not inside TestPeerNode) and preserved on
// dispose so the restarted incarnation below reopens the same persisted state.
string sharedWorkDir = Path.Combine(Path.GetTempPath(), $"cbddc-e2e-crash-{Guid.NewGuid():N}");
Directory.CreateDirectory(sharedWorkDir);
// useFaultInjectedCheckpointStore wires CrashAfterFirstAdvanceCheckpointPersistence,
// which throws once on the first checkpoint advance — the simulated crash window.
await using var nodeA = TestPeerNode.Create(
"node-a",
nodeAPort,
clusterToken,
[
new KnownPeerConfiguration
{
NodeId = "node-b",
Host = "127.0.0.1",
Port = nodeBPort
}
],
workDirOverride: sharedWorkDir,
preserveWorkDirOnDispose: true,
useFaultInjectedCheckpointStore: true);
// nodeA is disposed manually mid-test (to simulate the crashed process exiting);
// this flag keeps the finally block from double-disposing it.
bool nodeADisposed = false;
try
{
await using var nodeB = TestPeerNode.Create(
"node-b",
nodeBPort,
clusterToken,
[
new KnownPeerConfiguration
{
NodeId = "node-a",
Host = "127.0.0.1",
Port = nodeAPort
}
]);
await nodeA.StartAsync();
await nodeB.StartAsync();
const string userId = "crash-window-user";
var payload = new User
{
Id = userId,
Name = "Crash Recovered",
Age = 45,
Address = new Address { City = "Naples" }
};
// The injected fault surfaces as InvalidOperationException from the upsert:
// the oplog entry commits, but the checkpoint advance "crashes".
await Should.ThrowAsync<InvalidOperationException>(() => nodeA.UpsertUserAsync(payload));
// Despite the failed call, the write and exactly one local oplog entry must
// already be durable on node A — that is the state a real crash would leave.
nodeA.ReadUser(userId).ShouldNotBeNull();
nodeA.GetLocalOplogCountForKey("Users", userId).ShouldBe(1);
// Simulate process death: stop and fully dispose the first incarnation.
await nodeA.StopAsync();
await nodeA.DisposeAsync();
nodeADisposed = true;
TestPeerNode? recoveredNodeA = null;
// RocksDB may release its file lock asynchronously after dispose; retry the
// restart briefly (up to 10 attempts, 100 ms apart) on lock contention only.
for (var attempt = 0; attempt < 10; attempt++)
try
{
recoveredNodeA = TestPeerNode.Create(
"node-a",
nodeAPort,
clusterToken,
[
new KnownPeerConfiguration
{
NodeId = "node-b",
Host = "127.0.0.1",
Port = nodeBPort
}
],
workDirOverride: sharedWorkDir);
break;
}
catch (Exception ex) when (IsRocksDbLockContention(ex) && attempt < 9)
{
await Task.Delay(100);
}
recoveredNodeA.ShouldNotBeNull();
await using (recoveredNodeA)
{
await recoveredNodeA.StartAsync();
// After restart, replay must deliver the committed mutation to node B...
await AssertEventuallyAsync(() =>
{
var replicated = nodeB.ReadUser(userId);
return replicated is not null
&& replicated.Name == payload.Name
&& replicated.Age == payload.Age
&& replicated.Address?.City == payload.Address?.City;
}, 60, "Node B did not converge after crash-window recovery.", () => BuildDiagnostics(recoveredNodeA, nodeB));
// ...and replay must be idempotent: still exactly one oplog entry per node.
await AssertEventuallyAsync(
() => recoveredNodeA.GetOplogCountForKey("Users", userId) == 1 &&
nodeB.GetOplogCountForKey("Users", userId) == 1,
60,
"Crash-window recovery created duplicate oplog entries.",
() => BuildDiagnostics(recoveredNodeA, nodeB));
}
}
finally
{
// Only dispose nodeA here if the test failed before the manual dispose above.
if (!nodeADisposed) await nodeA.DisposeAsync();
}
}
private static async Task AssertEventuallyAsync(
Func<bool> predicate,
int timeoutSeconds,
@@ -222,6 +504,11 @@ public class ClusterCrudSyncE2ETests
return ((IPEndPoint)listener.LocalEndpoint).Port;
}
/// <summary>
/// Returns true when the exception (or any inner exception rendered by ToString)
/// carries the RocksDB file-lock contention message seen during rapid restarts.
/// </summary>
private static bool IsRocksDbLockContention(Exception exception)
{
    const string lockContentionFragment = "No locks available";
    string rendered = exception.ToString();
    return rendered.Contains(lockContentionFragment, StringComparison.OrdinalIgnoreCase);
}
private sealed class TestPeerNode : IAsyncDisposable
{
private readonly InMemoryLogSink _logSink;
@@ -230,6 +517,7 @@ public class ClusterCrudSyncE2ETests
private readonly IOplogStore _oplogStore;
private readonly ServiceProvider _services;
private readonly string _workDir;
private readonly bool _preserveWorkDirOnDispose;
private long _lastPhysicalTime = DateTimeOffset.UtcNow.ToUnixTimeMilliseconds();
private int _logicalCounter;
private bool _started;
@@ -241,7 +529,8 @@ public class ClusterCrudSyncE2ETests
SampleDbContext context,
InMemoryLogSink logSink,
string workDir,
string nodeId)
string nodeId,
bool preserveWorkDirOnDispose)
{
_services = services;
_node = node;
@@ -250,10 +539,11 @@ public class ClusterCrudSyncE2ETests
_logSink = logSink;
_workDir = workDir;
_nodeId = nodeId;
_preserveWorkDirOnDispose = preserveWorkDirOnDispose;
}
/// <summary>
/// Gets the LiteDB-backed context used by this test peer.
/// Gets the Surreal-backed context used by this test peer.
/// </summary>
public SampleDbContext Context { get; }
@@ -269,7 +559,7 @@ public class ClusterCrudSyncE2ETests
}
_services.Dispose();
TryDeleteDirectory(_workDir);
if (!_preserveWorkDirOnDispose) TryDeleteDirectory(_workDir);
}
/// <summary>
@@ -284,11 +574,15 @@ public class ClusterCrudSyncE2ETests
string nodeId,
int tcpPort,
string authToken,
IReadOnlyList<KnownPeerConfiguration> knownPeers)
IReadOnlyList<KnownPeerConfiguration> knownPeers,
string? workDirOverride = null,
bool preserveWorkDirOnDispose = false,
bool useFaultInjectedCheckpointStore = false)
{
string workDir = Path.Combine(Path.GetTempPath(), $"cbddc-e2e-{nodeId}-{Guid.NewGuid():N}");
string workDir = workDirOverride ?? Path.Combine(Path.GetTempPath(), $"cbddc-e2e-{nodeId}-{Guid.NewGuid():N}");
Directory.CreateDirectory(workDir);
string dbPath = Path.Combine(workDir, "node.blite");
string dbPath = Path.Combine(workDir, "node.rocksdb");
string surrealDatabase = nodeId.Replace("-", "_", StringComparison.Ordinal);
var configProvider = new StaticPeerNodeConfigurationProvider(new PeerNodeConfiguration
{
@@ -304,9 +598,33 @@ public class ClusterCrudSyncE2ETests
services.AddLogging(builder => builder.SetMinimumLevel(LogLevel.Debug));
services.AddSingleton(configProvider);
services.AddSingleton<IPeerNodeConfigurationProvider>(configProvider);
services.AddCBDDCCore()
.AddCBDDCBLite<SampleDbContext, SampleDocumentStore>(_ => new SampleDbContext(dbPath))
.AddCBDDCNetwork<StaticPeerNodeConfigurationProvider>(false);
services.AddSingleton<ICBDDCSurrealSchemaInitializer, SampleSurrealSchemaInitializer>();
services.AddSingleton<SampleDbContext>();
var surrealOptionsFactory = new Func<IServiceProvider, CBDDCSurrealEmbeddedOptions>(_ => new CBDDCSurrealEmbeddedOptions
{
Endpoint = "rocksdb://local",
DatabasePath = dbPath,
Namespace = "cbddc_e2e",
Database = surrealDatabase,
Cdc = new CBDDCSurrealCdcOptions
{
Enabled = true,
ConsumerId = $"{nodeId}-main"
}
});
var coreBuilder = services.AddCBDDCCore();
if (useFaultInjectedCheckpointStore)
{
services.AddSingleton<ISurrealCdcCheckpointPersistence, CrashAfterFirstAdvanceCheckpointPersistence>();
coreBuilder.AddCBDDCSurrealEmbedded<FaultInjectedSampleDocumentStore>(surrealOptionsFactory)
.AddCBDDCNetwork<StaticPeerNodeConfigurationProvider>(false);
}
else
{
coreBuilder.AddCBDDCSurrealEmbedded<SampleDocumentStore>(surrealOptionsFactory)
.AddCBDDCNetwork<StaticPeerNodeConfigurationProvider>(false);
}
// Deterministic tests: sync uses explicit known peers, so disable UDP discovery.
services.AddSingleton<IDiscoveryService, PassiveDiscoveryService>();
@@ -317,7 +635,15 @@ public class ClusterCrudSyncE2ETests
var oplogStore = provider.GetRequiredService<IOplogStore>();
var context = provider.GetRequiredService<SampleDbContext>();
var logSink = provider.GetRequiredService<InMemoryLogSink>();
return new TestPeerNode(provider, node, oplogStore, context, logSink, workDir, nodeId);
return new TestPeerNode(
provider,
node,
oplogStore,
context,
logSink,
workDir,
nodeId,
preserveWorkDirOnDispose);
}
/// <summary>
@@ -340,7 +666,17 @@ public class ClusterCrudSyncE2ETests
{
if (!_started) return;
await _node.Stop();
try
{
await _node.Stop();
}
catch (ObjectDisposedException)
{
}
catch (AggregateException ex) when (ex.InnerExceptions.All(e => e is ObjectDisposedException))
{
}
_started = false;
}
@@ -354,6 +690,23 @@ public class ClusterCrudSyncE2ETests
return Context.Users.Find(u => u.Id == userId).FirstOrDefault();
}
/// <summary>
/// Counts oplog entries for the given collection/key whose timestamp node id
/// matches this peer, i.e. entries that originated locally.
/// </summary>
public int GetLocalOplogCountForKey(string collection, string key)
{
    var localEntries = Context.OplogEntries.FindAll()
        .Where(e => string.Equals(e.Collection, collection, StringComparison.Ordinal))
        .Where(e => string.Equals(e.Key, key, StringComparison.Ordinal))
        .Where(e => string.Equals(e.TimestampNodeId, _nodeId, StringComparison.Ordinal));
    return localEntries.Count();
}
/// <summary>
/// Counts all oplog entries for the given collection/key regardless of which
/// node originated them.
/// </summary>
public int GetOplogCountForKey(string collection, string key)
{
    var matching = Context.OplogEntries.FindAll()
        .Where(e => string.Equals(e.Collection, collection, StringComparison.Ordinal))
        .Where(e => string.Equals(e.Key, key, StringComparison.Ordinal));
    return matching.Count();
}
/// <summary>
/// Inserts or updates a user and persists the matching oplog entry.
/// </summary>
@@ -466,6 +819,183 @@ public class ClusterCrudSyncE2ETests
}
}
/// <summary>
/// Test document store mirroring the production SampleDocumentStore mapping
/// (Users/TodoLists) but configured for the crash-window test: CDC polling is
/// disabled and the injected ISurrealCdcCheckpointPersistence supplies the fault.
/// NOTE(review): despite the name, this class contains no fault logic itself —
/// the "crash" comes from the checkpointPersistence dependency; confirm against
/// CrashAfterFirstAdvanceCheckpointPersistence registration in TestPeerNode.Create.
/// </summary>
private sealed class FaultInjectedSampleDocumentStore : SurrealDocumentStore<SampleDbContext>
{
// Collection names as used in oplog entries and WatchCollection registrations.
private const string UsersCollection = "Users";
private const string TodoListsCollection = "TodoLists";
public FaultInjectedSampleDocumentStore(
SampleDbContext context,
IPeerNodeConfigurationProvider configProvider,
IVectorClockService vectorClockService,
ISurrealCdcCheckpointPersistence checkpointPersistence,
ILogger<FaultInjectedSampleDocumentStore>? logger = null)
: base(
context,
context.SurrealEmbeddedClient,
context.SchemaInitializer,
configProvider,
vectorClockService,
new LastWriteWinsConflictResolver(),
checkpointPersistence,
// CDC polling and the live-select accelerator are off: the test drives
// replication explicitly, keeping the crash window deterministic.
new SurrealCdcPollingOptions
{
Enabled = false,
EnableLiveSelectAccelerator = false
},
logger)
{
// Register both synced collections with their key selectors.
WatchCollection(UsersCollection, context.Users, u => u.Id);
WatchCollection(TodoListsCollection, context.TodoLists, t => t.Id);
}
/// <summary>Applies incoming document content for a single key via upsert.</summary>
protected override async Task ApplyContentToEntityAsync(
string collection,
string key,
JsonElement content,
CancellationToken cancellationToken)
{
await UpsertEntityAsync(collection, key, content, cancellationToken);
}
/// <summary>
/// Applies a batch of incoming documents sequentially; each item is an
/// independent upsert (no batching optimization in the test store).
/// </summary>
protected override async Task ApplyContentToEntitiesBatchAsync(
IEnumerable<(string Collection, string Key, JsonElement Content)> documents,
CancellationToken cancellationToken)
{
foreach ((string collection, string key, var content) in documents)
await UpsertEntityAsync(collection, key, content, cancellationToken);
}
/// <summary>
/// Serializes the current entity for a key to JSON, or null when the
/// collection is unknown or the entity does not exist.
/// </summary>
protected override async Task<JsonElement?> GetEntityAsJsonAsync(
string collection,
string key,
CancellationToken cancellationToken)
{
return collection switch
{
UsersCollection => SerializeEntity(await _context.Users.FindByIdAsync(key, cancellationToken)),
TodoListsCollection => SerializeEntity(await _context.TodoLists.FindByIdAsync(key, cancellationToken)),
_ => null
};
}
/// <summary>Removes a single entity for the given collection/key.</summary>
protected override async Task RemoveEntityAsync(
string collection,
string key,
CancellationToken cancellationToken)
{
await DeleteEntityAsync(collection, key, cancellationToken);
}
/// <summary>Removes a batch of entities sequentially, one delete per document.</summary>
protected override async Task RemoveEntitiesBatchAsync(
IEnumerable<(string Collection, string Key)> documents,
CancellationToken cancellationToken)
{
foreach ((string collection, string key) in documents)
await DeleteEntityAsync(collection, key, cancellationToken);
}
/// <summary>
/// Returns every entity in a collection as (key, JSON) pairs; unknown
/// collections yield an empty result.
/// </summary>
protected override async Task<IEnumerable<(string Key, JsonElement Content)>> GetAllEntitiesAsJsonAsync(
string collection,
CancellationToken cancellationToken)
{
return collection switch
{
// SerializeEntity only returns null for a null entity, so !.Value is
// safe here: FindAllAsync results are non-null instances.
UsersCollection => (await _context.Users.FindAllAsync(cancellationToken))
.Select(u => (u.Id, SerializeEntity(u)!.Value))
.ToList(),
TodoListsCollection => (await _context.TodoLists.FindAllAsync(cancellationToken))
.Select(t => (t.Id, SerializeEntity(t)!.Value))
.ToList(),
_ => []
};
}
/// <summary>
/// Deserializes content into the collection's entity type and inserts or
/// updates it; the oplog key always wins over any id embedded in the payload.
/// </summary>
private async Task UpsertEntityAsync(
string collection,
string key,
JsonElement content,
CancellationToken cancellationToken)
{
switch (collection)
{
case UsersCollection:
var user = content.Deserialize<User>() ??
throw new InvalidOperationException("Failed to deserialize user.");
// Force the id to the oplog key in case the payload disagrees.
user.Id = key;
if (await _context.Users.FindByIdAsync(key, cancellationToken) == null)
await _context.Users.InsertAsync(user, cancellationToken);
else
await _context.Users.UpdateAsync(user, cancellationToken);
break;
case TodoListsCollection:
var todo = content.Deserialize<TodoList>() ??
throw new InvalidOperationException("Failed to deserialize todo list.");
todo.Id = key;
if (await _context.TodoLists.FindByIdAsync(key, cancellationToken) == null)
await _context.TodoLists.InsertAsync(todo, cancellationToken);
else
await _context.TodoLists.UpdateAsync(todo, cancellationToken);
break;
default:
// Upserts into unknown collections are a programming error in the test.
throw new NotSupportedException($"Collection '{collection}' is not supported for sync.");
}
}
/// <summary>
/// Deletes the entity for a key; unknown collections are ignored silently
/// (unlike upserts, which throw) — deliberate asymmetry in the original code.
/// </summary>
private async Task DeleteEntityAsync(string collection, string key, CancellationToken cancellationToken)
{
switch (collection)
{
case UsersCollection:
await _context.Users.DeleteAsync(key, cancellationToken);
break;
case TodoListsCollection:
await _context.TodoLists.DeleteAsync(key, cancellationToken);
break;
}
}
/// <summary>Serializes an entity to a JsonElement, or null when the entity is null.</summary>
private static JsonElement? SerializeEntity<T>(T? entity) where T : class
{
return entity == null ? null : JsonSerializer.SerializeToElement(entity);
}
}
/// <summary>
/// Checkpoint persistence stub that throws exactly once — on the first checkpoint
/// advance — to simulate a process crash between oplog commit and checkpoint update.
/// All other operations are no-ops that report no stored checkpoint.
/// </summary>
private sealed class CrashAfterFirstAdvanceCheckpointPersistence : ISurrealCdcCheckpointPersistence
{
    // 1 until the first advance attempt consumes it atomically; 0 afterwards.
    private int _failOnNextAdvance = 1;

    public Task<SurrealCdcCheckpoint?> GetCheckpointAsync(
        string? consumerId = null,
        CancellationToken cancellationToken = default) =>
        Task.FromResult<SurrealCdcCheckpoint?>(null);

    public Task UpsertCheckpointAsync(
        HlcTimestamp timestamp,
        string lastHash,
        string? consumerId = null,
        CancellationToken cancellationToken = default,
        long? versionstampCursor = null) =>
        Task.CompletedTask;

    public Task AdvanceCheckpointAsync(
        OplogEntry entry,
        string? consumerId = null,
        CancellationToken cancellationToken = default)
    {
        // Atomically flip the flag so exactly one caller observes the failure,
        // even under concurrent advances.
        bool isFirstAdvance = Interlocked.CompareExchange(ref _failOnNextAdvance, 0, 1) == 1;
        if (isFirstAdvance)
            throw new InvalidOperationException("Injected crash between oplog commit and checkpoint advance.");
        return Task.CompletedTask;
    }
}
private sealed class PassiveDiscoveryService : IDiscoveryService
{
/// <inheritdoc />
@@ -637,4 +1167,4 @@ public class ClusterCrudSyncE2ETests
{
}
}
}
}