Resolve code-review findings Server-033..037
Server-033 (Medium): TryRestoreFromDiskAsync now completes the _firstLoad gate once the restored snapshot is published, so a browse call racing the first refresh is served immediately instead of waiting out the 5s bootstrap budget while an unreachable-database query runs. Server-034 (Low): GalaxyHierarchySnapshotStore.TryLoadAsync catches JsonException / IOException / UnauthorizedAccessException and returns null, honoring the Try contract for a corrupt or unreadable snapshot file. Server-035 (Low): SaveAsync bounds the write with a linked CancellationToken (CommandTimeoutSeconds budget) so a stuck disk cannot pin the refresh loop. Server-036 (Low): PersistSnapshotAsync no longer logs a save cancelled by gateway shutdown as a persistence failure. Server-037 (Low): added cache tests for the corrupt-snapshot restore path and for PersistSnapshot=false, plus a store test for corrupt JSON. All 100 Galaxy tests pass; gateway builds clean. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -306,6 +306,12 @@ public sealed class GalaxyHierarchyCache : IGalaxyHierarchyCache
|
||||
attributes: snapshot.Attributes);
|
||||
Volatile.Write(ref _current, restored);
|
||||
|
||||
// Restored data is a valid completed first load: unblock callers waiting on
|
||||
// the bootstrap gate immediately, rather than making them wait out the full
|
||||
// wait budget for a live query that — when the database is unreachable, the
|
||||
// scenario this restore exists for — may not return for seconds.
|
||||
_firstLoad.TrySetResult();
|
||||
|
||||
_notifier.Publish(new GalaxyDeployEventInfo(
|
||||
Sequence: sequence,
|
||||
ObservedAt: _timeProvider.GetUtcNow(),
|
||||
@@ -342,6 +348,11 @@ public sealed class GalaxyHierarchyCache : IGalaxyHierarchyCache
|
||||
new GalaxyHierarchySnapshot(deployTime, savedAt, hierarchy, attributes),
|
||||
cancellationToken).ConfigureAwait(false);
|
||||
}
|
||||
catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested)
|
||||
{
|
||||
// The refresh was cancelled (gateway shutdown) before the write finished.
|
||||
// That is not a persistence failure — do not log it as a warning.
|
||||
}
|
||||
catch (Exception exception)
|
||||
{
|
||||
_logger?.LogWarning(exception, "Failed to persist the Galaxy hierarchy snapshot to disk.");
|
||||
|
||||
@@ -27,6 +27,7 @@ public sealed class GalaxyHierarchySnapshotStore : IGalaxyHierarchySnapshotStore
|
||||
};
|
||||
|
||||
private readonly string? _path;
|
||||
private readonly TimeSpan _writeTimeout;
|
||||
private readonly ILogger<GalaxyHierarchySnapshotStore>? _logger;
|
||||
private readonly SemaphoreSlim _ioGate = new(1, 1);
|
||||
|
||||
@@ -41,6 +42,7 @@ public sealed class GalaxyHierarchySnapshotStore : IGalaxyHierarchySnapshotStore
|
||||
_path = value.PersistSnapshot && !string.IsNullOrWhiteSpace(value.SnapshotCachePath)
|
||||
? value.SnapshotCachePath
|
||||
: null;
|
||||
_writeTimeout = TimeSpan.FromSeconds(Math.Max(1, value.CommandTimeoutSeconds));
|
||||
_logger = logger;
|
||||
}
|
||||
|
||||
@@ -58,6 +60,12 @@ public sealed class GalaxyHierarchySnapshotStore : IGalaxyHierarchySnapshotStore
|
||||
await _ioGate.WaitAsync(cancellationToken).ConfigureAwait(false);
|
||||
try
|
||||
{
|
||||
// Bound the write so a stuck disk — e.g. a SnapshotCachePath on an
|
||||
// unresponsive network share — cannot stall the caller. On the cache
|
||||
// refresh path that would otherwise pin the whole refresh loop.
|
||||
using CancellationTokenSource writeCts = CancellationTokenSource.CreateLinkedTokenSource(cancellationToken);
|
||||
writeCts.CancelAfter(_writeTimeout);
|
||||
|
||||
string? directory = Path.GetDirectoryName(_path);
|
||||
if (!string.IsNullOrEmpty(directory))
|
||||
{
|
||||
@@ -67,7 +75,7 @@ public sealed class GalaxyHierarchySnapshotStore : IGalaxyHierarchySnapshotStore
|
||||
string tempPath = _path + ".tmp";
|
||||
await using (FileStream stream = new(tempPath, FileMode.Create, FileAccess.Write, FileShare.None))
|
||||
{
|
||||
await JsonSerializer.SerializeAsync(stream, file, SerializerOptions, cancellationToken).ConfigureAwait(false);
|
||||
await JsonSerializer.SerializeAsync(stream, file, SerializerOptions, writeCts.Token).ConfigureAwait(false);
|
||||
}
|
||||
|
||||
File.Move(tempPath, _path, overwrite: true);
|
||||
@@ -111,6 +119,17 @@ public sealed class GalaxyHierarchySnapshotStore : IGalaxyHierarchySnapshotStore
|
||||
|
||||
return file.Snapshot;
|
||||
}
|
||||
catch (Exception exception) when (exception is JsonException or IOException or UnauthorizedAccessException)
|
||||
{
|
||||
// A corrupt, truncated, locked, or access-denied snapshot file is an
|
||||
// expected failure mode for a disk cache — honor the Try contract and
|
||||
// return null rather than throwing.
|
||||
_logger?.LogWarning(
|
||||
exception,
|
||||
"Ignoring Galaxy hierarchy snapshot at {Path}: the file is unreadable or not valid JSON.",
|
||||
_path);
|
||||
return null;
|
||||
}
|
||||
finally
|
||||
{
|
||||
_ioGate.Release();
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
using Microsoft.Extensions.Logging;
|
||||
using Microsoft.Extensions.Options;
|
||||
using MxGateway.Server.Galaxy;
|
||||
using MxGateway.Contracts.Proto.Galaxy;
|
||||
@@ -208,6 +209,108 @@ public sealed class GalaxyHierarchyCacheTests : IDisposable
|
||||
Assert.Equal(0, repository.GetAttributesCount);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Verifies that a restored on-disk snapshot completes the first-load gate
|
||||
/// immediately, so a browse call racing the first refresh is not blocked for
|
||||
/// the full bootstrap budget while the live Galaxy query is still running.
|
||||
/// Regression test for Server-033.
|
||||
/// </summary>
|
||||
[Fact]
|
||||
public async Task RefreshAsync_RestoredSnapshotCompletesFirstLoadBeforeLiveQueryReturns()
|
||||
{
|
||||
GalaxyHierarchySnapshotStore store = CreateStore();
|
||||
await store.SaveAsync(
|
||||
new GalaxyHierarchySnapshot(
|
||||
LastDeployTime: new DateTimeOffset(2026, 5, 20, 9, 0, 0, TimeSpan.Zero),
|
||||
SavedAt: new DateTimeOffset(2026, 5, 20, 9, 1, 0, TimeSpan.Zero),
|
||||
Hierarchy: [SampleHierarchyRow()],
|
||||
Attributes: [SampleAttributeRow()]),
|
||||
CancellationToken.None);
|
||||
|
||||
GalaxyDeployNotifier notifier = new();
|
||||
BlockingGalaxyRepository repository = new();
|
||||
GalaxyHierarchyCache cache = new(repository, notifier, new ManualTimeProvider(), snapshotStore: store);
|
||||
|
||||
Task refresh = cache.RefreshAsync(CancellationToken.None);
|
||||
|
||||
// The live query is blocked inside the repository; first-load must still
|
||||
// complete — from the restored snapshot — well within the wait budget.
|
||||
await cache.WaitForFirstLoadAsync(CancellationToken.None).WaitAsync(TimeSpan.FromSeconds(5));
|
||||
Assert.True(cache.Current.HasData);
|
||||
Assert.Equal(GalaxyCacheStatus.Stale, cache.Current.Status);
|
||||
|
||||
repository.Release();
|
||||
await refresh.WaitAsync(TimeSpan.FromSeconds(5));
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Verifies a corrupt on-disk snapshot does not crash startup: the cache
|
||||
/// ignores the unreadable file and comes up Unavailable when the database is
|
||||
/// also unreachable. Regression test for Server-037.
|
||||
/// </summary>
|
||||
[Fact]
|
||||
public async Task RefreshAsync_WhenSnapshotFileCorrupt_ComesUpUnavailableWithoutThrowing()
|
||||
{
|
||||
string path = CreateTempPath();
|
||||
await File.WriteAllTextAsync(path, "{ this is not valid json");
|
||||
GalaxyHierarchySnapshotStore store = CreateStore(path);
|
||||
|
||||
GalaxyDeployNotifier notifier = new();
|
||||
ThrowingGalaxyRepository repository = new(new InvalidOperationException("Galaxy repository unreachable"));
|
||||
GalaxyHierarchyCache cache = new(repository, notifier, new ManualTimeProvider(), snapshotStore: store);
|
||||
|
||||
await cache.RefreshAsync(CancellationToken.None);
|
||||
|
||||
Assert.Equal(GalaxyCacheStatus.Unavailable, cache.Current.Status);
|
||||
Assert.False(cache.Current.HasData);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Verifies that with snapshot persistence disabled the cache does not
|
||||
/// restore from disk — an unreachable database leaves it Unavailable.
|
||||
/// Regression test for Server-037.
|
||||
/// </summary>
|
||||
[Fact]
|
||||
public async Task RefreshAsync_WhenPersistDisabled_DoesNotRestoreFromDisk()
|
||||
{
|
||||
GalaxyHierarchySnapshotStore store = CreateStore(CreateTempPath(), persist: false);
|
||||
|
||||
GalaxyDeployNotifier notifier = new();
|
||||
ThrowingGalaxyRepository repository = new(new InvalidOperationException("Galaxy repository unreachable"));
|
||||
GalaxyHierarchyCache cache = new(repository, notifier, new ManualTimeProvider(), snapshotStore: store);
|
||||
|
||||
await cache.RefreshAsync(CancellationToken.None);
|
||||
|
||||
Assert.Equal(GalaxyCacheStatus.Unavailable, cache.Current.Status);
|
||||
Assert.False(cache.Current.HasData);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Verifies that a snapshot save aborted because the gateway is shutting down
|
||||
/// (the refresh token is cancelled) is not logged as a persistence failure.
|
||||
/// Regression test for Server-036.
|
||||
/// </summary>
|
||||
[Fact]
|
||||
public async Task RefreshAsync_WhenSnapshotSaveCancelledAtShutdown_DoesNotLogPersistFailure()
|
||||
{
|
||||
using CancellationTokenSource cts = new();
|
||||
GalaxyDeployNotifier notifier = new();
|
||||
StubGalaxyRepository repository = new(
|
||||
deployTime: new DateTime(2026, 5, 20, 9, 0, 0, DateTimeKind.Utc),
|
||||
hierarchy: [SampleHierarchyRow()],
|
||||
attributes: [SampleAttributeRow()]);
|
||||
CancellingSaveStore store = new(cts);
|
||||
RecordingLogger<GalaxyHierarchyCache> logger = new();
|
||||
GalaxyHierarchyCache cache = new(repository, notifier, new ManualTimeProvider(), logger, store);
|
||||
|
||||
await cache.RefreshAsync(cts.Token);
|
||||
|
||||
Assert.DoesNotContain(
|
||||
logger.Entries,
|
||||
entry => entry.Level == LogLevel.Warning
|
||||
&& entry.Message.Contains("persist", StringComparison.OrdinalIgnoreCase));
|
||||
}
|
||||
|
||||
private static GalaxyHierarchyRow SampleHierarchyRow() => new()
|
||||
{
|
||||
GobjectId = 99,
|
||||
@@ -228,20 +331,93 @@ public sealed class GalaxyHierarchyCacheTests : IDisposable
|
||||
DataTypeName = "Float",
|
||||
};
|
||||
|
||||
private GalaxyHierarchySnapshotStore CreateStore()
|
||||
private string CreateTempPath()
|
||||
{
|
||||
string path = Path.Combine(
|
||||
Path.GetTempPath(),
|
||||
$"mxgw-galaxy-cache-test-{Guid.NewGuid():N}.json");
|
||||
_tempPaths.Add(path);
|
||||
return path;
|
||||
}
|
||||
|
||||
private GalaxyHierarchySnapshotStore CreateStore() => CreateStore(CreateTempPath());
|
||||
|
||||
private static GalaxyHierarchySnapshotStore CreateStore(string path, bool persist = true)
|
||||
{
|
||||
GalaxyRepositoryOptions options = new()
|
||||
{
|
||||
PersistSnapshot = true,
|
||||
PersistSnapshot = persist,
|
||||
SnapshotCachePath = path,
|
||||
};
|
||||
return new GalaxyHierarchySnapshotStore(Options.Create(options));
|
||||
}
|
||||
|
||||
/// <summary><see cref="IGalaxyRepository"/> whose deploy-time query blocks until released.</summary>
|
||||
private sealed class BlockingGalaxyRepository : IGalaxyRepository
|
||||
{
|
||||
private readonly TaskCompletionSource _release = new(TaskCreationOptions.RunContinuationsAsynchronously);
|
||||
|
||||
public void Release() => _release.TrySetResult();
|
||||
|
||||
public Task<bool> TestConnectionAsync(CancellationToken ct = default) => Task.FromResult(false);
|
||||
|
||||
public async Task<DateTime?> GetLastDeployTimeAsync(CancellationToken ct = default)
|
||||
{
|
||||
await _release.Task.WaitAsync(ct).ConfigureAwait(false);
|
||||
throw new InvalidOperationException("Galaxy repository unreachable");
|
||||
}
|
||||
|
||||
public Task<List<GalaxyHierarchyRow>> GetHierarchyAsync(CancellationToken ct = default)
|
||||
=> throw new InvalidOperationException("GetHierarchyAsync should not be reached");
|
||||
|
||||
public Task<List<GalaxyAttributeRow>> GetAttributesAsync(CancellationToken ct = default)
|
||||
=> throw new InvalidOperationException("GetAttributesAsync should not be reached");
|
||||
}
|
||||
|
||||
/// <summary>Snapshot store whose <see cref="SaveAsync"/> cancels the token mid-save.</summary>
|
||||
private sealed class CancellingSaveStore(CancellationTokenSource cts) : IGalaxyHierarchySnapshotStore
|
||||
{
|
||||
public Task<GalaxyHierarchySnapshot?> TryLoadAsync(CancellationToken cancellationToken)
|
||||
=> Task.FromResult<GalaxyHierarchySnapshot?>(null);
|
||||
|
||||
public Task SaveAsync(GalaxyHierarchySnapshot snapshot, CancellationToken cancellationToken)
|
||||
{
|
||||
cts.Cancel();
|
||||
cancellationToken.ThrowIfCancellationRequested();
|
||||
return Task.CompletedTask;
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>Minimal <see cref="ILogger{T}"/> that records every emitted log entry.</summary>
|
||||
private sealed class RecordingLogger<T> : ILogger<T>
|
||||
{
|
||||
public List<(LogLevel Level, string Message)> Entries { get; } = [];
|
||||
|
||||
public IDisposable BeginScope<TState>(TState state)
|
||||
where TState : notnull => NullScope.Instance;
|
||||
|
||||
public bool IsEnabled(LogLevel logLevel) => true;
|
||||
|
||||
public void Log<TState>(
|
||||
LogLevel logLevel,
|
||||
EventId eventId,
|
||||
TState state,
|
||||
Exception? exception,
|
||||
Func<TState, Exception?, string> formatter)
|
||||
{
|
||||
Entries.Add((logLevel, formatter(state, exception)));
|
||||
}
|
||||
|
||||
private sealed class NullScope : IDisposable
|
||||
{
|
||||
public static readonly NullScope Instance = new();
|
||||
|
||||
public void Dispose()
|
||||
{
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>In-memory <see cref="IGalaxyRepository"/> that returns fixed rowsets.</summary>
|
||||
private sealed class StubGalaxyRepository(
|
||||
DateTime? deployTime,
|
||||
|
||||
@@ -59,6 +59,16 @@ public sealed class GalaxyHierarchySnapshotStoreTests : IDisposable
|
||||
Assert.Null(await store.TryLoadAsync(CancellationToken.None));
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task TryLoadAsync_WhenFileIsCorruptJson_ReturnsNull()
|
||||
{
|
||||
string path = CreateTempPath();
|
||||
await File.WriteAllTextAsync(path, "{ this is not valid json");
|
||||
GalaxyHierarchySnapshotStore store = CreateStore(path);
|
||||
|
||||
Assert.Null(await store.TryLoadAsync(CancellationToken.None));
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task TryLoadAsync_WhenSchemaVersionUnrecognized_ReturnsNull()
|
||||
{
|
||||
|
||||
Reference in New Issue
Block a user