Resolve code-review findings Server-033..037

Server-033 (Medium): TryRestoreFromDiskAsync now completes the _firstLoad
gate once the restored snapshot is published, so a browse call racing the
first refresh is served immediately instead of waiting out the 5s bootstrap
budget while an unreachable-database query runs.

Server-034 (Low): GalaxyHierarchySnapshotStore.TryLoadAsync catches
JsonException / IOException / UnauthorizedAccessException and returns null,
honoring the Try contract for a corrupt or unreadable snapshot file.

Server-035 (Low): SaveAsync bounds the write with a linked CancellationToken
(CommandTimeoutSeconds budget) so a stuck disk cannot pin the refresh loop.

Server-036 (Low): PersistSnapshotAsync no longer logs a save cancelled by
gateway shutdown as a persistence failure.

Server-037 (Low): added cache tests for the corrupt-snapshot restore path
and for PersistSnapshot=false, plus a store test for corrupt JSON.

All 100 Galaxy tests pass; gateway builds clean.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Joseph Doherty
2026-05-22 03:34:35 -04:00
parent fa491c752b
commit bdccdbf6dd
4 changed files with 219 additions and 3 deletions
@@ -306,6 +306,12 @@ public sealed class GalaxyHierarchyCache : IGalaxyHierarchyCache
attributes: snapshot.Attributes);
Volatile.Write(ref _current, restored);
// Restored data is a valid completed first load: unblock callers waiting on
// the bootstrap gate immediately, rather than making them wait out the full
// wait budget for a live query that — when the database is unreachable, the
// scenario this restore exists for — may not return for seconds.
_firstLoad.TrySetResult();
_notifier.Publish(new GalaxyDeployEventInfo(
Sequence: sequence,
ObservedAt: _timeProvider.GetUtcNow(),
@@ -342,6 +348,11 @@ public sealed class GalaxyHierarchyCache : IGalaxyHierarchyCache
new GalaxyHierarchySnapshot(deployTime, savedAt, hierarchy, attributes),
cancellationToken).ConfigureAwait(false);
}
catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested)
{
// The refresh was cancelled (gateway shutdown) before the write finished.
// That is not a persistence failure — do not log it as a warning.
}
catch (Exception exception)
{
_logger?.LogWarning(exception, "Failed to persist the Galaxy hierarchy snapshot to disk.");
@@ -27,6 +27,7 @@ public sealed class GalaxyHierarchySnapshotStore : IGalaxyHierarchySnapshotStore
};
private readonly string? _path;
private readonly TimeSpan _writeTimeout;
private readonly ILogger<GalaxyHierarchySnapshotStore>? _logger;
private readonly SemaphoreSlim _ioGate = new(1, 1);
@@ -41,6 +42,7 @@ public sealed class GalaxyHierarchySnapshotStore : IGalaxyHierarchySnapshotStore
_path = value.PersistSnapshot && !string.IsNullOrWhiteSpace(value.SnapshotCachePath)
? value.SnapshotCachePath
: null;
_writeTimeout = TimeSpan.FromSeconds(Math.Max(1, value.CommandTimeoutSeconds));
_logger = logger;
}
@@ -58,6 +60,12 @@ public sealed class GalaxyHierarchySnapshotStore : IGalaxyHierarchySnapshotStore
await _ioGate.WaitAsync(cancellationToken).ConfigureAwait(false);
try
{
// Bound the write so a stuck disk — e.g. a SnapshotCachePath on an
// unresponsive network share — cannot stall the caller. On the cache
// refresh path that would otherwise pin the whole refresh loop.
using CancellationTokenSource writeCts = CancellationTokenSource.CreateLinkedTokenSource(cancellationToken);
writeCts.CancelAfter(_writeTimeout);
string? directory = Path.GetDirectoryName(_path);
if (!string.IsNullOrEmpty(directory))
{
@@ -67,7 +75,7 @@ public sealed class GalaxyHierarchySnapshotStore : IGalaxyHierarchySnapshotStore
string tempPath = _path + ".tmp";
await using (FileStream stream = new(tempPath, FileMode.Create, FileAccess.Write, FileShare.None))
{
await JsonSerializer.SerializeAsync(stream, file, SerializerOptions, cancellationToken).ConfigureAwait(false);
await JsonSerializer.SerializeAsync(stream, file, SerializerOptions, writeCts.Token).ConfigureAwait(false);
}
File.Move(tempPath, _path, overwrite: true);
@@ -111,6 +119,17 @@ public sealed class GalaxyHierarchySnapshotStore : IGalaxyHierarchySnapshotStore
return file.Snapshot;
}
catch (Exception exception) when (exception is JsonException or IOException or UnauthorizedAccessException)
{
// A corrupt, truncated, locked, or access-denied snapshot file is an
// expected failure mode for a disk cache — honor the Try contract and
// return null rather than throwing.
_logger?.LogWarning(
exception,
"Ignoring Galaxy hierarchy snapshot at {Path}: the file is unreadable or not valid JSON.",
_path);
return null;
}
finally
{
_ioGate.Release();
@@ -1,3 +1,4 @@
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
using MxGateway.Server.Galaxy;
using MxGateway.Contracts.Proto.Galaxy;
@@ -208,6 +209,108 @@ public sealed class GalaxyHierarchyCacheTests : IDisposable
Assert.Equal(0, repository.GetAttributesCount);
}
/// <summary>
/// Verifies that a restored on-disk snapshot completes the first-load gate
/// immediately, so a browse call racing the first refresh is not blocked for
/// the full bootstrap budget while the live Galaxy query is still running.
/// Regression test for Server-033.
/// </summary>
[Fact]
public async Task RefreshAsync_RestoredSnapshotCompletesFirstLoadBeforeLiveQueryReturns()
{
GalaxyHierarchySnapshotStore store = CreateStore();
await store.SaveAsync(
new GalaxyHierarchySnapshot(
LastDeployTime: new DateTimeOffset(2026, 5, 20, 9, 0, 0, TimeSpan.Zero),
SavedAt: new DateTimeOffset(2026, 5, 20, 9, 1, 0, TimeSpan.Zero),
Hierarchy: [SampleHierarchyRow()],
Attributes: [SampleAttributeRow()]),
CancellationToken.None);
GalaxyDeployNotifier notifier = new();
BlockingGalaxyRepository repository = new();
GalaxyHierarchyCache cache = new(repository, notifier, new ManualTimeProvider(), snapshotStore: store);
Task refresh = cache.RefreshAsync(CancellationToken.None);
// The live query is blocked inside the repository; first-load must still
// complete — from the restored snapshot — well within the wait budget.
await cache.WaitForFirstLoadAsync(CancellationToken.None).WaitAsync(TimeSpan.FromSeconds(5));
Assert.True(cache.Current.HasData);
Assert.Equal(GalaxyCacheStatus.Stale, cache.Current.Status);
repository.Release();
await refresh.WaitAsync(TimeSpan.FromSeconds(5));
}
/// <summary>
/// Verifies a corrupt on-disk snapshot does not crash startup: the cache
/// ignores the unreadable file and comes up Unavailable when the database is
/// also unreachable. Regression test for Server-037.
/// </summary>
[Fact]
public async Task RefreshAsync_WhenSnapshotFileCorrupt_ComesUpUnavailableWithoutThrowing()
{
string path = CreateTempPath();
await File.WriteAllTextAsync(path, "{ this is not valid json");
GalaxyHierarchySnapshotStore store = CreateStore(path);
GalaxyDeployNotifier notifier = new();
ThrowingGalaxyRepository repository = new(new InvalidOperationException("Galaxy repository unreachable"));
GalaxyHierarchyCache cache = new(repository, notifier, new ManualTimeProvider(), snapshotStore: store);
await cache.RefreshAsync(CancellationToken.None);
Assert.Equal(GalaxyCacheStatus.Unavailable, cache.Current.Status);
Assert.False(cache.Current.HasData);
}
/// <summary>
/// Verifies that with snapshot persistence disabled the cache does not
/// restore from disk — an unreachable database leaves it Unavailable.
/// Regression test for Server-037.
/// </summary>
[Fact]
public async Task RefreshAsync_WhenPersistDisabled_DoesNotRestoreFromDisk()
{
GalaxyHierarchySnapshotStore store = CreateStore(CreateTempPath(), persist: false);
GalaxyDeployNotifier notifier = new();
ThrowingGalaxyRepository repository = new(new InvalidOperationException("Galaxy repository unreachable"));
GalaxyHierarchyCache cache = new(repository, notifier, new ManualTimeProvider(), snapshotStore: store);
await cache.RefreshAsync(CancellationToken.None);
Assert.Equal(GalaxyCacheStatus.Unavailable, cache.Current.Status);
Assert.False(cache.Current.HasData);
}
/// <summary>
/// Verifies that a snapshot save aborted because the gateway is shutting down
/// (the refresh token is cancelled) is not logged as a persistence failure.
/// Regression test for Server-036.
/// </summary>
[Fact]
public async Task RefreshAsync_WhenSnapshotSaveCancelledAtShutdown_DoesNotLogPersistFailure()
{
using CancellationTokenSource cts = new();
GalaxyDeployNotifier notifier = new();
StubGalaxyRepository repository = new(
deployTime: new DateTime(2026, 5, 20, 9, 0, 0, DateTimeKind.Utc),
hierarchy: [SampleHierarchyRow()],
attributes: [SampleAttributeRow()]);
CancellingSaveStore store = new(cts);
RecordingLogger<GalaxyHierarchyCache> logger = new();
GalaxyHierarchyCache cache = new(repository, notifier, new ManualTimeProvider(), logger, store);
await cache.RefreshAsync(cts.Token);
Assert.DoesNotContain(
logger.Entries,
entry => entry.Level == LogLevel.Warning
&& entry.Message.Contains("persist", StringComparison.OrdinalIgnoreCase));
}
private static GalaxyHierarchyRow SampleHierarchyRow() => new()
{
GobjectId = 99,
@@ -228,20 +331,93 @@ public sealed class GalaxyHierarchyCacheTests : IDisposable
DataTypeName = "Float",
};
private GalaxyHierarchySnapshotStore CreateStore()
private string CreateTempPath()
{
string path = Path.Combine(
Path.GetTempPath(),
$"mxgw-galaxy-cache-test-{Guid.NewGuid():N}.json");
_tempPaths.Add(path);
return path;
}
private GalaxyHierarchySnapshotStore CreateStore() => CreateStore(CreateTempPath());
private static GalaxyHierarchySnapshotStore CreateStore(string path, bool persist = true)
{
GalaxyRepositoryOptions options = new()
{
PersistSnapshot = true,
PersistSnapshot = persist,
SnapshotCachePath = path,
};
return new GalaxyHierarchySnapshotStore(Options.Create(options));
}
/// <summary><see cref="IGalaxyRepository"/> whose deploy-time query blocks until released.</summary>
private sealed class BlockingGalaxyRepository : IGalaxyRepository
{
private readonly TaskCompletionSource _release = new(TaskCreationOptions.RunContinuationsAsynchronously);
public void Release() => _release.TrySetResult();
public Task<bool> TestConnectionAsync(CancellationToken ct = default) => Task.FromResult(false);
public async Task<DateTime?> GetLastDeployTimeAsync(CancellationToken ct = default)
{
await _release.Task.WaitAsync(ct).ConfigureAwait(false);
throw new InvalidOperationException("Galaxy repository unreachable");
}
public Task<List<GalaxyHierarchyRow>> GetHierarchyAsync(CancellationToken ct = default)
=> throw new InvalidOperationException("GetHierarchyAsync should not be reached");
public Task<List<GalaxyAttributeRow>> GetAttributesAsync(CancellationToken ct = default)
=> throw new InvalidOperationException("GetAttributesAsync should not be reached");
}
/// <summary>Snapshot store whose <see cref="SaveAsync"/> cancels the token mid-save.</summary>
private sealed class CancellingSaveStore(CancellationTokenSource cts) : IGalaxyHierarchySnapshotStore
{
public Task<GalaxyHierarchySnapshot?> TryLoadAsync(CancellationToken cancellationToken)
=> Task.FromResult<GalaxyHierarchySnapshot?>(null);
public Task SaveAsync(GalaxyHierarchySnapshot snapshot, CancellationToken cancellationToken)
{
cts.Cancel();
cancellationToken.ThrowIfCancellationRequested();
return Task.CompletedTask;
}
}
/// <summary>Minimal <see cref="ILogger{T}"/> that records every emitted log entry.</summary>
private sealed class RecordingLogger<T> : ILogger<T>
{
public List<(LogLevel Level, string Message)> Entries { get; } = [];
public IDisposable BeginScope<TState>(TState state)
where TState : notnull => NullScope.Instance;
public bool IsEnabled(LogLevel logLevel) => true;
public void Log<TState>(
LogLevel logLevel,
EventId eventId,
TState state,
Exception? exception,
Func<TState, Exception?, string> formatter)
{
Entries.Add((logLevel, formatter(state, exception)));
}
private sealed class NullScope : IDisposable
{
public static readonly NullScope Instance = new();
public void Dispose()
{
}
}
}
/// <summary>In-memory <see cref="IGalaxyRepository"/> that returns fixed rowsets.</summary>
private sealed class StubGalaxyRepository(
DateTime? deployTime,
@@ -59,6 +59,16 @@ public sealed class GalaxyHierarchySnapshotStoreTests : IDisposable
Assert.Null(await store.TryLoadAsync(CancellationToken.None));
}
[Fact]
public async Task TryLoadAsync_WhenFileIsCorruptJson_ReturnsNull()
{
string path = CreateTempPath();
await File.WriteAllTextAsync(path, "{ this is not valid json");
GalaxyHierarchySnapshotStore store = CreateStore(path);
Assert.Null(await store.TryLoadAsync(CancellationToken.None));
}
[Fact]
public async Task TryLoadAsync_WhenSchemaVersionUnrecognized_ReturnsNull()
{