From bdccdbf6dd8095c71bd237fc2998a9e04e6f128d Mon Sep 17 00:00:00 2001 From: Joseph Doherty Date: Fri, 22 May 2026 03:34:35 -0400 Subject: [PATCH] Resolve code-review findings Server-033..037 Server-033 (Medium): TryRestoreFromDiskAsync now completes the _firstLoad gate once the restored snapshot is published, so a browse call racing the first refresh is served immediately instead of waiting out the 5s bootstrap budget while an unreachable-database query runs. Server-034 (Low): GalaxyHierarchySnapshotStore.TryLoadAsync catches JsonException / IOException / UnauthorizedAccessException and returns null, honoring the Try contract for a corrupt or unreadable snapshot file. Server-035 (Low): SaveAsync bounds the write with a linked CancellationToken (CommandTimeoutSeconds budget) so a stuck disk cannot pin the refresh loop. Server-036 (Low): PersistSnapshotAsync no longer logs a save cancelled by gateway shutdown as a persistence failure. Server-037 (Low): added cache tests for the corrupt-snapshot restore path and for PersistSnapshot=false, plus a store test for corrupt JSON. All 100 Galaxy tests pass; gateway builds clean. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../Galaxy/GalaxyHierarchyCache.cs | 11 ++ .../Galaxy/GalaxyHierarchySnapshotStore.cs | 21 +- .../Galaxy/GalaxyHierarchyCacheTests.cs | 180 +++++++++++++++++- .../GalaxyHierarchySnapshotStoreTests.cs | 10 + 4 files changed, 219 insertions(+), 3 deletions(-) diff --git a/src/MxGateway.Server/Galaxy/GalaxyHierarchyCache.cs b/src/MxGateway.Server/Galaxy/GalaxyHierarchyCache.cs index 4ebadcb..e6cffc5 100644 --- a/src/MxGateway.Server/Galaxy/GalaxyHierarchyCache.cs +++ b/src/MxGateway.Server/Galaxy/GalaxyHierarchyCache.cs @@ -306,6 +306,12 @@ public sealed class GalaxyHierarchyCache : IGalaxyHierarchyCache attributes: snapshot.Attributes); Volatile.Write(ref _current, restored); + // Restored data is a valid completed first load: unblock callers waiting on + // the bootstrap gate immediately, rather than making them wait out the full + // wait budget for a live query that — when the database is unreachable, the + // scenario this restore exists for — may not return for seconds. + _firstLoad.TrySetResult(); + _notifier.Publish(new GalaxyDeployEventInfo( Sequence: sequence, ObservedAt: _timeProvider.GetUtcNow(), @@ -342,6 +348,11 @@ public sealed class GalaxyHierarchyCache : IGalaxyHierarchyCache new GalaxyHierarchySnapshot(deployTime, savedAt, hierarchy, attributes), cancellationToken).ConfigureAwait(false); } + catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested) + { + // The refresh was cancelled (gateway shutdown) before the write finished. + // That is not a persistence failure — do not log it as a warning. + } catch (Exception exception) { _logger?.LogWarning(exception, "Failed to persist the Galaxy hierarchy snapshot to disk."); diff --git a/src/MxGateway.Server/Galaxy/GalaxyHierarchySnapshotStore.cs b/src/MxGateway.Server/Galaxy/GalaxyHierarchySnapshotStore.cs index 7960f4a..f1d4f1f 100644 --- a/src/MxGateway.Server/Galaxy/GalaxyHierarchySnapshotStore.cs +++ b/src/MxGateway.Server/Galaxy/GalaxyHierarchySnapshotStore.cs @@ -27,6 +27,7 @@ public sealed class GalaxyHierarchySnapshotStore : IGalaxyHierarchySnapshotStore }; private readonly string? _path; + private readonly TimeSpan _writeTimeout; private readonly ILogger? _logger; private readonly SemaphoreSlim _ioGate = new(1, 1); @@ -41,6 +42,7 @@ public sealed class GalaxyHierarchySnapshotStore : IGalaxyHierarchySnapshotStore _path = value.PersistSnapshot && !string.IsNullOrWhiteSpace(value.SnapshotCachePath) ? value.SnapshotCachePath : null; + _writeTimeout = TimeSpan.FromSeconds(Math.Max(1, value.CommandTimeoutSeconds)); _logger = logger; } @@ -58,6 +60,12 @@ public sealed class GalaxyHierarchySnapshotStore : IGalaxyHierarchySnapshotStore await _ioGate.WaitAsync(cancellationToken).ConfigureAwait(false); try { + // Bound the write so a stuck disk — e.g. a SnapshotCachePath on an + // unresponsive network share — cannot stall the caller. On the cache + // refresh path that would otherwise pin the whole refresh loop. + using CancellationTokenSource writeCts = CancellationTokenSource.CreateLinkedTokenSource(cancellationToken); + writeCts.CancelAfter(_writeTimeout); + string? directory = Path.GetDirectoryName(_path); if (!string.IsNullOrEmpty(directory)) { @@ -67,7 +75,7 @@ public sealed class GalaxyHierarchySnapshotStore : IGalaxyHierarchySnapshotStore string tempPath = _path + ".tmp"; await using (FileStream stream = new(tempPath, FileMode.Create, FileAccess.Write, FileShare.None)) { - await JsonSerializer.SerializeAsync(stream, file, SerializerOptions, cancellationToken).ConfigureAwait(false); + await JsonSerializer.SerializeAsync(stream, file, SerializerOptions, writeCts.Token).ConfigureAwait(false); } File.Move(tempPath, _path, overwrite: true); @@ -111,6 +119,17 @@ public sealed class GalaxyHierarchySnapshotStore : IGalaxyHierarchySnapshotStore return file.Snapshot; } + catch (Exception exception) when (exception is JsonException or IOException or UnauthorizedAccessException) + { + // A corrupt, truncated, locked, or access-denied snapshot file is an + // expected failure mode for a disk cache — honor the Try contract and + // return null rather than throwing. + _logger?.LogWarning( + exception, + "Ignoring Galaxy hierarchy snapshot at {Path}: the file is unreadable or not valid JSON.", + _path); + return null; + } finally { _ioGate.Release(); diff --git a/src/MxGateway.Tests/Galaxy/GalaxyHierarchyCacheTests.cs b/src/MxGateway.Tests/Galaxy/GalaxyHierarchyCacheTests.cs index a5202d5..934bf04 100644 --- a/src/MxGateway.Tests/Galaxy/GalaxyHierarchyCacheTests.cs +++ b/src/MxGateway.Tests/Galaxy/GalaxyHierarchyCacheTests.cs @@ -1,3 +1,4 @@ +using Microsoft.Extensions.Logging; using Microsoft.Extensions.Options; using MxGateway.Server.Galaxy; using MxGateway.Contracts.Proto.Galaxy; @@ -208,6 +209,108 @@ public sealed class GalaxyHierarchyCacheTests : IDisposable Assert.Equal(0, repository.GetAttributesCount); } + /// + /// Verifies that a restored on-disk snapshot completes the first-load gate + /// immediately, so a browse call racing the first refresh is not blocked for + /// the full bootstrap budget while the live Galaxy query is still running. + /// Regression test for Server-033. + /// + [Fact] + public async Task RefreshAsync_RestoredSnapshotCompletesFirstLoadBeforeLiveQueryReturns() + { + GalaxyHierarchySnapshotStore store = CreateStore(); + await store.SaveAsync( + new GalaxyHierarchySnapshot( + LastDeployTime: new DateTimeOffset(2026, 5, 20, 9, 0, 0, TimeSpan.Zero), + SavedAt: new DateTimeOffset(2026, 5, 20, 9, 1, 0, TimeSpan.Zero), + Hierarchy: [SampleHierarchyRow()], + Attributes: [SampleAttributeRow()]), + CancellationToken.None); + + GalaxyDeployNotifier notifier = new(); + BlockingGalaxyRepository repository = new(); + GalaxyHierarchyCache cache = new(repository, notifier, new ManualTimeProvider(), snapshotStore: store); + + Task refresh = cache.RefreshAsync(CancellationToken.None); + + // The live query is blocked inside the repository; first-load must still + // complete — from the restored snapshot — well within the wait budget. + await cache.WaitForFirstLoadAsync(CancellationToken.None).WaitAsync(TimeSpan.FromSeconds(5)); + Assert.True(cache.Current.HasData); + Assert.Equal(GalaxyCacheStatus.Stale, cache.Current.Status); + + repository.Release(); + await refresh.WaitAsync(TimeSpan.FromSeconds(5)); + } + + /// + /// Verifies a corrupt on-disk snapshot does not crash startup: the cache + /// ignores the unreadable file and comes up Unavailable when the database is + /// also unreachable. Regression test for Server-037. + /// + [Fact] + public async Task RefreshAsync_WhenSnapshotFileCorrupt_ComesUpUnavailableWithoutThrowing() + { + string path = CreateTempPath(); + await File.WriteAllTextAsync(path, "{ this is not valid json"); + GalaxyHierarchySnapshotStore store = CreateStore(path); + + GalaxyDeployNotifier notifier = new(); + ThrowingGalaxyRepository repository = new(new InvalidOperationException("Galaxy repository unreachable")); + GalaxyHierarchyCache cache = new(repository, notifier, new ManualTimeProvider(), snapshotStore: store); + + await cache.RefreshAsync(CancellationToken.None); + + Assert.Equal(GalaxyCacheStatus.Unavailable, cache.Current.Status); + Assert.False(cache.Current.HasData); + } + + /// + /// Verifies that with snapshot persistence disabled the cache does not + /// restore from disk — an unreachable database leaves it Unavailable. + /// Regression test for Server-037. + /// + [Fact] + public async Task RefreshAsync_WhenPersistDisabled_DoesNotRestoreFromDisk() + { + GalaxyHierarchySnapshotStore store = CreateStore(CreateTempPath(), persist: false); + + GalaxyDeployNotifier notifier = new(); + ThrowingGalaxyRepository repository = new(new InvalidOperationException("Galaxy repository unreachable")); + GalaxyHierarchyCache cache = new(repository, notifier, new ManualTimeProvider(), snapshotStore: store); + + await cache.RefreshAsync(CancellationToken.None); + + Assert.Equal(GalaxyCacheStatus.Unavailable, cache.Current.Status); + Assert.False(cache.Current.HasData); + } + + /// + /// Verifies that a snapshot save aborted because the gateway is shutting down + /// (the refresh token is cancelled) is not logged as a persistence failure. + /// Regression test for Server-036. + /// + [Fact] + public async Task RefreshAsync_WhenSnapshotSaveCancelledAtShutdown_DoesNotLogPersistFailure() + { + using CancellationTokenSource cts = new(); + GalaxyDeployNotifier notifier = new(); + StubGalaxyRepository repository = new( + deployTime: new DateTime(2026, 5, 20, 9, 0, 0, DateTimeKind.Utc), + hierarchy: [SampleHierarchyRow()], + attributes: [SampleAttributeRow()]); + CancellingSaveStore store = new(cts); + RecordingLogger logger = new(); + GalaxyHierarchyCache cache = new(repository, notifier, new ManualTimeProvider(), logger, store); + + await cache.RefreshAsync(cts.Token); + + Assert.DoesNotContain( + logger.Entries, + entry => entry.Level == LogLevel.Warning + && entry.Message.Contains("persist", StringComparison.OrdinalIgnoreCase)); + } + private static GalaxyHierarchyRow SampleHierarchyRow() => new() { GobjectId = 99, @@ -228,20 +331,93 @@ public sealed class GalaxyHierarchyCacheTests : IDisposable DataTypeName = "Float", }; - private GalaxyHierarchySnapshotStore CreateStore() + private string CreateTempPath() { string path = Path.Combine( Path.GetTempPath(), $"mxgw-galaxy-cache-test-{Guid.NewGuid():N}.json"); _tempPaths.Add(path); + return path; + } + + private GalaxyHierarchySnapshotStore CreateStore() => CreateStore(CreateTempPath()); + + private static GalaxyHierarchySnapshotStore CreateStore(string path, bool persist = true) + { GalaxyRepositoryOptions options = new() { - PersistSnapshot = true, + PersistSnapshot = persist, SnapshotCachePath = path, }; return new GalaxyHierarchySnapshotStore(Options.Create(options)); } + /// whose deploy-time query blocks until released. + private sealed class BlockingGalaxyRepository : IGalaxyRepository + { + private readonly TaskCompletionSource _release = new(TaskCreationOptions.RunContinuationsAsynchronously); + + public void Release() => _release.TrySetResult(); + + public Task TestConnectionAsync(CancellationToken ct = default) => Task.FromResult(false); + + public async Task GetLastDeployTimeAsync(CancellationToken ct = default) + { + await _release.Task.WaitAsync(ct).ConfigureAwait(false); + throw new InvalidOperationException("Galaxy repository unreachable"); + } + + public Task> GetHierarchyAsync(CancellationToken ct = default) + => throw new InvalidOperationException("GetHierarchyAsync should not be reached"); + + public Task> GetAttributesAsync(CancellationToken ct = default) + => throw new InvalidOperationException("GetAttributesAsync should not be reached"); + } + + /// Snapshot store whose cancels the token mid-save. + private sealed class CancellingSaveStore(CancellationTokenSource cts) : IGalaxyHierarchySnapshotStore + { + public Task TryLoadAsync(CancellationToken cancellationToken) + => Task.FromResult(null); + + public Task SaveAsync(GalaxyHierarchySnapshot snapshot, CancellationToken cancellationToken) + { + cts.Cancel(); + cancellationToken.ThrowIfCancellationRequested(); + return Task.CompletedTask; + } + } + + /// Minimal that records every emitted log entry. + private sealed class RecordingLogger : ILogger + { + public List<(LogLevel Level, string Message)> Entries { get; } = []; + + public IDisposable BeginScope(TState state) + where TState : notnull => NullScope.Instance; + + public bool IsEnabled(LogLevel logLevel) => true; + + public void Log( + LogLevel logLevel, + EventId eventId, + TState state, + Exception? exception, + Func formatter) + { + Entries.Add((logLevel, formatter(state, exception))); + } + + private sealed class NullScope : IDisposable + { + public static readonly NullScope Instance = new(); + + public void Dispose() + { + } + } + } + /// In-memory that returns fixed rowsets. private sealed class StubGalaxyRepository( DateTime? deployTime, diff --git a/src/MxGateway.Tests/Galaxy/GalaxyHierarchySnapshotStoreTests.cs b/src/MxGateway.Tests/Galaxy/GalaxyHierarchySnapshotStoreTests.cs index c28546f..935abc5 100644 --- a/src/MxGateway.Tests/Galaxy/GalaxyHierarchySnapshotStoreTests.cs +++ b/src/MxGateway.Tests/Galaxy/GalaxyHierarchySnapshotStoreTests.cs @@ -59,6 +59,16 @@ public sealed class GalaxyHierarchySnapshotStoreTests : IDisposable Assert.Null(await store.TryLoadAsync(CancellationToken.None)); } + [Fact] + public async Task TryLoadAsync_WhenFileIsCorruptJson_ReturnsNull() + { + string path = CreateTempPath(); + await File.WriteAllTextAsync(path, "{ this is not valid json"); + GalaxyHierarchySnapshotStore store = CreateStore(path); + + Assert.Null(await store.TryLoadAsync(CancellationToken.None)); + } + [Fact] public async Task TryLoadAsync_WhenSchemaVersionUnrecognized_ReturnsNull() {