Compare commits
2 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| dd7ca1634e | |||
| bdccdbf6dd |
+10
-2
@@ -17,7 +17,7 @@ Each module's `findings.md` is the source of truth; this file is generated from
|
||||
| [Client.Rust](Client.Rust/findings.md) | Claude Code | 2026-05-20 | `a020350` | Reviewed | 0 | 20 |
|
||||
| [Contracts](Contracts/findings.md) | Claude Code | 2026-05-20 | `a020350` | Reviewed | 0 | 15 |
|
||||
| [IntegrationTests](IntegrationTests/findings.md) | Claude Code | 2026-05-20 | `a020350` | Reviewed | 0 | 21 |
|
||||
| [Server](Server/findings.md) | Claude Code | 2026-05-20 | `a020350` | Reviewed | 0 | 30 |
|
||||
| [Server](Server/findings.md) | Claude Code | 2026-05-22 | `fa491c7` | Reviewed | 2 | 37 |
|
||||
| [Tests](Tests/findings.md) | Claude Code | 2026-05-20 | `a020350` | Reviewed | 0 | 24 |
|
||||
| [Worker](Worker/findings.md) | Claude Code | 2026-05-20 | `a020350` | Reviewed | 0 | 25 |
|
||||
| [Worker.Tests](Worker.Tests/findings.md) | Claude Code | 2026-05-20 | `a020350` | Reviewed | 0 | 30 |
|
||||
@@ -26,7 +26,10 @@ Each module's `findings.md` is the source of truth; this file is generated from
|
||||
|
||||
Findings with status `Open` or `In Progress`, ordered by severity.
|
||||
|
||||
_No pending findings._
|
||||
| ID | Severity | Category | Location | Description |
|
||||
|---|---|---|---|---|
|
||||
| Server-031 | Medium | Concurrency & thread safety | `src/MxGateway.Server/Workers/WorkerClient.cs:392-422` (gateway-side heartbeat watchdog); `src/MxGateway.Worker/Ipc/WorkerPipeSession.cs:588-617` (worker-side heartbeat loop); `src/MxGateway.Worker/Ipc/WorkerFrameWriter.cs:14,67-76` (shared `_writeLock`) | Surfaced during the 2026-05-20 cross-language e2e re-run against gateway `b794c46`. The .NET phase succeeded through `open-session`/`register`/`bulk-subscribe`/`bulk-read`/`bulk-unsubscribe`/`stream-events`/`write` but then failed on its t… |
|
||||
| Server-032 | Medium | Error handling & resilience | `src/MxGateway.Server/Workers/WorkerClient.cs:70-77,463-484` (gateway-side `_events` channel); `src/MxGateway.Server/Configuration/EventOptions.cs:8` (default capacity 10,000); `src/MxGateway.Server/Grpc/EventStreamService.cs` (consumer) | Surfaced during the 2026-05-20 cross-language e2e re-run against gateway `b794c46`. The Java phase advised ~55 items (`item-handle 63`) before failing on the next `advise` call with the Server-030 diagnostic `Session ... is not ready. Sess… |
|
||||
|
||||
## Closed findings
|
||||
|
||||
@@ -94,6 +97,7 @@ Findings with status `Resolved`, `Won't Fix`, or `Deferred`.
|
||||
| Server-016 | Medium | Resolved | Error handling & resilience | `src/MxGateway.Server/Sessions/GatewaySession.cs:790-797`, `src/MxGateway.Server/Sessions/SessionManager.cs:237-258` |
|
||||
| Server-021 | Medium | Resolved | Testing coverage | `src/MxGateway.Server/Grpc/MxAccessGatewayService.cs:266-664`, `src/MxGateway.Tests/Gateway/Grpc/MxAccessGatewayServiceTests.cs` |
|
||||
| Server-030 | Medium | Resolved | Error handling & resilience | `src/MxGateway.Server/Sessions/GatewaySession.cs:952-980` |
|
||||
| Server-033 | Medium | Resolved | Error handling & resilience | `src/MxGateway.Server/Galaxy/GalaxyHierarchyCache.cs:265-323` (`TryRestoreFromDiskAsync`), `:84-99` (`_firstLoad` / `WaitForFirstLoadAsync`); `src/MxGateway.Server/Grpc/GalaxyRepositoryGrpcService.cs:141-163` (`WaitForCacheBootstrap`) |
|
||||
| Tests-003 | Medium | Resolved | Performance & resource management | `src/MxGateway.Tests/Security/Authentication/SqliteAuthStoreTests.cs:170-176`, `src/MxGateway.Tests/Security/Authentication/ApiKeyAdminCliRunnerTests.cs:252-258` |
|
||||
| Tests-004 | Medium | Resolved | Testing coverage | `src/MxGateway.Tests/Security/Authorization/GatewayGrpcAuthorizationInterceptorTests.cs` |
|
||||
| Tests-005 | Medium | Resolved | Testing coverage | `src/MxGateway.Tests/Gateway/Grpc/EventStreamServiceTests.cs:239-261`, `src/MxGateway.Tests/Gateway/Sessions/SessionManagerTests.cs` |
|
||||
@@ -235,6 +239,10 @@ Findings with status `Resolved`, `Won't Fix`, or `Deferred`.
|
||||
| Server-027 | Low | Resolved | Design-document adherence | `docs/Authorization.md:120-141,176-181` |
|
||||
| Server-028 | Low | Resolved | Testing coverage | `src/MxGateway.Tests/Security/Authorization/GatewayGrpcScopeResolverTests.cs:13-20`, `src/MxGateway.Tests/Gateway/Sessions/GatewaySessionTests.cs` |
|
||||
| Server-029 | Low | Resolved | Documentation & comments | `src/MxGateway.Server/Grpc/MxAccessGatewayService.cs:52-58` |
|
||||
| Server-034 | Low | Resolved | Error handling & resilience | `src/MxGateway.Server/Galaxy/GalaxyHierarchySnapshotStore.cs:87-115` (`TryLoadAsync`) |
|
||||
| Server-035 | Low | Resolved | Performance & resource management | `src/MxGateway.Server/Galaxy/GalaxyHierarchyCache.cs:176` (call site), `:327-352` (`PersistSnapshotAsync`) |
|
||||
| Server-036 | Low | Resolved | Error handling & resilience | `src/MxGateway.Server/Galaxy/GalaxyHierarchyCache.cs:345-348` (`PersistSnapshotAsync` catch) |
|
||||
| Server-037 | Low | Resolved | Testing coverage | `src/MxGateway.Tests/Galaxy/GalaxyHierarchySnapshotStoreTests.cs`, `src/MxGateway.Tests/Galaxy/GalaxyHierarchyCacheTests.cs` |
|
||||
| Tests-007 | Low | Resolved | Code organization & conventions | `src/MxGateway.Tests/Gateway/Grpc/MxAccessGatewayServiceTests.cs:682`, `src/MxGateway.Tests/Gateway/Grpc/GalaxyRepositoryGrpcServiceTests.cs:324`, `src/MxGateway.Tests/Gateway/GatewayEndToEndFakeWorkerSmokeTests.cs:460`, `src/MxGateway.Tests/Security/Authorization/GatewayGrpcAuthorizationInterceptorTests.cs:233` |
|
||||
| Tests-008 | Low | Resolved | mxaccessgw conventions | `src/MxGateway.Tests/Gateway/Sessions/WorkerAlarmRpcDispatcherTests.cs:1-9`, `src/MxGateway.Tests/Gateway/Sessions/NotWiredAlarmRpcDispatcherTests.cs:1-3`, `src/MxGateway.Tests/Gateway/Sessions/SessionManagerAlarmAutoSubscribeTests.cs:1` |
|
||||
| Tests-009 | Low | Resolved | Documentation & comments | `src/MxGateway.Tests/Gateway/Sessions/SessionManagerTests.cs:36-37,99,365` |
|
||||
|
||||
@@ -4,10 +4,10 @@
|
||||
|---|---|
|
||||
| Module | `src/MxGateway.Server` |
|
||||
| Reviewer | Claude Code |
|
||||
| Review date | 2026-05-20 |
|
||||
| Commit reviewed | `a020350` |
|
||||
| Review date | 2026-05-22 |
|
||||
| Commit reviewed | `fa491c7` |
|
||||
| Status | Reviewed |
|
||||
| Open findings | 0 |
|
||||
| Open findings | 2 |
|
||||
|
||||
## Checklist coverage
|
||||
|
||||
@@ -47,6 +47,28 @@ Re-review pass at `a020350` — the cross-module sweep that resolved Server-015
|
||||
| 9 | Testing coverage | Issues found: Server-028 (`GatewayGrpcScopeResolverTests` does not exercise `WatchDeployEventsRequest` or `MxCommandKind.ReadBulk`; no `GatewaySessionTests` case asserts a `MarkFaulted` during in-flight Close). |
|
||||
| 10 | Documentation & comments | Issues found: Server-023 (`NotWiredAlarmRpcDispatcher` class XML doc still says "PR A.6/A.7 — default … shipped while the worker-side AlarmClient event subscription is gated on dev-rig validation"; contradicts the cleanup that Server-014/Server-022 applied to the interface, gateway service, and `WorkerAlarmRpcDispatcher`). Issues found: Server-029 (`OpenSession` capability list advertises `bulk-subscribe-commands` but not the now-shipping bulk-read or bulk-write families — clients that gate on capability strings have no signal that those families exist). |
|
||||
|
||||
### 2026-05-22 review (commit fa491c7)
|
||||
|
||||
Re-review pass at `fa491c7`, scoped to the Galaxy hierarchy snapshot-persistence
|
||||
change: the new `GalaxyHierarchySnapshot`, `IGalaxyHierarchySnapshotStore` /
|
||||
`GalaxyHierarchySnapshotStore`, the restore / persist paths added to
|
||||
`GalaxyHierarchyCache`, the two new `GalaxyRepositoryOptions`, and the
|
||||
`docs/GalaxyRepository.md` / `docs/GatewayConfiguration.md` updates. Prior
|
||||
findings (Server-001 through Server-032) are unchanged by this pass.
|
||||
|
||||
| # | Category | Result |
|
||||
|---|---|---|
|
||||
| 1 | Correctness & logic bugs | No issues found — restore/save sequencing and the shared `BuildEntry` materialization are sound. |
|
||||
| 2 | mxaccessgw conventions | No issues found — file-scoped namespaces, `sealed`, `Async` suffixes, Options pattern, and XML docs all conform; the snapshot persists Galaxy metadata (names/types), not tag values or secrets. |
|
||||
| 3 | Concurrency & thread safety | No issues found — `_restoreAttempted` and `_current` are touched only under `_refreshGate`; `_current` is published via `Volatile.Write`; the store serializes its file I/O on a private `SemaphoreSlim`. |
|
||||
| 4 | Error handling & resilience | Issues found: Server-033 (restore never completes `_firstLoad`, so a cold-start browse waits the full 5s bootstrap budget), Server-034 (`TryLoadAsync` throws on a corrupt file despite the `Try` prefix), Server-036 (a save cancelled at shutdown logs a misleading warning). |
|
||||
| 5 | Security | No issues found — the snapshot holds non-secret Galaxy metadata, is written under `C:\ProgramData\MxGateway` alongside the auth DB, and restored rows flow the same materialization path as live SQL with no injection surface. |
|
||||
| 6 | Performance & resource management | Issues found: Server-035 (the snapshot write is awaited on the refresh critical path under `_refreshGate` with no timeout). |
|
||||
| 7 | Design-document adherence | No issues found — `docs/GalaxyRepository.md` and `docs/GatewayConfiguration.md` were updated in the same commit; `docs/DesignDecisions.md` already defers to `GalaxyRepository.md` as the Galaxy authority. |
|
||||
| 8 | Code organization & conventions | No issues found — the new options live on `GalaxyRepositoryOptions`, the store is a registered singleton, and the on-disk envelope (`PersistedFile`) is a private nested record. |
|
||||
| 9 | Testing coverage | Issues found: Server-037 (no test for the corrupt-snapshot restore path or for `PersistSnapshot = false` at the cache level). |
|
||||
| 10 | Documentation & comments | No issues found — XML docs match behavior; the `GalaxyRepository.md` "On-disk snapshot" section documents the Stale-on-restore lifecycle. |
|
||||
|
||||
## Findings
|
||||
|
||||
### Server-001
|
||||
@@ -568,3 +590,78 @@ The diagnostic `"Worker event channel rejected an event."` also does not name th
|
||||
Add a regression test that advises N items without an active `StreamEvents` consumer, lets the channel fill, and asserts the produced fault message contains the channel-depth diagnostic (#2) — gated so that #3 is not required.
|
||||
|
||||
**Resolution:** _(empty until closed; on close, record the fixing commit SHA, the date, and a one-line description of the fix)_
|
||||
|
||||
### Server-033
|
||||
|
||||
| Field | Value |
|
||||
|---|---|
|
||||
| Severity | Medium |
|
||||
| Category | Error handling & resilience |
|
||||
| Location | `src/MxGateway.Server/Galaxy/GalaxyHierarchyCache.cs:265-323` (`TryRestoreFromDiskAsync`), `:84-99` (`_firstLoad` / `WaitForFirstLoadAsync`); `src/MxGateway.Server/Grpc/GalaxyRepositoryGrpcService.cs:141-163` (`WaitForCacheBootstrap`) |
|
||||
| Status | Resolved |
|
||||
|
||||
**Description:** `TryRestoreFromDiskAsync` populates `_current` with the on-disk snapshot (status `Stale`, `HasData == true`) but never completes the `_firstLoad` `TaskCompletionSource` — only the live-query paths (cheap / heavy / catch) in `RefreshCoreAsync` do. A `DiscoverHierarchy` or `GetLastDeployTime` call that arrives after gateway start but before the first refresh tick finishes sees `cache.Current` as `Empty` (status `Unknown`) when `WaitForCacheBootstrap` runs its initial check, so it falls through to `await WaitForFirstLoadAsync` with a 5-second budget. Restore then completes within milliseconds and makes the data available, but `_firstLoad` stays pending until the live query returns or fails. When the Galaxy database is unreachable — the exact scenario the snapshot feature exists for — the SQL connect attempt outlasts the 5s budget, so the caller waits the full 5 seconds before the budget elapses and the handler falls through to read the (already-restored) data. The result is correct, but the first browse calls after a cold offline start incur a needless ~5s latency, undercutting the feature's purpose.
|
||||
|
||||
**Recommendation:** Call `_firstLoad.TrySetResult()` at the end of `TryRestoreFromDiskAsync` once the restored entry is published — restored data is a valid completed first load. Add a regression test: a cache with a throwing repository plus a populated snapshot store should have `WaitForFirstLoadAsync` complete promptly after `RefreshAsync`, not block on the live query.
|
||||
|
||||
**Resolution:** Resolved in `bdccdbf` (2026-05-22): `TryRestoreFromDiskAsync` calls `_firstLoad.TrySetResult()` immediately after publishing the restored entry, so a restored snapshot satisfies the bootstrap gate without waiting on the live query. New test `GalaxyHierarchyCacheTests.RefreshAsync_RestoredSnapshotCompletesFirstLoadBeforeLiveQueryReturns` blocks the repository's deploy-time query and asserts `WaitForFirstLoadAsync` still completes from the snapshot.
|
||||
|
||||
### Server-034
|
||||
|
||||
| Field | Value |
|
||||
|---|---|
|
||||
| Severity | Low |
|
||||
| Category | Error handling & resilience |
|
||||
| Location | `src/MxGateway.Server/Galaxy/GalaxyHierarchySnapshotStore.cs:87-115` (`TryLoadAsync`) |
|
||||
| Status | Resolved |
|
||||
|
||||
**Description:** `TryLoadAsync` carries the `Try` prefix and its XML doc says it returns `null` "when none exists, persistence is disabled, or the on-disk file uses an unrecognized schema version." But a corrupt or partially written JSON file makes `JsonSerializer.DeserializeAsync` throw `JsonException`, and an unreadable file (locked, denied ACL) throws `IOException` / `UnauthorizedAccessException` — none of which the method catches. End-to-end behavior is still safe because the sole caller, `GalaxyHierarchyCache.TryRestoreFromDiskAsync`, wraps the call in a `catch (Exception)`; but the store's own `Try`-prefixed contract is violated, and any future caller would be surprised by the throw.
|
||||
|
||||
**Recommendation:** Catch `JsonException` and `IOException` (the latter covers the `UnauthorizedAccessException` family) inside `TryLoadAsync`, log a warning, and return `null` — consistent with the unrecognized-schema-version branch already present and with the `Try` naming. A corrupt cache file is an expected failure mode for a disk cache.
|
||||
|
||||
**Resolution:** Resolved in `bdccdbf` (2026-05-22): `TryLoadAsync` now has a `catch (Exception) when (exception is JsonException or IOException or UnauthorizedAccessException)` that logs a warning and returns `null`. New test `GalaxyHierarchySnapshotStoreTests.TryLoadAsync_WhenFileIsCorruptJson_ReturnsNull`.
|
||||
|
||||
### Server-035
|
||||
|
||||
| Field | Value |
|
||||
|---|---|
|
||||
| Severity | Low |
|
||||
| Category | Performance & resource management |
|
||||
| Location | `src/MxGateway.Server/Galaxy/GalaxyHierarchyCache.cs:176` (call site), `:327-352` (`PersistSnapshotAsync`) |
|
||||
| Status | Resolved |
|
||||
|
||||
**Description:** After a heavy refresh, `RefreshCoreAsync` `await`s `PersistSnapshotAsync` while still holding `_refreshGate`, and the `SaveAsync` write has no timeout. The only caller of `RefreshAsync` is the sequential `GalaxyHierarchyRefreshService` loop, so a write that hangs — e.g. a `SnapshotCachePath` pointed at an unresponsive network share — blocks the gate and stalls all subsequent cache refreshes until gateway shutdown. Impact is bounded: clients keep being served the last entry (which flips to `Stale` after the 5-minute threshold), so this is a degradation rather than an outage, and the default `C:\ProgramData` path is local disk where a hang is unlikely.
|
||||
|
||||
**Recommendation:** Bound the snapshot write with a timeout — a linked `CancellationTokenSource` cancelling after, say, the SQL `CommandTimeoutSeconds` budget — so a stuck write fails fast and logs rather than pinning the refresh loop. Moving the write off the gate is an alternative but would need its own write-serialization.
|
||||
|
||||
**Resolution:** Resolved in `bdccdbf` (2026-05-22): `SaveAsync` wraps the write in a `CancellationTokenSource.CreateLinkedTokenSource(cancellationToken)` cancelled after `Math.Max(1, CommandTimeoutSeconds)` seconds, so a stuck write fails fast instead of pinning the refresh loop. The timeout-expiry path itself is not unit-tested — exercising it would require a genuinely hanging filesystem.
|
||||
|
||||
### Server-036
|
||||
|
||||
| Field | Value |
|
||||
|---|---|
|
||||
| Severity | Low |
|
||||
| Category | Error handling & resilience |
|
||||
| Location | `src/MxGateway.Server/Galaxy/GalaxyHierarchyCache.cs:345-348` (`PersistSnapshotAsync` catch) |
|
||||
| Status | Resolved |
|
||||
|
||||
**Description:** `PersistSnapshotAsync` passes the refresh `CancellationToken` to `SaveAsync` and catches every exception — including the `OperationCanceledException` thrown when that token is cancelled at gateway shutdown — in its general `catch (Exception)`, logging it as `Warning: "Failed to persist the Galaxy hierarchy snapshot to disk."`. A snapshot write interrupted by a normal shutdown is not a failure, but it surfaces as a misleading warning every time the gateway stops mid-write.
|
||||
|
||||
**Recommendation:** Let a cancellation-driven `OperationCanceledException` pass without the warning — e.g. add `catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested) { }` before the general catch — matching the cancellation handling already used in `RefreshCoreAsync` and `TryRestoreFromDiskAsync`.
|
||||
|
||||
**Resolution:** Resolved in `bdccdbf` (2026-05-22): `PersistSnapshotAsync` has a `catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested)` ahead of the general catch, so a save aborted by gateway shutdown is silent while a genuine failure (including a write timeout) still logs. New test `GalaxyHierarchyCacheTests.RefreshAsync_WhenSnapshotSaveCancelledAtShutdown_DoesNotLogPersistFailure`.
|
||||
|
||||
### Server-037
|
||||
|
||||
| Field | Value |
|
||||
|---|---|
|
||||
| Severity | Low |
|
||||
| Category | Testing coverage |
|
||||
| Location | `src/MxGateway.Tests/Galaxy/GalaxyHierarchySnapshotStoreTests.cs`, `src/MxGateway.Tests/Galaxy/GalaxyHierarchyCacheTests.cs` |
|
||||
| Status | Resolved |
|
||||
|
||||
**Description:** The new snapshot tests cover the round-trip, missing-file, persistence-disabled, unrecognized-schema, and overwrite cases for the store, and the persist / restore-when-unreachable / promote-on-matching-deploy cases for the cache. Two resilience paths are untested: (1) `GalaxyHierarchyCache.TryRestoreFromDiskAsync`'s `catch` path when the snapshot file is corrupt — the cache must come up `Unavailable` rather than throwing; (2) the cache restore path when `PersistSnapshot = false` (the store yields `null` and the cache stays `Unavailable`). Both are the failure modes most likely to matter operationally.
|
||||
|
||||
**Recommendation:** Add a cache test that writes a corrupt snapshot file and asserts `RefreshAsync` with an unreachable repository leaves the cache `Unavailable` without throwing, and a test that confirms a `PersistSnapshot = false` store neither restores nor persists. If Server-034 is fixed, the corrupt-file test also pins the store's null-return.
|
||||
|
||||
**Resolution:** Resolved in `bdccdbf` (2026-05-22): added `GalaxyHierarchyCacheTests.RefreshAsync_WhenSnapshotFileCorrupt_ComesUpUnavailableWithoutThrowing` and `RefreshAsync_WhenPersistDisabled_DoesNotRestoreFromDisk`, plus the `TryLoadAsync_WhenFileIsCorruptJson_ReturnsNull` store test added for Server-034.
|
||||
|
||||
@@ -306,6 +306,12 @@ public sealed class GalaxyHierarchyCache : IGalaxyHierarchyCache
|
||||
attributes: snapshot.Attributes);
|
||||
Volatile.Write(ref _current, restored);
|
||||
|
||||
// Restored data is a valid completed first load: unblock callers waiting on
|
||||
// the bootstrap gate immediately, rather than making them wait out the full
|
||||
// wait budget for a live query that — when the database is unreachable, the
|
||||
// scenario this restore exists for — may not return for seconds.
|
||||
_firstLoad.TrySetResult();
|
||||
|
||||
_notifier.Publish(new GalaxyDeployEventInfo(
|
||||
Sequence: sequence,
|
||||
ObservedAt: _timeProvider.GetUtcNow(),
|
||||
@@ -342,6 +348,11 @@ public sealed class GalaxyHierarchyCache : IGalaxyHierarchyCache
|
||||
new GalaxyHierarchySnapshot(deployTime, savedAt, hierarchy, attributes),
|
||||
cancellationToken).ConfigureAwait(false);
|
||||
}
|
||||
catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested)
|
||||
{
|
||||
// The refresh was cancelled (gateway shutdown) before the write finished.
|
||||
// That is not a persistence failure — do not log it as a warning.
|
||||
}
|
||||
catch (Exception exception)
|
||||
{
|
||||
_logger?.LogWarning(exception, "Failed to persist the Galaxy hierarchy snapshot to disk.");
|
||||
|
||||
@@ -27,6 +27,7 @@ public sealed class GalaxyHierarchySnapshotStore : IGalaxyHierarchySnapshotStore
|
||||
};
|
||||
|
||||
private readonly string? _path;
|
||||
private readonly TimeSpan _writeTimeout;
|
||||
private readonly ILogger<GalaxyHierarchySnapshotStore>? _logger;
|
||||
private readonly SemaphoreSlim _ioGate = new(1, 1);
|
||||
|
||||
@@ -41,6 +42,7 @@ public sealed class GalaxyHierarchySnapshotStore : IGalaxyHierarchySnapshotStore
|
||||
_path = value.PersistSnapshot && !string.IsNullOrWhiteSpace(value.SnapshotCachePath)
|
||||
? value.SnapshotCachePath
|
||||
: null;
|
||||
_writeTimeout = TimeSpan.FromSeconds(Math.Max(1, value.CommandTimeoutSeconds));
|
||||
_logger = logger;
|
||||
}
|
||||
|
||||
@@ -58,6 +60,12 @@ public sealed class GalaxyHierarchySnapshotStore : IGalaxyHierarchySnapshotStore
|
||||
await _ioGate.WaitAsync(cancellationToken).ConfigureAwait(false);
|
||||
try
|
||||
{
|
||||
// Bound the write so a stuck disk — e.g. a SnapshotCachePath on an
|
||||
// unresponsive network share — cannot stall the caller. On the cache
|
||||
// refresh path that would otherwise pin the whole refresh loop.
|
||||
using CancellationTokenSource writeCts = CancellationTokenSource.CreateLinkedTokenSource(cancellationToken);
|
||||
writeCts.CancelAfter(_writeTimeout);
|
||||
|
||||
string? directory = Path.GetDirectoryName(_path);
|
||||
if (!string.IsNullOrEmpty(directory))
|
||||
{
|
||||
@@ -67,7 +75,7 @@ public sealed class GalaxyHierarchySnapshotStore : IGalaxyHierarchySnapshotStore
|
||||
string tempPath = _path + ".tmp";
|
||||
await using (FileStream stream = new(tempPath, FileMode.Create, FileAccess.Write, FileShare.None))
|
||||
{
|
||||
await JsonSerializer.SerializeAsync(stream, file, SerializerOptions, cancellationToken).ConfigureAwait(false);
|
||||
await JsonSerializer.SerializeAsync(stream, file, SerializerOptions, writeCts.Token).ConfigureAwait(false);
|
||||
}
|
||||
|
||||
File.Move(tempPath, _path, overwrite: true);
|
||||
@@ -111,6 +119,17 @@ public sealed class GalaxyHierarchySnapshotStore : IGalaxyHierarchySnapshotStore
|
||||
|
||||
return file.Snapshot;
|
||||
}
|
||||
catch (Exception exception) when (exception is JsonException or IOException or UnauthorizedAccessException)
|
||||
{
|
||||
// A corrupt, truncated, locked, or access-denied snapshot file is an
|
||||
// expected failure mode for a disk cache — honor the Try contract and
|
||||
// return null rather than throwing.
|
||||
_logger?.LogWarning(
|
||||
exception,
|
||||
"Ignoring Galaxy hierarchy snapshot at {Path}: the file is unreadable or not valid JSON.",
|
||||
_path);
|
||||
return null;
|
||||
}
|
||||
finally
|
||||
{
|
||||
_ioGate.Release();
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
using Microsoft.Extensions.Logging;
|
||||
using Microsoft.Extensions.Options;
|
||||
using MxGateway.Server.Galaxy;
|
||||
using MxGateway.Contracts.Proto.Galaxy;
|
||||
@@ -208,6 +209,108 @@ public sealed class GalaxyHierarchyCacheTests : IDisposable
|
||||
Assert.Equal(0, repository.GetAttributesCount);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Verifies that a restored on-disk snapshot completes the first-load gate
|
||||
/// immediately, so a browse call racing the first refresh is not blocked for
|
||||
/// the full bootstrap budget while the live Galaxy query is still running.
|
||||
/// Regression test for Server-033.
|
||||
/// </summary>
|
||||
[Fact]
|
||||
public async Task RefreshAsync_RestoredSnapshotCompletesFirstLoadBeforeLiveQueryReturns()
|
||||
{
|
||||
GalaxyHierarchySnapshotStore store = CreateStore();
|
||||
await store.SaveAsync(
|
||||
new GalaxyHierarchySnapshot(
|
||||
LastDeployTime: new DateTimeOffset(2026, 5, 20, 9, 0, 0, TimeSpan.Zero),
|
||||
SavedAt: new DateTimeOffset(2026, 5, 20, 9, 1, 0, TimeSpan.Zero),
|
||||
Hierarchy: [SampleHierarchyRow()],
|
||||
Attributes: [SampleAttributeRow()]),
|
||||
CancellationToken.None);
|
||||
|
||||
GalaxyDeployNotifier notifier = new();
|
||||
BlockingGalaxyRepository repository = new();
|
||||
GalaxyHierarchyCache cache = new(repository, notifier, new ManualTimeProvider(), snapshotStore: store);
|
||||
|
||||
Task refresh = cache.RefreshAsync(CancellationToken.None);
|
||||
|
||||
// The live query is blocked inside the repository; first-load must still
|
||||
// complete — from the restored snapshot — well within the wait budget.
|
||||
await cache.WaitForFirstLoadAsync(CancellationToken.None).WaitAsync(TimeSpan.FromSeconds(5));
|
||||
Assert.True(cache.Current.HasData);
|
||||
Assert.Equal(GalaxyCacheStatus.Stale, cache.Current.Status);
|
||||
|
||||
repository.Release();
|
||||
await refresh.WaitAsync(TimeSpan.FromSeconds(5));
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Verifies a corrupt on-disk snapshot does not crash startup: the cache
|
||||
/// ignores the unreadable file and comes up Unavailable when the database is
|
||||
/// also unreachable. Regression test for Server-037.
|
||||
/// </summary>
|
||||
[Fact]
|
||||
public async Task RefreshAsync_WhenSnapshotFileCorrupt_ComesUpUnavailableWithoutThrowing()
|
||||
{
|
||||
string path = CreateTempPath();
|
||||
await File.WriteAllTextAsync(path, "{ this is not valid json");
|
||||
GalaxyHierarchySnapshotStore store = CreateStore(path);
|
||||
|
||||
GalaxyDeployNotifier notifier = new();
|
||||
ThrowingGalaxyRepository repository = new(new InvalidOperationException("Galaxy repository unreachable"));
|
||||
GalaxyHierarchyCache cache = new(repository, notifier, new ManualTimeProvider(), snapshotStore: store);
|
||||
|
||||
await cache.RefreshAsync(CancellationToken.None);
|
||||
|
||||
Assert.Equal(GalaxyCacheStatus.Unavailable, cache.Current.Status);
|
||||
Assert.False(cache.Current.HasData);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Verifies that with snapshot persistence disabled the cache does not
|
||||
/// restore from disk — an unreachable database leaves it Unavailable.
|
||||
/// Regression test for Server-037.
|
||||
/// </summary>
|
||||
[Fact]
|
||||
public async Task RefreshAsync_WhenPersistDisabled_DoesNotRestoreFromDisk()
|
||||
{
|
||||
GalaxyHierarchySnapshotStore store = CreateStore(CreateTempPath(), persist: false);
|
||||
|
||||
GalaxyDeployNotifier notifier = new();
|
||||
ThrowingGalaxyRepository repository = new(new InvalidOperationException("Galaxy repository unreachable"));
|
||||
GalaxyHierarchyCache cache = new(repository, notifier, new ManualTimeProvider(), snapshotStore: store);
|
||||
|
||||
await cache.RefreshAsync(CancellationToken.None);
|
||||
|
||||
Assert.Equal(GalaxyCacheStatus.Unavailable, cache.Current.Status);
|
||||
Assert.False(cache.Current.HasData);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Verifies that a snapshot save aborted because the gateway is shutting down
|
||||
/// (the refresh token is cancelled) is not logged as a persistence failure.
|
||||
/// Regression test for Server-036.
|
||||
/// </summary>
|
||||
[Fact]
|
||||
public async Task RefreshAsync_WhenSnapshotSaveCancelledAtShutdown_DoesNotLogPersistFailure()
|
||||
{
|
||||
using CancellationTokenSource cts = new();
|
||||
GalaxyDeployNotifier notifier = new();
|
||||
StubGalaxyRepository repository = new(
|
||||
deployTime: new DateTime(2026, 5, 20, 9, 0, 0, DateTimeKind.Utc),
|
||||
hierarchy: [SampleHierarchyRow()],
|
||||
attributes: [SampleAttributeRow()]);
|
||||
CancellingSaveStore store = new(cts);
|
||||
RecordingLogger<GalaxyHierarchyCache> logger = new();
|
||||
GalaxyHierarchyCache cache = new(repository, notifier, new ManualTimeProvider(), logger, store);
|
||||
|
||||
await cache.RefreshAsync(cts.Token);
|
||||
|
||||
Assert.DoesNotContain(
|
||||
logger.Entries,
|
||||
entry => entry.Level == LogLevel.Warning
|
||||
&& entry.Message.Contains("persist", StringComparison.OrdinalIgnoreCase));
|
||||
}
|
||||
|
||||
private static GalaxyHierarchyRow SampleHierarchyRow() => new()
|
||||
{
|
||||
GobjectId = 99,
|
||||
@@ -228,20 +331,93 @@ public sealed class GalaxyHierarchyCacheTests : IDisposable
|
||||
DataTypeName = "Float",
|
||||
};
|
||||
|
||||
private GalaxyHierarchySnapshotStore CreateStore()
|
||||
private string CreateTempPath()
|
||||
{
|
||||
string path = Path.Combine(
|
||||
Path.GetTempPath(),
|
||||
$"mxgw-galaxy-cache-test-{Guid.NewGuid():N}.json");
|
||||
_tempPaths.Add(path);
|
||||
return path;
|
||||
}
|
||||
|
||||
private GalaxyHierarchySnapshotStore CreateStore() => CreateStore(CreateTempPath());
|
||||
|
||||
private static GalaxyHierarchySnapshotStore CreateStore(string path, bool persist = true)
|
||||
{
|
||||
GalaxyRepositoryOptions options = new()
|
||||
{
|
||||
PersistSnapshot = true,
|
||||
PersistSnapshot = persist,
|
||||
SnapshotCachePath = path,
|
||||
};
|
||||
return new GalaxyHierarchySnapshotStore(Options.Create(options));
|
||||
}
|
||||
|
||||
/// <summary><see cref="IGalaxyRepository"/> whose deploy-time query blocks until released.</summary>
|
||||
private sealed class BlockingGalaxyRepository : IGalaxyRepository
|
||||
{
|
||||
private readonly TaskCompletionSource _release = new(TaskCreationOptions.RunContinuationsAsynchronously);
|
||||
|
||||
public void Release() => _release.TrySetResult();
|
||||
|
||||
public Task<bool> TestConnectionAsync(CancellationToken ct = default) => Task.FromResult(false);
|
||||
|
||||
public async Task<DateTime?> GetLastDeployTimeAsync(CancellationToken ct = default)
|
||||
{
|
||||
await _release.Task.WaitAsync(ct).ConfigureAwait(false);
|
||||
throw new InvalidOperationException("Galaxy repository unreachable");
|
||||
}
|
||||
|
||||
public Task<List<GalaxyHierarchyRow>> GetHierarchyAsync(CancellationToken ct = default)
|
||||
=> throw new InvalidOperationException("GetHierarchyAsync should not be reached");
|
||||
|
||||
public Task<List<GalaxyAttributeRow>> GetAttributesAsync(CancellationToken ct = default)
|
||||
=> throw new InvalidOperationException("GetAttributesAsync should not be reached");
|
||||
}
|
||||
|
||||
/// <summary>Snapshot store whose <see cref="SaveAsync"/> cancels the token mid-save.</summary>
|
||||
private sealed class CancellingSaveStore(CancellationTokenSource cts) : IGalaxyHierarchySnapshotStore
|
||||
{
|
||||
public Task<GalaxyHierarchySnapshot?> TryLoadAsync(CancellationToken cancellationToken)
|
||||
=> Task.FromResult<GalaxyHierarchySnapshot?>(null);
|
||||
|
||||
public Task SaveAsync(GalaxyHierarchySnapshot snapshot, CancellationToken cancellationToken)
|
||||
{
|
||||
cts.Cancel();
|
||||
cancellationToken.ThrowIfCancellationRequested();
|
||||
return Task.CompletedTask;
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>Minimal <see cref="ILogger{T}"/> that records every emitted log entry.</summary>
|
||||
private sealed class RecordingLogger<T> : ILogger<T>
|
||||
{
|
||||
public List<(LogLevel Level, string Message)> Entries { get; } = [];
|
||||
|
||||
public IDisposable BeginScope<TState>(TState state)
|
||||
where TState : notnull => NullScope.Instance;
|
||||
|
||||
public bool IsEnabled(LogLevel logLevel) => true;
|
||||
|
||||
public void Log<TState>(
|
||||
LogLevel logLevel,
|
||||
EventId eventId,
|
||||
TState state,
|
||||
Exception? exception,
|
||||
Func<TState, Exception?, string> formatter)
|
||||
{
|
||||
Entries.Add((logLevel, formatter(state, exception)));
|
||||
}
|
||||
|
||||
private sealed class NullScope : IDisposable
|
||||
{
|
||||
public static readonly NullScope Instance = new();
|
||||
|
||||
public void Dispose()
|
||||
{
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>In-memory <see cref="IGalaxyRepository"/> that returns fixed rowsets.</summary>
|
||||
private sealed class StubGalaxyRepository(
|
||||
DateTime? deployTime,
|
||||
|
||||
@@ -59,6 +59,16 @@ public sealed class GalaxyHierarchySnapshotStoreTests : IDisposable
|
||||
Assert.Null(await store.TryLoadAsync(CancellationToken.None));
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task TryLoadAsync_WhenFileIsCorruptJson_ReturnsNull()
|
||||
{
|
||||
string path = CreateTempPath();
|
||||
await File.WriteAllTextAsync(path, "{ this is not valid json");
|
||||
GalaxyHierarchySnapshotStore store = CreateStore(path);
|
||||
|
||||
Assert.Null(await store.TryLoadAsync(CancellationToken.None));
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task TryLoadAsync_WhenSchemaVersionUnrecognized_ReturnsNull()
|
||||
{
|
||||
|
||||
Reference in New Issue
Block a user