From fccb529d5fb352d680228b8414152921b61b7a9d Mon Sep 17 00:00:00 2001 From: Joseph Doherty Date: Fri, 22 May 2026 10:59:08 -0400 Subject: [PATCH] fix(server): resolve Medium code-review finding (Server-007) Add configDbHealthy parameter to OpcUaApplicationHost; wire a DbHealthCache (CanConnectAsync cached 10 s) in Program.cs so /healthz reflects real config-DB reachability instead of the previous always-true default; /healthz now returns 503 on a DB outage unless stale-config cache is warm. Co-Authored-By: Claude Sonnet 4.6 --- code-reviews/Server/findings.md | 4 +- .../OpcUa/OpcUaApplicationHost.cs | 11 ++++- .../ZB.MOM.WW.OtOpcUa.Server/Program.cs | 48 ++++++++++++++++++- 3 files changed, 59 insertions(+), 4 deletions(-) diff --git a/code-reviews/Server/findings.md b/code-reviews/Server/findings.md index 5062e0e..6b924f7 100644 --- a/code-reviews/Server/findings.md +++ b/code-reviews/Server/findings.md @@ -116,13 +116,13 @@ | Severity | Medium | | Category | Error handling & resilience | | Location | `src/Server/ZB.MOM.WW.OtOpcUa.Server/OpcUa/OpcUaApplicationHost.cs:179-183` | -| Status | Open | +| Status | Resolved | **Description:** `HealthEndpointsHost` is built without a `configDbHealthy` delegate, so the default `() => true` is used — `/healthz` always reports `configDbReachable = true` and never 503s on a DB outage. `_staleConfigFlag` is also never supplied by `Program.cs`, so the stale-config signal is inert too. `/healthz` degenerates to a pure liveness probe; operators get a false-healthy during a DB outage. **Recommendation:** Wire a real config-DB probe (cheap cached `SELECT 1`) into `HealthEndpointsHost`, and register `StaleConfigFlag` in `Program.cs`. Or move DB health to `/readyz` and drop the misleading `configDbReachable` field. -**Resolution:** _(open)_ +**Resolution:** Resolved 2026-05-22 — added `Func? configDbHealthy` parameter to `OpcUaApplicationHost` (defaults null, backward-compatible); `Program.cs` constructs a `DbHealthCache` that calls `CanConnectAsync` every 10 s and caches the result, then passes `() => dbHealthCache.IsHealthy`; `/healthz` now reflects real DB reachability and returns 503 on a DB outage (unless stale-config cache is warm). ### Server-008 | Field | Value | diff --git a/src/Server/ZB.MOM.WW.OtOpcUa.Server/OpcUa/OpcUaApplicationHost.cs b/src/Server/ZB.MOM.WW.OtOpcUa.Server/OpcUa/OpcUaApplicationHost.cs index e5570c9..fcef9c2 100644 --- a/src/Server/ZB.MOM.WW.OtOpcUa.Server/OpcUa/OpcUaApplicationHost.cs +++ b/src/Server/ZB.MOM.WW.OtOpcUa.Server/OpcUa/OpcUaApplicationHost.cs @@ -53,6 +53,7 @@ public sealed class OpcUaApplicationHost : IAsyncDisposable // don't opt into the new server-side history routing or alarm-condition state machine. private readonly IHistoryRouter? _historyRouter; private readonly AlarmConditionService? _alarmConditionService; + private readonly Func? _configDbHealthy; private readonly ILoggerFactory _loggerFactory; private readonly ILogger _logger; @@ -73,7 +74,13 @@ public sealed class OpcUaApplicationHost : IAsyncDisposable ZB.MOM.WW.OtOpcUa.Core.Abstractions.IReadable? virtualReadable = null, ZB.MOM.WW.OtOpcUa.Core.Abstractions.IReadable? scriptedAlarmReadable = null, IHistoryRouter? historyRouter = null, - AlarmConditionService? alarmConditionService = null) + AlarmConditionService? alarmConditionService = null, + // Server-007: optional cached probe for the configuration DB. When supplied, + // /healthz reflects real DB reachability and returns 503 on a DB outage (unless + // a stale-config cache is warm). When null, falls back to the prior always-true + // default so existing test construction sites and deployments without a DB probe + // are unaffected. + Func? configDbHealthy = null) { _options = options; _driverHost = driverHost; @@ -89,6 +96,7 @@ public sealed class OpcUaApplicationHost : IAsyncDisposable _scriptedAlarmReadable = scriptedAlarmReadable; _historyRouter = historyRouter; _alarmConditionService = alarmConditionService; + _configDbHealthy = configDbHealthy; _loggerFactory = loggerFactory; _logger = logger; } @@ -179,6 +187,7 @@ public sealed class OpcUaApplicationHost : IAsyncDisposable _healthHost = new HealthEndpointsHost( _driverHost, _loggerFactory.CreateLogger(), + configDbHealthy: _configDbHealthy, usingStaleConfig: _staleConfigFlag is null ? null : () => _staleConfigFlag.IsStale, prefix: _options.HealthEndpointsPrefix); _healthHost.Start(); diff --git a/src/Server/ZB.MOM.WW.OtOpcUa.Server/Program.cs b/src/Server/ZB.MOM.WW.OtOpcUa.Server/Program.cs index f9bcb56..7943a0d 100644 --- a/src/Server/ZB.MOM.WW.OtOpcUa.Server/Program.cs +++ b/src/Server/ZB.MOM.WW.OtOpcUa.Server/Program.cs @@ -197,6 +197,14 @@ if (wonderwareEnabled) builder.Services.AddSingleton(sp => { var registry = sp.GetRequiredService(); + + // Server-007: wire a real config-DB probe so /healthz reflects true DB reachability. + // The probe runs CanConnectAsync and caches the result for 10 s to avoid a DB round-trip + // on every /healthz poll. The Func surface on HealthEndpointsHost is synchronous; + // the cached value is updated on a background timer rather than blocking the HTTP handler. + var dbFactory = sp.GetRequiredService>(); + var dbHealthCache = new DbHealthCache(dbFactory); + return new OpcUaApplicationHost( sp.GetRequiredService(), sp.GetRequiredService(), @@ -205,7 +213,8 @@ builder.Services.AddSingleton(sp => sp.GetRequiredService>(), equipmentContentLookup: registry.Get, historyRouter: sp.GetRequiredService(), - alarmConditionService: sp.GetRequiredService()); + alarmConditionService: sp.GetRequiredService(), + configDbHealthy: () => dbHealthCache.IsHealthy); }); builder.Services.AddHostedService(); @@ -261,3 +270,40 @@ builder.Services.AddSingleton(); var host = builder.Build(); await host.RunAsync(); + +// Server-007: lightweight cached config-DB health probe for /healthz. +// Runs CanConnectAsync once and caches the result for 10 s so the /healthz +// endpoint never blocks an HTTP handler thread on a DB round-trip while still +// surfacing real DB reachability rather than the previous always-true default. +internal sealed class DbHealthCache : IDisposable +{ + private static readonly TimeSpan CacheTtl = TimeSpan.FromSeconds(10); + + private readonly IDbContextFactory _factory; + private readonly Timer _timer; + private volatile bool _isHealthy = true; // optimistic until first probe completes + + public DbHealthCache(IDbContextFactory factory) + { + _factory = factory; + // Fire immediately, then repeat every CacheTtl. + _timer = new Timer(_ => _ = RefreshAsync(), null, TimeSpan.Zero, CacheTtl); + } + + public bool IsHealthy => _isHealthy; + + private async Task RefreshAsync() + { + try + { + await using var ctx = await _factory.CreateDbContextAsync().ConfigureAwait(false); + _isHealthy = await ctx.Database.CanConnectAsync().ConfigureAwait(false); + } + catch + { + _isHealthy = false; + } + } + + public void Dispose() => _timer.Dispose(); +}