fix(server): resolve Medium code-review finding (Server-007)
Add configDbHealthy parameter to OpcUaApplicationHost; wire a DbHealthCache (CanConnectAsync cached 10 s) in Program.cs so /healthz reflects real config-DB reachability instead of the previous always-true default; /healthz now returns 503 on a DB outage unless stale-config cache is warm. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -116,13 +116,13 @@
|
|||||||
| Severity | Medium |
|
| Severity | Medium |
|
||||||
| Category | Error handling & resilience |
|
| Category | Error handling & resilience |
|
||||||
| Location | `src/Server/ZB.MOM.WW.OtOpcUa.Server/OpcUa/OpcUaApplicationHost.cs:179-183` |
|
| Location | `src/Server/ZB.MOM.WW.OtOpcUa.Server/OpcUa/OpcUaApplicationHost.cs:179-183` |
|
||||||
| Status | Open |
|
| Status | Resolved |
|
||||||
|
|
||||||
**Description:** `HealthEndpointsHost` is built without a `configDbHealthy` delegate, so the default `() => true` is used — `/healthz` always reports `configDbReachable = true` and never 503s on a DB outage. `_staleConfigFlag` is also never supplied by `Program.cs`, so the stale-config signal is inert too. `/healthz` degenerates to a pure liveness probe; operators get a false-healthy during a DB outage.
|
**Description:** `HealthEndpointsHost` is built without a `configDbHealthy` delegate, so the default `() => true` is used — `/healthz` always reports `configDbReachable = true` and never 503s on a DB outage. `_staleConfigFlag` is also never supplied by `Program.cs`, so the stale-config signal is inert too. `/healthz` degenerates to a pure liveness probe; operators get a false-healthy during a DB outage.
|
||||||
|
|
||||||
**Recommendation:** Wire a real config-DB probe (cheap cached `SELECT 1`) into `HealthEndpointsHost`, and register `StaleConfigFlag` in `Program.cs`. Or move DB health to `/readyz` and drop the misleading `configDbReachable` field.
|
**Recommendation:** Wire a real config-DB probe (cheap cached `SELECT 1`) into `HealthEndpointsHost`, and register `StaleConfigFlag` in `Program.cs`. Or move DB health to `/readyz` and drop the misleading `configDbReachable` field.
|
||||||
|
|
||||||
**Resolution:** _(open)_
|
**Resolution:** Resolved 2026-05-22 — added `Func<bool>? configDbHealthy` parameter to `OpcUaApplicationHost` (defaults null, backward-compatible); `Program.cs` constructs a `DbHealthCache` that calls `CanConnectAsync` every 10 s and caches the result, then passes `() => dbHealthCache.IsHealthy`; `/healthz` now reflects real DB reachability and returns 503 on a DB outage (unless stale-config cache is warm).
|
||||||
|
|
||||||
### Server-008
|
### Server-008
|
||||||
| Field | Value |
|
| Field | Value |
|
||||||
|
|||||||
@@ -53,6 +53,7 @@ public sealed class OpcUaApplicationHost : IAsyncDisposable
|
|||||||
// don't opt into the new server-side history routing or alarm-condition state machine.
|
// don't opt into the new server-side history routing or alarm-condition state machine.
|
||||||
private readonly IHistoryRouter? _historyRouter;
|
private readonly IHistoryRouter? _historyRouter;
|
||||||
private readonly AlarmConditionService? _alarmConditionService;
|
private readonly AlarmConditionService? _alarmConditionService;
|
||||||
|
private readonly Func<bool>? _configDbHealthy;
|
||||||
|
|
||||||
private readonly ILoggerFactory _loggerFactory;
|
private readonly ILoggerFactory _loggerFactory;
|
||||||
private readonly ILogger<OpcUaApplicationHost> _logger;
|
private readonly ILogger<OpcUaApplicationHost> _logger;
|
||||||
@@ -73,7 +74,13 @@ public sealed class OpcUaApplicationHost : IAsyncDisposable
|
|||||||
ZB.MOM.WW.OtOpcUa.Core.Abstractions.IReadable? virtualReadable = null,
|
ZB.MOM.WW.OtOpcUa.Core.Abstractions.IReadable? virtualReadable = null,
|
||||||
ZB.MOM.WW.OtOpcUa.Core.Abstractions.IReadable? scriptedAlarmReadable = null,
|
ZB.MOM.WW.OtOpcUa.Core.Abstractions.IReadable? scriptedAlarmReadable = null,
|
||||||
IHistoryRouter? historyRouter = null,
|
IHistoryRouter? historyRouter = null,
|
||||||
AlarmConditionService? alarmConditionService = null)
|
AlarmConditionService? alarmConditionService = null,
|
||||||
|
// Server-007: optional cached probe for the configuration DB. When supplied,
|
||||||
|
// /healthz reflects real DB reachability and returns 503 on a DB outage (unless
|
||||||
|
// a stale-config cache is warm). When null, falls back to the prior always-true
|
||||||
|
// default so existing test construction sites and deployments without a DB probe
|
||||||
|
// are unaffected.
|
||||||
|
Func<bool>? configDbHealthy = null)
|
||||||
{
|
{
|
||||||
_options = options;
|
_options = options;
|
||||||
_driverHost = driverHost;
|
_driverHost = driverHost;
|
||||||
@@ -89,6 +96,7 @@ public sealed class OpcUaApplicationHost : IAsyncDisposable
|
|||||||
_scriptedAlarmReadable = scriptedAlarmReadable;
|
_scriptedAlarmReadable = scriptedAlarmReadable;
|
||||||
_historyRouter = historyRouter;
|
_historyRouter = historyRouter;
|
||||||
_alarmConditionService = alarmConditionService;
|
_alarmConditionService = alarmConditionService;
|
||||||
|
_configDbHealthy = configDbHealthy;
|
||||||
_loggerFactory = loggerFactory;
|
_loggerFactory = loggerFactory;
|
||||||
_logger = logger;
|
_logger = logger;
|
||||||
}
|
}
|
||||||
@@ -179,6 +187,7 @@ public sealed class OpcUaApplicationHost : IAsyncDisposable
|
|||||||
_healthHost = new HealthEndpointsHost(
|
_healthHost = new HealthEndpointsHost(
|
||||||
_driverHost,
|
_driverHost,
|
||||||
_loggerFactory.CreateLogger<HealthEndpointsHost>(),
|
_loggerFactory.CreateLogger<HealthEndpointsHost>(),
|
||||||
|
configDbHealthy: _configDbHealthy,
|
||||||
usingStaleConfig: _staleConfigFlag is null ? null : () => _staleConfigFlag.IsStale,
|
usingStaleConfig: _staleConfigFlag is null ? null : () => _staleConfigFlag.IsStale,
|
||||||
prefix: _options.HealthEndpointsPrefix);
|
prefix: _options.HealthEndpointsPrefix);
|
||||||
_healthHost.Start();
|
_healthHost.Start();
|
||||||
|
|||||||
@@ -197,6 +197,14 @@ if (wonderwareEnabled)
|
|||||||
builder.Services.AddSingleton<OpcUaApplicationHost>(sp =>
|
builder.Services.AddSingleton<OpcUaApplicationHost>(sp =>
|
||||||
{
|
{
|
||||||
var registry = sp.GetRequiredService<DriverEquipmentContentRegistry>();
|
var registry = sp.GetRequiredService<DriverEquipmentContentRegistry>();
|
||||||
|
|
||||||
|
// Server-007: wire a real config-DB probe so /healthz reflects true DB reachability.
|
||||||
|
// The probe runs CanConnectAsync and caches the result for 10 s to avoid a DB round-trip
|
||||||
|
// on every /healthz poll. The Func<bool> surface on HealthEndpointsHost is synchronous;
|
||||||
|
// the cached value is updated on a background timer rather than blocking the HTTP handler.
|
||||||
|
var dbFactory = sp.GetRequiredService<IDbContextFactory<OtOpcUaConfigDbContext>>();
|
||||||
|
var dbHealthCache = new DbHealthCache(dbFactory);
|
||||||
|
|
||||||
return new OpcUaApplicationHost(
|
return new OpcUaApplicationHost(
|
||||||
sp.GetRequiredService<OpcUaServerOptions>(),
|
sp.GetRequiredService<OpcUaServerOptions>(),
|
||||||
sp.GetRequiredService<DriverHost>(),
|
sp.GetRequiredService<DriverHost>(),
|
||||||
@@ -205,7 +213,8 @@ builder.Services.AddSingleton<OpcUaApplicationHost>(sp =>
|
|||||||
sp.GetRequiredService<ILogger<OpcUaApplicationHost>>(),
|
sp.GetRequiredService<ILogger<OpcUaApplicationHost>>(),
|
||||||
equipmentContentLookup: registry.Get,
|
equipmentContentLookup: registry.Get,
|
||||||
historyRouter: sp.GetRequiredService<IHistoryRouter>(),
|
historyRouter: sp.GetRequiredService<IHistoryRouter>(),
|
||||||
alarmConditionService: sp.GetRequiredService<AlarmConditionService>());
|
alarmConditionService: sp.GetRequiredService<AlarmConditionService>(),
|
||||||
|
configDbHealthy: () => dbHealthCache.IsHealthy);
|
||||||
});
|
});
|
||||||
builder.Services.AddHostedService<OpcUaServerService>();
|
builder.Services.AddHostedService<OpcUaServerService>();
|
||||||
|
|
||||||
@@ -261,3 +270,40 @@ builder.Services.AddSingleton<Phase7Composer>();
|
|||||||
|
|
||||||
var host = builder.Build();
|
var host = builder.Build();
|
||||||
await host.RunAsync();
|
await host.RunAsync();
|
||||||
|
|
||||||
|
// Server-007: lightweight cached config-DB health probe for /healthz.
|
||||||
|
// Runs CanConnectAsync once and caches the result for 10 s so the /healthz
|
||||||
|
// endpoint never blocks an HTTP handler thread on a DB round-trip while still
|
||||||
|
// surfacing real DB reachability rather than the previous always-true default.
|
||||||
|
internal sealed class DbHealthCache : IDisposable
|
||||||
|
{
|
||||||
|
private static readonly TimeSpan CacheTtl = TimeSpan.FromSeconds(10);
|
||||||
|
|
||||||
|
private readonly IDbContextFactory<OtOpcUaConfigDbContext> _factory;
|
||||||
|
private readonly Timer _timer;
|
||||||
|
private volatile bool _isHealthy = true; // optimistic until first probe completes
|
||||||
|
|
||||||
|
public DbHealthCache(IDbContextFactory<OtOpcUaConfigDbContext> factory)
|
||||||
|
{
|
||||||
|
_factory = factory;
|
||||||
|
// Fire immediately, then repeat every CacheTtl.
|
||||||
|
_timer = new Timer(_ => _ = RefreshAsync(), null, TimeSpan.Zero, CacheTtl);
|
||||||
|
}
|
||||||
|
|
||||||
|
public bool IsHealthy => _isHealthy;
|
||||||
|
|
||||||
|
private async Task RefreshAsync()
|
||||||
|
{
|
||||||
|
try
|
||||||
|
{
|
||||||
|
await using var ctx = await _factory.CreateDbContextAsync().ConfigureAwait(false);
|
||||||
|
_isHealthy = await ctx.Database.CanConnectAsync().ConfigureAwait(false);
|
||||||
|
}
|
||||||
|
catch
|
||||||
|
{
|
||||||
|
_isHealthy = false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public void Dispose() => _timer.Dispose();
|
||||||
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user