Phase 6.1 Stream C — health endpoints on :4841 + LogContextEnricher + Serilog JSON sink + CapabilityInvoker enrichment
Closes Stream C per docs/v2/implementation/phase-6-1-resilience-and-observability.md. Core.Observability (new namespace): - DriverHealthReport — pure-function aggregation over DriverHealthSnapshot list. Empty fleet = Healthy. Any Faulted = Faulted. Any Unknown/Initializing (no Faulted) = NotReady. Any Degraded or Reconnecting (no Faulted, no NotReady) = Degraded. Else Healthy. HttpStatus(verdict) maps to the Stream C.1 state matrix: Healthy/Degraded → 200, NotReady/Faulted → 503. - LogContextEnricher — Serilog LogContext wrapper. Push(id, type, capability, correlationId) returns an IDisposable scope; inner log calls carry DriverInstanceId / DriverType / CapabilityName / CorrelationId structured properties automatically. NewCorrelationId = 12-hex-char GUID slice for cases where no OPC UA RequestHeader.RequestHandle is in flight. CapabilityInvoker — now threads LogContextEnricher around every ExecuteAsync / ExecuteWriteAsync call site. OtOpcUaServer passes driver.DriverType through so logs correlate to the driver type too. Every capability call emits structured fields per the Stream C.4 compliance check. Server.Observability: - HealthEndpointsHost — standalone HttpListener on http://localhost:4841/ (loopback avoids Windows URL-ACL elevation; remote probing via reverse proxy or explicit netsh urlacl grant). Routes: /healthz → 200 when (configDbReachable OR usingStaleConfig); 503 otherwise. Body: status, uptimeSeconds, configDbReachable, usingStaleConfig. /readyz → DriverHealthReport.Aggregate + HttpStatus mapping. Body: verdict, drivers[], degradedDrivers[], uptimeSeconds. anything else → 404. Disposal cooperative with the HttpListener shutdown. - OpcUaApplicationHost starts the health host after the OPC UA server comes up and disposes it on shutdown. New OpcUaServerOptions knobs: HealthEndpointsEnabled (default true), HealthEndpointsPrefix (default http://localhost:4841/). Program.cs: - Serilog pipeline adds Enrich.FromLogContext + opt-in JSON file sink via `Serilog:WriteJson = true` appsetting. Uses Serilog.Formatting.Compact's CompactJsonFormatter (one JSON object per line — SIEMs like Splunk, Datadog, Graylog ingest without a regex parser). Server.Tests: - Existing 3 OpcUaApplicationHost integration tests now set HealthEndpointsEnabled=false to avoid port :4841 collisions under parallel execution. - New HealthEndpointsHostTests (9): /healthz healthy empty fleet; stale-config returns 200 with flag; unreachable+no-cache returns 503; /readyz empty/ Healthy/Faulted/Degraded/Initializing drivers return correct status and bodies; unknown path → 404. Uses ephemeral ports via Interlocked counter. Core.Tests: - DriverHealthReportTests (8): empty fleet, all-healthy, any-Faulted trumps, any-NotReady without Faulted, Degraded without Faulted/NotReady, HttpStatus per-verdict theory. - LogContextEnricherTests (8): all 4 properties attach; scope disposes cleanly; NewCorrelationId shape; null/whitespace driverInstanceId throws. - CapabilityInvokerEnrichmentTests (2): inner logs carry structured properties; no context leak outside the call site. Full solution dotnet test: 1016 passing (baseline 906, +110 for Phase 6.1 so far across Streams A+B+C). Pre-existing Client.CLI Subscribe flake unchanged. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,181 @@
|
||||
using System.Net;
|
||||
using System.Text;
|
||||
using System.Text.Json;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using ZB.MOM.WW.OtOpcUa.Core.Abstractions;
|
||||
using ZB.MOM.WW.OtOpcUa.Core.Hosting;
|
||||
using ZB.MOM.WW.OtOpcUa.Core.Observability;
|
||||
|
||||
namespace ZB.MOM.WW.OtOpcUa.Server.Observability;
|
||||
|
||||
/// <summary>
|
||||
/// Standalone <see cref="HttpListener"/> host for <c>/healthz</c> and <c>/readyz</c>
|
||||
/// separate from the OPC UA binding. Per <c>docs/v2/implementation/phase-6-1-resilience-
|
||||
/// and-observability.md</c> §Stream C.1.
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// Binds to <c>http://localhost:4841</c> by default — loopback avoids the Windows URL-ACL
|
||||
/// elevation requirement that binding to <c>http://+:4841</c> (wildcard) would impose.
|
||||
/// When a deployment needs remote probing, a reverse proxy or explicit netsh urlacl grant
|
||||
/// is the expected path; documented in <c>docs/v2/Server-Deployment.md</c> in a follow-up.
|
||||
/// </remarks>
|
||||
public sealed class HealthEndpointsHost : IAsyncDisposable
|
||||
{
|
||||
private readonly string _prefix;
|
||||
private readonly DriverHost _driverHost;
|
||||
private readonly Func<bool> _configDbHealthy;
|
||||
private readonly Func<bool> _usingStaleConfig;
|
||||
private readonly ILogger<HealthEndpointsHost> _logger;
|
||||
private readonly HttpListener _listener = new();
|
||||
private readonly DateTime _startedUtc = DateTime.UtcNow;
|
||||
private CancellationTokenSource? _cts;
|
||||
private Task? _acceptLoop;
|
||||
private bool _disposed;
|
||||
|
||||
public HealthEndpointsHost(
|
||||
DriverHost driverHost,
|
||||
ILogger<HealthEndpointsHost> logger,
|
||||
Func<bool>? configDbHealthy = null,
|
||||
Func<bool>? usingStaleConfig = null,
|
||||
string prefix = "http://localhost:4841/")
|
||||
{
|
||||
_driverHost = driverHost;
|
||||
_logger = logger;
|
||||
_configDbHealthy = configDbHealthy ?? (() => true);
|
||||
_usingStaleConfig = usingStaleConfig ?? (() => false);
|
||||
_prefix = prefix.EndsWith('/') ? prefix : prefix + "/";
|
||||
_listener.Prefixes.Add(_prefix);
|
||||
}
|
||||
|
||||
public void Start()
|
||||
{
|
||||
_listener.Start();
|
||||
_cts = new CancellationTokenSource();
|
||||
_acceptLoop = Task.Run(() => AcceptLoopAsync(_cts.Token));
|
||||
_logger.LogInformation("Health endpoints listening on {Prefix}", _prefix);
|
||||
}
|
||||
|
||||
private async Task AcceptLoopAsync(CancellationToken ct)
|
||||
{
|
||||
while (!ct.IsCancellationRequested)
|
||||
{
|
||||
HttpListenerContext ctx;
|
||||
try
|
||||
{
|
||||
ctx = await _listener.GetContextAsync().ConfigureAwait(false);
|
||||
}
|
||||
catch (HttpListenerException) when (ct.IsCancellationRequested) { break; }
|
||||
catch (ObjectDisposedException) { break; }
|
||||
|
||||
_ = Task.Run(() => HandleAsync(ctx), ct);
|
||||
}
|
||||
}
|
||||
|
||||
private async Task HandleAsync(HttpListenerContext ctx)
|
||||
{
|
||||
try
|
||||
{
|
||||
var path = ctx.Request.Url?.AbsolutePath ?? "/";
|
||||
switch (path)
|
||||
{
|
||||
case "/healthz":
|
||||
await WriteHealthzAsync(ctx).ConfigureAwait(false);
|
||||
break;
|
||||
case "/readyz":
|
||||
await WriteReadyzAsync(ctx).ConfigureAwait(false);
|
||||
break;
|
||||
default:
|
||||
ctx.Response.StatusCode = 404;
|
||||
break;
|
||||
}
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
_logger.LogWarning(ex, "Health endpoint handler failure");
|
||||
try { ctx.Response.StatusCode = 500; } catch { /* ignore */ }
|
||||
}
|
||||
finally
|
||||
{
|
||||
try { ctx.Response.Close(); } catch { /* ignore */ }
|
||||
}
|
||||
}
|
||||
|
||||
private async Task WriteHealthzAsync(HttpListenerContext ctx)
|
||||
{
|
||||
var configHealthy = _configDbHealthy();
|
||||
var staleConfig = _usingStaleConfig();
|
||||
// /healthz is 200 when process alive + (config DB reachable OR cache-warm).
|
||||
// Stale-config still serves 200 so the process isn't flagged dead when the DB
|
||||
// blips; the body surfaces the stale flag for operators.
|
||||
var healthy = configHealthy || staleConfig;
|
||||
ctx.Response.StatusCode = healthy ? 200 : 503;
|
||||
|
||||
var body = JsonSerializer.Serialize(new
|
||||
{
|
||||
status = healthy ? "healthy" : "unhealthy",
|
||||
uptimeSeconds = (int)(DateTime.UtcNow - _startedUtc).TotalSeconds,
|
||||
configDbReachable = configHealthy,
|
||||
usingStaleConfig = staleConfig,
|
||||
});
|
||||
await WriteBodyAsync(ctx, body).ConfigureAwait(false);
|
||||
}
|
||||
|
||||
private async Task WriteReadyzAsync(HttpListenerContext ctx)
|
||||
{
|
||||
var snapshots = BuildSnapshots();
|
||||
var verdict = DriverHealthReport.Aggregate(snapshots);
|
||||
ctx.Response.StatusCode = DriverHealthReport.HttpStatus(verdict);
|
||||
|
||||
var body = JsonSerializer.Serialize(new
|
||||
{
|
||||
verdict = verdict.ToString(),
|
||||
uptimeSeconds = (int)(DateTime.UtcNow - _startedUtc).TotalSeconds,
|
||||
drivers = snapshots.Select(d => new
|
||||
{
|
||||
id = d.DriverInstanceId,
|
||||
state = d.State.ToString(),
|
||||
detail = d.DetailMessage,
|
||||
}).ToArray(),
|
||||
degradedDrivers = snapshots
|
||||
.Where(d => d.State == DriverState.Degraded || d.State == DriverState.Reconnecting)
|
||||
.Select(d => d.DriverInstanceId)
|
||||
.ToArray(),
|
||||
});
|
||||
await WriteBodyAsync(ctx, body).ConfigureAwait(false);
|
||||
}
|
||||
|
||||
private IReadOnlyList<DriverHealthSnapshot> BuildSnapshots()
|
||||
{
|
||||
var list = new List<DriverHealthSnapshot>();
|
||||
foreach (var id in _driverHost.RegisteredDriverIds)
|
||||
{
|
||||
var driver = _driverHost.GetDriver(id);
|
||||
if (driver is null) continue;
|
||||
var health = driver.GetHealth();
|
||||
list.Add(new DriverHealthSnapshot(driver.DriverInstanceId, health.State, health.LastError));
|
||||
}
|
||||
return list;
|
||||
}
|
||||
|
||||
private static async Task WriteBodyAsync(HttpListenerContext ctx, string body)
|
||||
{
|
||||
var bytes = Encoding.UTF8.GetBytes(body);
|
||||
ctx.Response.ContentType = "application/json; charset=utf-8";
|
||||
ctx.Response.ContentLength64 = bytes.LongLength;
|
||||
await ctx.Response.OutputStream.WriteAsync(bytes).ConfigureAwait(false);
|
||||
}
|
||||
|
||||
public async ValueTask DisposeAsync()
|
||||
{
|
||||
if (_disposed) return;
|
||||
_disposed = true;
|
||||
_cts?.Cancel();
|
||||
try { _listener.Stop(); } catch { /* ignore */ }
|
||||
if (_acceptLoop is not null)
|
||||
{
|
||||
try { await _acceptLoop.ConfigureAwait(false); } catch { /* ignore */ }
|
||||
}
|
||||
_listener.Close();
|
||||
_cts?.Dispose();
|
||||
}
|
||||
}
|
||||
@@ -4,6 +4,7 @@ using Opc.Ua.Configuration;
|
||||
using ZB.MOM.WW.OtOpcUa.Core.Hosting;
|
||||
using ZB.MOM.WW.OtOpcUa.Core.OpcUa;
|
||||
using ZB.MOM.WW.OtOpcUa.Core.Resilience;
|
||||
using ZB.MOM.WW.OtOpcUa.Server.Observability;
|
||||
using ZB.MOM.WW.OtOpcUa.Server.Security;
|
||||
|
||||
namespace ZB.MOM.WW.OtOpcUa.Server.OpcUa;
|
||||
@@ -26,6 +27,7 @@ public sealed class OpcUaApplicationHost : IAsyncDisposable
|
||||
private readonly ILogger<OpcUaApplicationHost> _logger;
|
||||
private ApplicationInstance? _application;
|
||||
private OtOpcUaServer? _server;
|
||||
private HealthEndpointsHost? _healthHost;
|
||||
private bool _disposed;
|
||||
|
||||
public OpcUaApplicationHost(OpcUaServerOptions options, DriverHost driverHost,
|
||||
@@ -68,6 +70,17 @@ public sealed class OpcUaApplicationHost : IAsyncDisposable
|
||||
_logger.LogInformation("OPC UA server started — endpoint={Endpoint} driverCount={Count}",
|
||||
_options.EndpointUrl, _server.DriverNodeManagers.Count);
|
||||
|
||||
// Phase 6.1 Stream C: health endpoints on :4841 (loopback by default — see
|
||||
// HealthEndpointsHost remarks for the Windows URL-ACL tradeoff).
|
||||
if (_options.HealthEndpointsEnabled)
|
||||
{
|
||||
_healthHost = new HealthEndpointsHost(
|
||||
_driverHost,
|
||||
_loggerFactory.CreateLogger<HealthEndpointsHost>(),
|
||||
prefix: _options.HealthEndpointsPrefix);
|
||||
_healthHost.Start();
|
||||
}
|
||||
|
||||
// Drive each driver's discovery through its node manager. The node manager IS the
|
||||
// IAddressSpaceBuilder; GenericDriverNodeManager captures alarm-condition sinks into
|
||||
// its internal map and wires OnAlarmEvent → sink routing.
|
||||
@@ -221,6 +234,12 @@ public sealed class OpcUaApplicationHost : IAsyncDisposable
|
||||
{
|
||||
_logger.LogWarning(ex, "OPC UA server stop threw during dispose");
|
||||
}
|
||||
|
||||
if (_healthHost is not null)
|
||||
{
|
||||
try { await _healthHost.DisposeAsync().ConfigureAwait(false); }
|
||||
catch (Exception ex) { _logger.LogWarning(ex, "Health endpoints host dispose threw"); }
|
||||
}
|
||||
await Task.CompletedTask;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -58,6 +58,20 @@ public sealed class OpcUaServerOptions
|
||||
/// </summary>
|
||||
public bool AutoAcceptUntrustedClientCertificates { get; init; } = true;
|
||||
|
||||
/// <summary>
|
||||
/// Whether to start the Phase 6.1 Stream C <c>/healthz</c> + <c>/readyz</c> HTTP listener.
|
||||
/// Defaults to <c>true</c>; set false in embedded deployments that don't need HTTP
|
||||
/// (e.g. tests that only exercise the OPC UA surface).
|
||||
/// </summary>
|
||||
public bool HealthEndpointsEnabled { get; init; } = true;
|
||||
|
||||
/// <summary>
|
||||
/// URL prefix the health endpoints bind to. Default <c>http://localhost:4841/</c> — loopback
|
||||
/// avoids Windows URL-ACL elevation. Production deployments that need remote probing should
|
||||
/// either reverse-proxy or use <c>http://+:4841/</c> with netsh urlacl granted.
|
||||
/// </summary>
|
||||
public string HealthEndpointsPrefix { get; init; } = "http://localhost:4841/";
|
||||
|
||||
/// <summary>
|
||||
/// Security profile advertised on the endpoint. Default <see cref="OpcUaSecurityProfile.None"/>
|
||||
/// preserves the PR 17 endpoint shape; set to <see cref="OpcUaSecurityProfile.Basic256Sha256SignAndEncrypt"/>
|
||||
|
||||
@@ -57,7 +57,7 @@ public sealed class OtOpcUaServer : StandardServer
|
||||
// per-type tiers into DriverTypeRegistry. Read ResilienceConfig JSON from the
|
||||
// DriverInstance row in a follow-up PR; for now every driver gets Tier A defaults.
|
||||
var options = new DriverResilienceOptions { Tier = DriverTier.A };
|
||||
var invoker = new CapabilityInvoker(_pipelineBuilder, driver.DriverInstanceId, () => options);
|
||||
var invoker = new CapabilityInvoker(_pipelineBuilder, driver.DriverInstanceId, () => options, driver.DriverType);
|
||||
var manager = new DriverNodeManager(server, configuration, driver, invoker, logger);
|
||||
_driverNodeManagers.Add(manager);
|
||||
}
|
||||
|
||||
@@ -4,6 +4,7 @@ using Microsoft.Extensions.DependencyInjection;
|
||||
using Microsoft.Extensions.Hosting;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using Serilog;
|
||||
using Serilog.Formatting.Compact;
|
||||
using ZB.MOM.WW.OtOpcUa.Configuration;
|
||||
using ZB.MOM.WW.OtOpcUa.Configuration.LocalCache;
|
||||
using ZB.MOM.WW.OtOpcUa.Core.Hosting;
|
||||
@@ -13,11 +14,25 @@ using ZB.MOM.WW.OtOpcUa.Server.Security;
|
||||
|
||||
var builder = Host.CreateApplicationBuilder(args);
|
||||
|
||||
Log.Logger = new LoggerConfiguration()
|
||||
// Per Phase 6.1 Stream C.3: SIEMs (Splunk, Datadog) ingest the JSON file without a
|
||||
// regex parser. Plain-text rolling file stays on by default for human readability;
|
||||
// JSON file is opt-in via appsetting `Serilog:WriteJson = true`.
|
||||
var writeJson = builder.Configuration.GetValue<bool>("Serilog:WriteJson");
|
||||
var loggerBuilder = new LoggerConfiguration()
|
||||
.ReadFrom.Configuration(builder.Configuration)
|
||||
.Enrich.FromLogContext()
|
||||
.WriteTo.Console()
|
||||
.WriteTo.File("logs/otopcua-.log", rollingInterval: RollingInterval.Day)
|
||||
.CreateLogger();
|
||||
.WriteTo.File("logs/otopcua-.log", rollingInterval: RollingInterval.Day);
|
||||
|
||||
if (writeJson)
|
||||
{
|
||||
loggerBuilder = loggerBuilder.WriteTo.File(
|
||||
new CompactJsonFormatter(),
|
||||
"logs/otopcua-.json.log",
|
||||
rollingInterval: RollingInterval.Day);
|
||||
}
|
||||
|
||||
Log.Logger = loggerBuilder.CreateLogger();
|
||||
|
||||
builder.Services.AddSerilog();
|
||||
builder.Services.AddWindowsService(o => o.ServiceName = "OtOpcUa");
|
||||
|
||||
@@ -21,6 +21,7 @@
|
||||
<PackageReference Include="Serilog.Settings.Configuration" Version="9.0.0"/>
|
||||
<PackageReference Include="Serilog.Sinks.Console" Version="6.0.0"/>
|
||||
<PackageReference Include="Serilog.Sinks.File" Version="7.0.0"/>
|
||||
<PackageReference Include="Serilog.Formatting.Compact" Version="3.0.0"/>
|
||||
<PackageReference Include="OPCFoundation.NetStandard.Opc.Ua.Server" Version="1.5.374.126"/>
|
||||
<PackageReference Include="OPCFoundation.NetStandard.Opc.Ua.Configuration" Version="1.5.374.126"/>
|
||||
<PackageReference Include="Novell.Directory.Ldap.NETStandard" Version="3.6.0"/>
|
||||
|
||||
Reference in New Issue
Block a user