Add rich HTTP health endpoints for cluster monitoring

Enhance /api/health with component-level health, ServiceLevel, and
redundancy state for load balancer probes. Add /health HTML page for
operators to monitor node health in clustered System Platform deployments.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Joseph Doherty
2026-03-28 16:44:31 -04:00
parent f0a076ec26
commit 9d3599fbb6
5 changed files with 381 additions and 3 deletions

View File

@@ -201,6 +201,73 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.Status
public List<string> ServerUris { get; set; } = new();
}
/// <summary>
/// Payload for the /api/health endpoint: overall status, ServiceLevel, redundancy settings,
/// per-component health, uptime text, and the time the snapshot was taken.
/// </summary>
public class HealthEndpointData
{
    /// <summary>
    /// Gets or sets the overall health status (Healthy, Degraded, or Unhealthy).
    /// Holds "Unknown" until a value is assigned.
    /// </summary>
    public string Status { get; set; } = "Unknown";

    /// <summary>
    /// Gets or sets the computed OPC UA ServiceLevel as a byte (0-255).
    /// </summary>
    public byte ServiceLevel { get; set; }

    /// <summary>
    /// Gets or sets a value indicating whether redundancy is enabled.
    /// </summary>
    public bool RedundancyEnabled { get; set; }

    /// <summary>
    /// Gets or sets the redundancy role of this instance (Primary/Secondary); null when redundancy is disabled.
    /// </summary>
    public string? RedundancyRole { get; set; }

    /// <summary>
    /// Gets or sets the redundancy mode (Warm/Hot); null when redundancy is disabled.
    /// </summary>
    public string? RedundancyMode { get; set; }

    /// <summary>
    /// Gets or sets the health of the individual components.
    /// </summary>
    public ComponentHealth Components { get; set; } = new ComponentHealth();

    /// <summary>
    /// Gets or sets the uptime as display text. Empty until assigned.
    /// </summary>
    public string Uptime { get; set; } = string.Empty;

    /// <summary>
    /// Gets or sets the UTC time of this snapshot. Defaults to the UTC time at construction.
    /// </summary>
    public DateTime Timestamp { get; set; } = DateTime.UtcNow;
}
/// <summary>
/// Health of the individual components reported by the health endpoint.
/// Each property is a display string and starts in its "down" state.
/// </summary>
public class ComponentHealth
{
    /// <summary>
    /// Gets or sets the MXAccess runtime connectivity status. Defaults to "Disconnected".
    /// </summary>
    public string MxAccess { get; set; } = "Disconnected";

    /// <summary>
    /// Gets or sets the Galaxy repository database connectivity status. Defaults to "Disconnected".
    /// </summary>
    public string Database { get; set; } = "Disconnected";

    /// <summary>
    /// Gets or sets the OPC UA server status. Defaults to "Stopped".
    /// </summary>
    public string OpcUaServer { get; set; } = "Stopped";
}
/// <summary>
/// Dashboard model for the status page footer.
/// </summary>

View File

@@ -16,6 +16,7 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.Status
{
private readonly HealthCheckService _healthCheck;
private readonly int _refreshIntervalSeconds;
private readonly DateTime _startTime = DateTime.UtcNow;
private IMxAccessClient? _mxAccessClient;
private PerformanceMetrics? _metrics;
@@ -229,5 +230,132 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.Status
var state = _mxAccessClient?.State ?? ConnectionState.Disconnected;
return _healthCheck.IsHealthy(state, _metrics);
}
/// <summary>
/// Builds the rich health endpoint data including component health, ServiceLevel, and redundancy state.
/// </summary>
/// <returns>
/// A new <see cref="HealthEndpointData"/> snapshot. RedundancyRole and RedundancyMode are only
/// populated when a redundancy configuration is present and enabled.
/// </returns>
public HealthEndpointData GetHealthData()
{
    // Read the clock and the redundancy configuration exactly once so that every value in this
    // snapshot (Uptime vs. Timestamp, RedundancyEnabled vs. Role/Mode) is derived from the same reading.
    var now = DateTime.UtcNow;
    var redundancy = _redundancyConfig;

    var connectionState = _mxAccessClient?.State ?? ConnectionState.Disconnected;
    var mxConnected = connectionState == ConnectionState.Connected;
    var dbConnected = _galaxyStats?.DbConnected ?? false;
    var health = _healthCheck.CheckHealth(connectionState, _metrics);

    var data = new HealthEndpointData
    {
        Status = health.Status,
        RedundancyEnabled = redundancy?.Enabled ?? false,
        Components = new ComponentHealth
        {
            MxAccess = connectionState.ToString(),
            Database = dbConnected ? "Connected" : "Disconnected",
            OpcUaServer = (_serverHost?.IsRunning ?? false) ? "Running" : "Stopped"
        },
        Uptime = FormatUptime(now - _startTime),
        Timestamp = now
    };

    if (redundancy != null && redundancy.Enabled)
    {
        // A non-primary node starts 50 below the configured base (never below 0).
        var isPrimary = string.Equals(redundancy.Role, "Primary", StringComparison.OrdinalIgnoreCase);
        var baseLevel = isPrimary
            ? redundancy.ServiceLevelBase
            : Math.Max(0, redundancy.ServiceLevelBase - 50);

        var calculator = new ServiceLevelCalculator();
        data.ServiceLevel = calculator.Calculate(baseLevel, mxConnected, dbConnected);
        data.RedundancyRole = redundancy.Role;
        data.RedundancyMode = redundancy.Mode;
    }
    else
    {
        // Non-redundant: ServiceLevel follows MXAccess connectivity only -
        // 255 when connected, 0 otherwise. Database state does not influence it here.
        data.ServiceLevel = mxConnected ? (byte)255 : (byte)0;
    }

    return data;
}
// Serializer options for the /api/health payload. Kept in a single static instance because
// System.Text.Json caches its type metadata per options instance; allocating new options on
// every request discards that cache each time.
private static readonly JsonSerializerOptions HealthJsonOptions = new JsonSerializerOptions { WriteIndented = true };

/// <summary>
/// Generates the JSON payload for the /api/health endpoint.
/// </summary>
/// <returns>The indented JSON serialization of a fresh <see cref="GetHealthData"/> snapshot.</returns>
public string GenerateHealthJson()
{
    return JsonSerializer.Serialize(GetHealthData(), HealthJsonOptions);
}
/// <summary>
/// Generates a focused health status HTML page for operators and monitoring dashboards.
/// </summary>
/// <returns>
/// A complete HTML document showing the status badge, ServiceLevel, optional redundancy role/mode,
/// one card per component, and an uptime/timestamp footer. The page carries a meta refresh using
/// the configured refresh interval.
/// </returns>
public string GenerateHealthHtml()
{
    const string okColor = "#00cc66";
    const string warnColor = "#cccc33";
    const string badColor = "#cc3333";

    // Text taken from the health snapshot (some of it originates in configuration, e.g. role and
    // mode) is HTML-encoded before it is placed in markup so that characters such as '<' or '&'
    // cannot break or inject into the page. Ordinary values are emitted unchanged.
    static string Encode(string? text) => System.Net.WebUtility.HtmlEncode(text) ?? string.Empty;

    var data = GetHealthData();
    var sb = new StringBuilder();

    // Emits one component card; the border and the value text share the same colour.
    void AppendCard(string title, string value, string color) =>
        sb.AppendLine($"<div class='component' style='border-color: {color};'><div class='name'>{title}</div><div class='value' style='color: {color};'>{Encode(value)}</div></div>");

    var statusColor = data.Status == "Healthy" ? okColor : data.Status == "Degraded" ? warnColor : badColor;
    var mxColor = data.Components.MxAccess == "Connected" ? okColor : badColor;
    var dbColor = data.Components.Database == "Connected" ? okColor : badColor;
    var uaColor = data.Components.OpcUaServer == "Running" ? okColor : badColor;

    sb.AppendLine("<!DOCTYPE html><html><head>");
    sb.AppendLine("<meta charset='utf-8'>");
    sb.AppendLine($"<meta http-equiv='refresh' content='{_refreshIntervalSeconds}'>");
    sb.AppendLine("<title>LmxOpcUa Health</title>");
    sb.AppendLine("<style>");
    sb.AppendLine("body { font-family: monospace; background: #1a1a2e; color: #eee; padding: 20px; margin: 0; }");
    sb.AppendLine(".header { text-align: center; padding: 30px 0; }");
    sb.AppendLine(".status-badge { display: inline-block; font-size: 2em; font-weight: bold; padding: 15px 40px; border-radius: 12px; letter-spacing: 2px; }");
    sb.AppendLine(".service-level { text-align: center; font-size: 4em; font-weight: bold; margin: 20px 0; }");
    sb.AppendLine(".service-level .label { font-size: 0.3em; color: #999; display: block; }");
    sb.AppendLine(".components { display: flex; justify-content: center; gap: 20px; flex-wrap: wrap; margin: 30px auto; max-width: 800px; }");
    sb.AppendLine(".component { border: 2px solid #444; border-radius: 8px; padding: 20px; min-width: 200px; text-align: center; }");
    sb.AppendLine(".component .name { font-size: 0.9em; color: #999; margin-bottom: 8px; }");
    sb.AppendLine(".component .value { font-size: 1.3em; font-weight: bold; }");
    sb.AppendLine(".meta { text-align: center; margin-top: 30px; color: #666; font-size: 0.85em; }");
    sb.AppendLine(".redundancy { text-align: center; margin: 10px 0; color: #999; }");
    sb.AppendLine(".redundancy b { color: #66ccff; }");
    sb.AppendLine("</style></head><body>");

    // Status badge
    sb.AppendLine("<div class='header'>");
    sb.AppendLine($"<div class='status-badge' style='background: {statusColor}; color: #000;'>{Encode(data.Status.ToUpperInvariant())}</div>");
    sb.AppendLine("</div>");

    // Service Level
    sb.AppendLine($"<div class='service-level' style='color: {statusColor};'>");
    sb.AppendLine("<span class='label'>SERVICE LEVEL</span>");
    sb.AppendLine($"{data.ServiceLevel}");
    sb.AppendLine("</div>");

    // Redundancy info
    if (data.RedundancyEnabled)
    {
        sb.AppendLine($"<div class='redundancy'>Role: <b>{Encode(data.RedundancyRole)}</b> | Mode: <b>{Encode(data.RedundancyMode)}</b></div>");
    }

    // Component health cards
    sb.AppendLine("<div class='components'>");
    AppendCard("MXAccess", data.Components.MxAccess, mxColor);
    AppendCard("Galaxy Database", data.Components.Database, dbColor);
    AppendCard("OPC UA Server", data.Components.OpcUaServer, uaColor);
    sb.AppendLine("</div>");

    // Footer
    sb.AppendLine($"<div class='meta'>Uptime: {Encode(data.Uptime)} | {data.Timestamp:O}</div>");
    sb.AppendLine("</body></html>");

    return sb.ToString();
}
/// <summary>
/// Formats an uptime duration as compact text: "Nd Nh Nm" from one day up,
/// "Nh Nm" from one hour up, otherwise "Nm Ns".
/// </summary>
/// <param name="ts">The elapsed time to format. Negative values are treated as zero.</param>
/// <returns>The formatted uptime text.</returns>
private static string FormatUptime(TimeSpan ts)
{
    // Uptime is computed from wall-clock readings, which can step backwards when the system
    // clock is adjusted. Clamp so the page never shows text such as "0m -5s".
    if (ts < TimeSpan.Zero)
        ts = TimeSpan.Zero;

    if (ts.TotalDays >= 1)
        return $"{(int)ts.TotalDays}d {ts.Hours}h {ts.Minutes}m";
    if (ts.TotalHours >= 1)
        return $"{(int)ts.TotalHours}h {ts.Minutes}m";
    return $"{(int)ts.TotalMinutes}m {ts.Seconds}s";
}
}
}

View File

@@ -120,14 +120,19 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.Status
await WriteResponse(response, _reportService.GenerateHtml(), "text/html", 200);
break;
case "/health":
await WriteResponse(response, _reportService.GenerateHealthHtml(), "text/html", 200);
break;
case "/api/status":
await WriteResponse(response, _reportService.GenerateJson(), "application/json", 200);
break;
case "/api/health":
var isHealthy = _reportService.IsHealthy();
var healthJson = isHealthy ? "{\"status\":\"healthy\"}" : "{\"status\":\"unhealthy\"}";
await WriteResponse(response, healthJson, "application/json", isHealthy ? 200 : 503);
var healthData = _reportService.GetHealthData();
var healthJson = _reportService.GenerateHealthJson();
var healthStatusCode = healthData.Status == "Unhealthy" ? 503 : 200;
await WriteResponse(response, healthJson, "application/json", healthStatusCode);
break;
default:

View File

@@ -132,6 +132,139 @@ namespace ZB.MOM.WW.LmxOpcUa.Tests.Status
sut.IsHealthy().ShouldBe(false);
}
/// <summary>
/// Confirms that the service from CreateService() reports a Healthy status with MXAccess and the database connected.
/// </summary>
[Fact]
public void GetHealthData_WhenConnected_ReturnsHealthyStatus()
{
    var health = CreateService().GetHealthData();

    health.Status.ShouldBe("Healthy");
    var components = health.Components;
    components.MxAccess.ShouldBe("Connected");
    components.Database.ShouldBe("Connected");
}
/// <summary>
/// Confirms that a disconnected MXAccess client and database yield an Unhealthy status and a ServiceLevel of 0.
/// </summary>
[Fact]
public void GetHealthData_WhenDisconnected_ReturnsUnhealthyStatus()
{
    // Arrange
    var service = new StatusReportService(new HealthCheckService(), 10);
    service.SetComponents(
        new FakeMxAccessClient { State = ConnectionState.Disconnected },
        null,
        new GalaxyRepositoryStats { DbConnected = false },
        null);

    // Act
    var health = service.GetHealthData();

    // Assert
    health.Status.ShouldBe("Unhealthy");
    health.ServiceLevel.ShouldBe((byte)0);
    health.Components.MxAccess.ShouldBe("Disconnected");
    health.Components.Database.ShouldBe("Disconnected");
}
/// <summary>
/// Confirms that without redundancy the ServiceLevel is 255 and no role or mode is reported.
/// </summary>
[Fact]
public void GetHealthData_NoRedundancy_ServiceLevel255WhenHealthy()
{
    var health = CreateService().GetHealthData();

    health.RedundancyEnabled.ShouldBe(false);
    health.ServiceLevel.ShouldBe((byte)255);
    health.RedundancyRole.ShouldBeNull();
    health.RedundancyMode.ShouldBeNull();
}
/// <summary>
/// Confirms that a redundancy-enabled Primary reports its role, its mode, and a ServiceLevel of 200.
/// </summary>
[Fact]
public void GetHealthData_WithRedundancy_IncludesRoleAndServiceLevel()
{
    var health = CreateServiceWithRedundancy("Primary").GetHealthData();

    health.RedundancyEnabled.ShouldBe(true);
    health.RedundancyRole.ShouldBe("Primary");
    health.RedundancyMode.ShouldBe("Warm");
    health.ServiceLevel.ShouldBe((byte)200);
}
/// <summary>
/// Confirms that a Secondary node reports a ServiceLevel of 150 for the same configuration base of 200.
/// </summary>
[Fact]
public void GetHealthData_SecondaryRole_LowerServiceLevel()
{
    var health = CreateServiceWithRedundancy("Secondary").GetHealthData();

    health.ServiceLevel.ShouldBe((byte)150);
}
/// <summary>
/// Confirms that the health data carries non-blank uptime text.
/// </summary>
[Fact]
public void GetHealthData_ContainsUptime()
{
    var health = CreateService().GetHealthData();

    health.Uptime.ShouldNotBeNullOrWhiteSpace();
}
/// <summary>
/// Confirms that the snapshot timestamp is no older than one minute.
/// </summary>
[Fact]
public void GetHealthData_ContainsTimestamp()
{
    var health = CreateService().GetHealthData();

    var oneMinuteAgo = DateTime.UtcNow.AddMinutes(-1);
    health.Timestamp.ShouldBeGreaterThan(oneMinuteAgo);
}
/// <summary>
/// Confirms that the health JSON mentions every expected top-level and component field name.
/// </summary>
[Fact]
public void GenerateHealthJson_ContainsExpectedFields()
{
    var json = CreateService().GenerateHealthJson();

    var expectedFields = new[]
    {
        "Status", "ServiceLevel", "Components", "MxAccess", "Database", "OpcUaServer", "Uptime"
    };
    foreach (var field in expectedFields)
    {
        json.ShouldContain(field);
    }
}
/// <summary>
/// Confirms that the health page shows a HEALTHY status badge, the service-level label, and the value 255.
/// </summary>
[Fact]
public void GenerateHealthHtml_ContainsStatusBadge()
{
    var sut = CreateService();

    var html = sut.GenerateHealthHtml();

    // Anchor on the badge's surrounding markup: a bare "HEALTHY" is also a substring of
    // "UNHEALTHY", so it would pass even when the badge reports an unhealthy service.
    html.ShouldContain(">HEALTHY</div>");
    html.ShouldContain("SERVICE LEVEL");
    html.ShouldContain("255");
}
/// <summary>
/// Confirms that the health page contains the titles of all three component cards.
/// </summary>
[Fact]
public void GenerateHealthHtml_ContainsComponentCards()
{
    var page = CreateService().GenerateHealthHtml();

    page.ShouldContain("MXAccess");
    page.ShouldContain("Galaxy Database");
    page.ShouldContain("OPC UA Server");
}
/// <summary>
/// Confirms that the health page shows the redundancy role and mode when redundancy is enabled.
/// </summary>
[Fact]
public void GenerateHealthHtml_WithRedundancy_ShowsRoleAndMode()
{
    var page = CreateServiceWithRedundancy("Primary").GenerateHealthHtml();

    page.ShouldContain("Primary");
    page.ShouldContain("Warm");
}
/// <summary>
/// Confirms that the health page carries a meta refresh tag with a 10-second interval.
/// </summary>
[Fact]
public void GenerateHealthHtml_ContainsAutoRefresh()
{
    var page = CreateService().GenerateHealthHtml();

    page.ShouldContain("meta http-equiv='refresh' content='10'");
}
/// <summary>
/// Creates a status report service preloaded with representative runtime, Galaxy, and metrics data.
/// </summary>
@@ -157,5 +290,21 @@ namespace ZB.MOM.WW.LmxOpcUa.Tests.Status
sut.SetComponents(mxClient, metrics, galaxyStats, null);
return sut;
}
/// <summary>
/// Creates a status report service with redundancy enabled in Warm mode, a ServiceLevelBase of 200,
/// and the supplied role.
/// </summary>
/// <param name="role">The redundancy role assigned to the configuration.</param>
private static StatusReportService CreateServiceWithRedundancy(string role)
{
    var service = new StatusReportService(new HealthCheckService(), 10);

    service.SetComponents(
        new FakeMxAccessClient(),
        null,
        new GalaxyRepositoryStats { GalaxyName = "TestGalaxy", DbConnected = true },
        null,
        null,
        new Host.Configuration.RedundancyConfiguration
        {
            Enabled = true,
            Mode = "Warm",
            Role = role,
            ServiceLevelBase = 200
        },
        "urn:test:instance1");

    return service;
}
}
}

View File

@@ -107,6 +107,35 @@ namespace ZB.MOM.WW.LmxOpcUa.Tests.Status
response.Headers.CacheControl?.NoStore.ShouldBe(true);
}
/// <summary>
/// Confirms that the /health route returns an HTML health page.
/// </summary>
[Fact]
public async Task HealthPage_ReturnsHtml200()
{
    var response = await _client.GetAsync("/health");

    response.StatusCode.ShouldBe(HttpStatusCode.OK);

    // Assert the header exists first: with "ContentType?.MediaType.ShouldBe(...)" a missing
    // Content-Type short-circuits the whole chain and the check is silently skipped.
    var contentType = response.Content.Headers.ContentType;
    contentType.ShouldNotBeNull();
    contentType!.MediaType.ShouldBe("text/html");

    var body = await response.Content.ReadAsStringAsync();
    body.ShouldContain("SERVICE LEVEL");
    body.ShouldContain("MXAccess");
}
/// <summary>
/// Confirms that /api/health returns rich JSON with component health details.
/// </summary>
[Fact]
public async Task ApiHealth_ReturnsRichJson()
{
    var response = await _client.GetAsync("/api/health");

    response.StatusCode.ShouldBe(HttpStatusCode.OK);

    // Assert the header exists first: with "ContentType?.MediaType.ShouldBe(...)" a missing
    // Content-Type short-circuits the whole chain and the check is silently skipped.
    var contentType = response.Content.Headers.ContentType;
    contentType.ShouldNotBeNull();
    contentType!.MediaType.ShouldBe("application/json");

    var body = await response.Content.ReadAsStringAsync();
    body.ShouldContain("ServiceLevel");
    body.ShouldContain("Components");
    body.ShouldContain("Uptime");
}
/// <summary>
/// Confirms that the server can be started and stopped cleanly.
/// </summary>