Add rich HTTP health endpoints for cluster monitoring
Enhance /api/health with component-level health, ServiceLevel, and redundancy state for load balancer probes. Add /health HTML page for operators to monitor node health in clustered System Platform deployments. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -201,6 +201,73 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.Status
|
||||
public List<string> ServerUris { get; set; } = new();
|
||||
}
|
||||
|
||||
/// <summary>
/// Payload for the /api/health endpoint: overall status, ServiceLevel, redundancy state,
/// per-component health, uptime, and the time the snapshot was taken.
/// </summary>
public class HealthEndpointData
{
    /// <summary>
    /// Gets or sets the overall health status (Healthy, Degraded, or Unhealthy).
    /// A freshly constructed instance reports "Unknown".
    /// </summary>
    public string Status { get; set; } = "Unknown";

    /// <summary>
    /// Gets or sets the computed OPC UA ServiceLevel as a byte (0-255).
    /// </summary>
    public byte ServiceLevel { get; set; }

    /// <summary>
    /// Gets or sets a value indicating whether redundancy is enabled.
    /// </summary>
    public bool RedundancyEnabled { get; set; }

    /// <summary>
    /// Gets or sets the redundancy role of this instance (Primary/Secondary); null when redundancy is disabled.
    /// </summary>
    public string? RedundancyRole { get; set; }

    /// <summary>
    /// Gets or sets the redundancy mode (Warm/Hot); null when redundancy is disabled.
    /// </summary>
    public string? RedundancyMode { get; set; }

    /// <summary>
    /// Gets or sets the health of the individual components. Never null on a new instance.
    /// </summary>
    public ComponentHealth Components { get; set; } = new ComponentHealth();

    /// <summary>
    /// Gets or sets the server uptime as display text. Empty on a new instance.
    /// </summary>
    public string Uptime { get; set; } = string.Empty;

    /// <summary>
    /// Gets or sets the UTC time of this snapshot. Defaults to the construction time of the instance.
    /// </summary>
    public DateTime Timestamp { get; set; } = DateTime.UtcNow;
}
|
||||
|
||||
/// <summary>
/// Health of each individual component reported by the health endpoint.
/// Every property defaults to its "not available" text.
/// </summary>
public class ComponentHealth
{
    /// <summary>
    /// Gets or sets the MXAccess runtime connectivity status. Defaults to "Disconnected".
    /// </summary>
    public string MxAccess { get; set; } = "Disconnected";

    /// <summary>
    /// Gets or sets the Galaxy repository database connectivity status. Defaults to "Disconnected".
    /// </summary>
    public string Database { get; set; } = "Disconnected";

    /// <summary>
    /// Gets or sets the OPC UA server status. Defaults to "Stopped".
    /// </summary>
    public string OpcUaServer { get; set; } = "Stopped";
}
|
||||
|
||||
/// <summary>
|
||||
/// Dashboard model for the status page footer.
|
||||
/// </summary>
|
||||
|
||||
@@ -16,6 +16,7 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.Status
|
||||
{
|
||||
private readonly HealthCheckService _healthCheck;
|
||||
private readonly int _refreshIntervalSeconds;
|
||||
private readonly DateTime _startTime = DateTime.UtcNow;
|
||||
|
||||
private IMxAccessClient? _mxAccessClient;
|
||||
private PerformanceMetrics? _metrics;
|
||||
@@ -229,5 +230,132 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.Status
|
||||
var state = _mxAccessClient?.State ?? ConnectionState.Disconnected;
|
||||
return _healthCheck.IsHealthy(state, _metrics);
|
||||
}
|
||||
|
||||
/// <summary>
/// Builds the rich health endpoint data including component health, ServiceLevel, and redundancy state.
/// </summary>
/// <returns>
/// A new <see cref="HealthEndpointData"/> snapshot. Role and mode are only populated when redundancy
/// is configured and enabled; otherwise they stay null.
/// </returns>
public HealthEndpointData GetHealthData()
{
    // Read the clock once so Uptime and Timestamp describe the same instant.
    var now = DateTime.UtcNow;

    // Read the redundancy configuration once so the reported flag and the branch below cannot disagree.
    var redundancy = _redundancyConfig;

    var connectionState = _mxAccessClient?.State ?? ConnectionState.Disconnected;
    var mxConnected = connectionState == ConnectionState.Connected;
    var dbConnected = _galaxyStats?.DbConnected ?? false;
    var health = _healthCheck.CheckHealth(connectionState, _metrics);

    var data = new HealthEndpointData
    {
        Status = health.Status,
        RedundancyEnabled = redundancy?.Enabled ?? false,
        Components = new ComponentHealth
        {
            MxAccess = connectionState.ToString(),
            Database = dbConnected ? "Connected" : "Disconnected",
            OpcUaServer = (_serverHost?.IsRunning ?? false) ? "Running" : "Stopped"
        },
        Uptime = FormatUptime(now - _startTime),
        Timestamp = now
    };

    if (redundancy != null && redundancy.Enabled)
    {
        // A non-primary instance starts 50 below the configured base, never below zero.
        var isPrimary = string.Equals(redundancy.Role, "Primary", StringComparison.OrdinalIgnoreCase);
        var baseLevel = isPrimary
            ? redundancy.ServiceLevelBase
            : Math.Max(0, redundancy.ServiceLevelBase - 50);

        data.ServiceLevel = new ServiceLevelCalculator().Calculate(baseLevel, mxConnected, dbConnected);
        data.RedundancyRole = redundancy.Role;
        data.RedundancyMode = redundancy.Mode;
    }
    else
    {
        // Non-redundant: 255 while MXAccess is connected, otherwise 0.
        // Database connectivity does not influence the value on this path.
        data.ServiceLevel = mxConnected ? (byte)255 : (byte)0;
    }

    return data;
}
|
||||
|
||||
/// <summary>
/// Serializer settings for the /api/health payload. Created once and reused because each new
/// <see cref="JsonSerializerOptions"/> instance carries its own serialization metadata cache.
/// </summary>
private static readonly JsonSerializerOptions HealthJsonOptions = new JsonSerializerOptions { WriteIndented = true };

/// <summary>
/// Generates the JSON payload for the /api/health endpoint.
/// </summary>
/// <returns>The current health snapshot serialized as indented JSON.</returns>
public string GenerateHealthJson()
{
    return JsonSerializer.Serialize(GetHealthData(), HealthJsonOptions);
}
|
||||
|
||||
/// <summary>
/// Generates a focused health status HTML page for operators and monitoring dashboards.
/// </summary>
/// <remarks>
/// Every string taken from the health snapshot is HTML-encoded before it is written into the markup,
/// so values that originate from configuration (redundancy role and mode) cannot inject markup.
/// The page reloads itself using the configured refresh interval.
/// </remarks>
/// <returns>A complete HTML document describing the current health snapshot.</returns>
public string GenerateHealthHtml()
{
    const string okColor = "#00cc66";
    const string warnColor = "#cccc33";
    const string badColor = "#cc3333";

    // Fully qualified so this method does not depend on a using directive for System.Net.
    static string? Encode(string? text) => System.Net.WebUtility.HtmlEncode(text);

    var data = GetHealthData();
    var sb = new StringBuilder();

    var statusColor = data.Status switch
    {
        "Healthy" => okColor,
        "Degraded" => warnColor,
        _ => badColor
    };
    var mxColor = data.Components.MxAccess == "Connected" ? okColor : badColor;
    var dbColor = data.Components.Database == "Connected" ? okColor : badColor;
    var uaColor = data.Components.OpcUaServer == "Running" ? okColor : badColor;

    sb.AppendLine("<!DOCTYPE html><html><head>");
    sb.AppendLine("<meta charset='utf-8'>");
    sb.AppendLine($"<meta http-equiv='refresh' content='{_refreshIntervalSeconds}'>");
    sb.AppendLine("<title>LmxOpcUa Health</title>");
    sb.AppendLine("<style>");
    sb.AppendLine("body { font-family: monospace; background: #1a1a2e; color: #eee; padding: 20px; margin: 0; }");
    sb.AppendLine(".header { text-align: center; padding: 30px 0; }");
    sb.AppendLine(".status-badge { display: inline-block; font-size: 2em; font-weight: bold; padding: 15px 40px; border-radius: 12px; letter-spacing: 2px; }");
    sb.AppendLine(".service-level { text-align: center; font-size: 4em; font-weight: bold; margin: 20px 0; }");
    sb.AppendLine(".service-level .label { font-size: 0.3em; color: #999; display: block; }");
    sb.AppendLine(".components { display: flex; justify-content: center; gap: 20px; flex-wrap: wrap; margin: 30px auto; max-width: 800px; }");
    sb.AppendLine(".component { border: 2px solid #444; border-radius: 8px; padding: 20px; min-width: 200px; text-align: center; }");
    sb.AppendLine(".component .name { font-size: 0.9em; color: #999; margin-bottom: 8px; }");
    sb.AppendLine(".component .value { font-size: 1.3em; font-weight: bold; }");
    sb.AppendLine(".meta { text-align: center; margin-top: 30px; color: #666; font-size: 0.85em; }");
    sb.AppendLine(".redundancy { text-align: center; margin: 10px 0; color: #999; }");
    sb.AppendLine(".redundancy b { color: #66ccff; }");
    sb.AppendLine("</style></head><body>");

    // Status badge
    sb.AppendLine("<div class='header'>");
    sb.AppendLine($"<div class='status-badge' style='background: {statusColor}; color: #000;'>{Encode(data.Status.ToUpperInvariant())}</div>");
    sb.AppendLine("</div>");

    // Service Level
    sb.AppendLine($"<div class='service-level' style='color: {statusColor};'>");
    sb.AppendLine("<span class='label'>SERVICE LEVEL</span>");
    sb.AppendLine($"{data.ServiceLevel}");
    sb.AppendLine("</div>");

    // Redundancy info (role and mode come from configuration, so they are encoded)
    if (data.RedundancyEnabled)
    {
        sb.AppendLine($"<div class='redundancy'>Role: <b>{Encode(data.RedundancyRole)}</b> | Mode: <b>{Encode(data.RedundancyMode)}</b></div>");
    }

    // Component health cards
    sb.AppendLine("<div class='components'>");
    sb.AppendLine($"<div class='component' style='border-color: {mxColor};'><div class='name'>MXAccess</div><div class='value' style='color: {mxColor};'>{Encode(data.Components.MxAccess)}</div></div>");
    sb.AppendLine($"<div class='component' style='border-color: {dbColor};'><div class='name'>Galaxy Database</div><div class='value' style='color: {dbColor};'>{Encode(data.Components.Database)}</div></div>");
    sb.AppendLine($"<div class='component' style='border-color: {uaColor};'><div class='name'>OPC UA Server</div><div class='value' style='color: {uaColor};'>{Encode(data.Components.OpcUaServer)}</div></div>");
    sb.AppendLine("</div>");

    // Footer
    sb.AppendLine($"<div class='meta'>Uptime: {Encode(data.Uptime)} | {data.Timestamp:O}</div>");

    sb.AppendLine("</body></html>");
    return sb.ToString();
}
|
||||
|
||||
/// <summary>
/// Renders an uptime as short display text, choosing the coarsest unit that applies:
/// "Nd Nh Nm" from one day up, "Nh Nm" from one hour up, otherwise "Nm Ns".
/// </summary>
/// <param name="ts">The elapsed time to format.</param>
/// <returns>The formatted uptime text.</returns>
private static string FormatUptime(TimeSpan ts)
{
    // Below one hour the value is necessarily below one day as well.
    if (ts.TotalHours < 1)
    {
        return $"{(int)ts.TotalMinutes}m {ts.Seconds}s";
    }

    return ts.TotalDays < 1
        ? $"{(int)ts.TotalHours}h {ts.Minutes}m"
        : $"{(int)ts.TotalDays}d {ts.Hours}h {ts.Minutes}m";
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -120,14 +120,19 @@ namespace ZB.MOM.WW.LmxOpcUa.Host.Status
|
||||
await WriteResponse(response, _reportService.GenerateHtml(), "text/html", 200);
|
||||
break;
|
||||
|
||||
case "/health":
|
||||
await WriteResponse(response, _reportService.GenerateHealthHtml(), "text/html", 200);
|
||||
break;
|
||||
|
||||
case "/api/status":
|
||||
await WriteResponse(response, _reportService.GenerateJson(), "application/json", 200);
|
||||
break;
|
||||
|
||||
case "/api/health":
|
||||
var isHealthy = _reportService.IsHealthy();
|
||||
var healthJson = isHealthy ? "{\"status\":\"healthy\"}" : "{\"status\":\"unhealthy\"}";
|
||||
await WriteResponse(response, healthJson, "application/json", isHealthy ? 200 : 503);
|
||||
var healthData = _reportService.GetHealthData();
|
||||
var healthJson = _reportService.GenerateHealthJson();
|
||||
var healthStatusCode = healthData.Status == "Unhealthy" ? 503 : 200;
|
||||
await WriteResponse(response, healthJson, "application/json", healthStatusCode);
|
||||
break;
|
||||
|
||||
default:
|
||||
|
||||
@@ -132,6 +132,139 @@ namespace ZB.MOM.WW.LmxOpcUa.Tests.Status
|
||||
sut.IsHealthy().ShouldBe(false);
|
||||
}
|
||||
|
||||
/// <summary>
/// Confirms that the default test service reports a Healthy status with MXAccess and the database connected.
/// </summary>
[Fact]
public void GetHealthData_WhenConnected_ReturnsHealthyStatus()
{
    var snapshot = CreateService().GetHealthData();

    snapshot.Status.ShouldBe("Healthy");

    var components = snapshot.Components;
    components.MxAccess.ShouldBe("Connected");
    components.Database.ShouldBe("Connected");
}
|
||||
|
||||
/// <summary>
/// Confirms that a disconnected MXAccess client and database yield an Unhealthy status,
/// a ServiceLevel of zero, and Disconnected component states.
/// </summary>
[Fact]
public void GetHealthData_WhenDisconnected_ReturnsUnhealthyStatus()
{
    var service = new StatusReportService(new HealthCheckService(), 10);
    service.SetComponents(
        new FakeMxAccessClient { State = ConnectionState.Disconnected },
        null,
        new GalaxyRepositoryStats { DbConnected = false },
        null);

    var snapshot = service.GetHealthData();

    snapshot.Status.ShouldBe("Unhealthy");
    snapshot.ServiceLevel.ShouldBe(byte.MinValue);
    snapshot.Components.MxAccess.ShouldBe("Disconnected");
    snapshot.Components.Database.ShouldBe("Disconnected");
}
|
||||
|
||||
/// <summary>
/// Confirms that without redundancy the default test service reports ServiceLevel 255
/// and leaves the redundancy role and mode unset.
/// </summary>
[Fact]
public void GetHealthData_NoRedundancy_ServiceLevel255WhenHealthy()
{
    var snapshot = CreateService().GetHealthData();

    snapshot.RedundancyEnabled.ShouldBeFalse();
    snapshot.ServiceLevel.ShouldBe(byte.MaxValue);
    snapshot.RedundancyRole.ShouldBeNull();
    snapshot.RedundancyMode.ShouldBeNull();
}
|
||||
|
||||
/// <summary>
/// Confirms that a Primary instance with redundancy enabled reports its role, mode,
/// and the configured base ServiceLevel of 200.
/// </summary>
[Fact]
public void GetHealthData_WithRedundancy_IncludesRoleAndServiceLevel()
{
    var snapshot = CreateServiceWithRedundancy("Primary").GetHealthData();

    snapshot.RedundancyEnabled.ShouldBeTrue();
    snapshot.RedundancyRole.ShouldBe("Primary");
    snapshot.RedundancyMode.ShouldBe("Warm");
    snapshot.ServiceLevel.ShouldBe((byte)200);
}
|
||||
|
||||
/// <summary>
/// Confirms that a Secondary instance reports a ServiceLevel of 150 for a configured base of 200.
/// </summary>
[Fact]
public void GetHealthData_SecondaryRole_LowerServiceLevel()
{
    var snapshot = CreateServiceWithRedundancy("Secondary").GetHealthData();

    snapshot.ServiceLevel.ShouldBe((byte)150);
}
|
||||
|
||||
/// <summary>
/// Confirms that the health snapshot carries a non-blank uptime text.
/// </summary>
[Fact]
public void GetHealthData_ContainsUptime()
{
    var snapshot = CreateService().GetHealthData();

    snapshot.Uptime.ShouldNotBeNullOrWhiteSpace();
}
|
||||
|
||||
/// <summary>
/// Confirms that the health snapshot timestamp is recent (no older than one minute).
/// </summary>
[Fact]
public void GetHealthData_ContainsTimestamp()
{
    var snapshot = CreateService().GetHealthData();

    var oneMinuteAgo = DateTime.UtcNow.AddMinutes(-1);
    snapshot.Timestamp.ShouldBeGreaterThan(oneMinuteAgo);
}
|
||||
|
||||
/// <summary>
/// Confirms that the health JSON mentions every expected top-level and component field name.
/// </summary>
[Fact]
public void GenerateHealthJson_ContainsExpectedFields()
{
    var payload = CreateService().GenerateHealthJson();

    var expectedFields = new[]
    {
        "Status", "ServiceLevel", "Components", "MxAccess", "Database", "OpcUaServer", "Uptime"
    };

    // Same checks in the same order as individual assertions; the first missing name fails the test.
    foreach (var field in expectedFields)
    {
        payload.ShouldContain(field);
    }
}
|
||||
|
||||
/// <summary>
/// Confirms that the health page renders the HEALTHY status badge, the ServiceLevel label,
/// and the ServiceLevel value for the default test service.
/// </summary>
[Fact]
public void GenerateHealthHtml_ContainsStatusBadge()
{
    var sut = CreateService();
    var html = sut.GenerateHealthHtml();

    // Anchor on the badge markup: a bare "HEALTHY" is also a substring of "UNHEALTHY",
    // so the unanchored check could never fail on an unhealthy page.
    html.ShouldContain(">HEALTHY</div>");
    html.ShouldContain("SERVICE LEVEL");
    html.ShouldContain("255");
}
|
||||
|
||||
/// <summary>
/// Confirms that the health page contains a card title for each monitored component.
/// </summary>
[Fact]
public void GenerateHealthHtml_ContainsComponentCards()
{
    var page = CreateService().GenerateHealthHtml();

    foreach (var cardTitle in new[] { "MXAccess", "Galaxy Database", "OPC UA Server" })
    {
        page.ShouldContain(cardTitle);
    }
}
|
||||
|
||||
/// <summary>
/// Confirms that the health page shows the redundancy role and mode when redundancy is enabled.
/// </summary>
[Fact]
public void GenerateHealthHtml_WithRedundancy_ShowsRoleAndMode()
{
    var page = CreateServiceWithRedundancy("Primary").GenerateHealthHtml();

    page.ShouldContain("Primary");
    page.ShouldContain("Warm");
}
|
||||
|
||||
/// <summary>
/// Confirms that the health page carries a meta refresh tag with a 10 second interval.
/// </summary>
[Fact]
public void GenerateHealthHtml_ContainsAutoRefresh()
{
    var page = CreateService().GenerateHealthHtml();

    page.ShouldContain("meta http-equiv='refresh' content='10'");
}
|
||||
|
||||
/// <summary>
|
||||
/// Creates a status report service preloaded with representative runtime, Galaxy, and metrics data.
|
||||
/// </summary>
|
||||
@@ -157,5 +290,21 @@ namespace ZB.MOM.WW.LmxOpcUa.Tests.Status
|
||||
sut.SetComponents(mxClient, metrics, galaxyStats, null);
|
||||
return sut;
|
||||
}
|
||||
|
||||
/// <summary>
/// Creates a status report service with redundancy enabled in Warm mode, a ServiceLevel base of 200,
/// a database-connected Galaxy, and the supplied redundancy role.
/// </summary>
/// <param name="role">The redundancy role to configure, for example "Primary" or "Secondary".</param>
/// <returns>The configured service under test.</returns>
private static StatusReportService CreateServiceWithRedundancy(string role)
{
    var service = new StatusReportService(new HealthCheckService(), 10);

    var redundancy = new Host.Configuration.RedundancyConfiguration
    {
        Enabled = true,
        Mode = "Warm",
        Role = role,
        ServiceLevelBase = 200
    };
    var galaxy = new GalaxyRepositoryStats { GalaxyName = "TestGalaxy", DbConnected = true };
    var client = new FakeMxAccessClient();

    service.SetComponents(client, null, galaxy, null, null, redundancy, "urn:test:instance1");
    return service;
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -107,6 +107,35 @@ namespace ZB.MOM.WW.LmxOpcUa.Tests.Status
|
||||
response.Headers.CacheControl?.NoStore.ShouldBe(true);
|
||||
}
|
||||
|
||||
/// <summary>
/// Verifies that GET /health answers 200 with a text/html body that contains the
/// ServiceLevel label and the MXAccess component card.
/// </summary>
[Fact]
public async Task HealthPage_ReturnsHtml200()
{
    var reply = await _client.GetAsync("/health");

    reply.StatusCode.ShouldBe(HttpStatusCode.OK);
    reply.Content.Headers.ContentType?.MediaType.ShouldBe("text/html");

    var page = await reply.Content.ReadAsStringAsync();
    page.ShouldContain("SERVICE LEVEL");
    page.ShouldContain("MXAccess");
}
|
||||
|
||||
/// <summary>
/// Verifies that GET /api/health answers 200 with an application/json body that contains
/// the ServiceLevel, Components, and Uptime fields.
/// </summary>
[Fact]
public async Task ApiHealth_ReturnsRichJson()
{
    var reply = await _client.GetAsync("/api/health");

    reply.StatusCode.ShouldBe(HttpStatusCode.OK);
    reply.Content.Headers.ContentType?.MediaType.ShouldBe("application/json");

    var payload = await reply.Content.ReadAsStringAsync();
    payload.ShouldContain("ServiceLevel");
    payload.ShouldContain("Components");
    payload.ShouldContain("Uptime");
}
|
||||
|
||||
/// <summary>
|
||||
/// Confirms that the server can be started and stopped cleanly.
|
||||
/// </summary>
|
||||
|
||||
Reference in New Issue
Block a user