Files
lmxopcua/tests/ZB.MOM.WW.OtOpcUa.Admin.Tests/FleetStatusPollerTests.cs
Joseph Doherty 1f3343e61f OpenTelemetry redundancy metrics + RoleChanged SignalR push. Closes instrumentation + live-push slices of task #198; the exporter wiring (OTLP vs Prometheus package decision) is split to new task #201 because the collector/scrape-endpoint choice is a fleet-ops decision that deserves its own PR rather than hardcoded here. New RedundancyMetrics class (Singleton-registered in DI) owning a System.Diagnostics.Metrics.Meter("ZB.MOM.WW.OtOpcUa.Redundancy", "1.0.0"). Three ObservableGauge instruments — otopcua.redundancy.primary_count / secondary_count / stale_count — all tagged by cluster.id, populated by SetClusterCounts(clusterId, primary, secondary, stale) which the poller calls at the tail of every tick; ObservableGauge callbacks snapshot the last value set under a lock so the reader (OTel collector, dotnet-counters) sees consistent tuples. One Counter — otopcua.redundancy.role_transition — tagged cluster.id, node.id, from_role, to_role; ideal for tracking "how often does Cluster-X failover" + "which node transitions most" aggregate queries. In-box Metrics API means zero NuGet dep here — the exporter PR adds OpenTelemetry.Extensions.Hosting + OpenTelemetry.Exporter.OpenTelemetryProtocol or OpenTelemetry.Exporter.Prometheus.AspNetCore to actually ship the data somewhere. FleetStatusPoller extended with role-change detection. Its PollOnceAsync now pulls ClusterNode rows alongside the existing ClusterNodeGenerationState scan, and a new PollRolesAsync walks every node comparing RedundancyRole to the _lastRole cache. On change: records the transition to RedundancyMetrics + emits a RoleChanged SignalR message to both FleetStatusHub.GroupName(cluster) + FleetStatusHub.FleetGroup so cluster-scoped + fleet-wide subscribers both see it. First observation per node is a bootstrap (cache fill) + NOT a transition — avoids spurious churn on service startup or pod restart. UpdateClusterGauges groups nodes by cluster + sets the three gauge values, using ClusterNodeService.StaleThreshold (shared 30s convention) for staleness so the /hosts page + the gauge agree. RoleChangedMessage record lives alongside NodeStateChangedMessage in FleetStatusPoller.cs. RedundancyTab.razor subscribes to the fleet-status hub on first parameters-set, filters RoleChanged events to the current cluster, reloads the node list + paints a blue info banner ("Role changed on node-a: Primary → Secondary at HH:mm:ss UTC") so operators see the transition without needing to poll-refresh the page. IAsyncDisposable closes the connection on tab swap-away. Two new RedundancyMetricsTests covering RecordRoleTransition tag emission (cluster.id + node.id + from_role + to_role all flow through the MeterListener callback) + ObservableGauge snapshot for two clusters (assert primary_count=1 for c1, stale_count=1 for c2). Existing FleetStatusPollerTests ctor-line updated to pass a RedundancyMetrics instance; all tests still pass. Full Admin.Tests suite 87/87 passing (was 85, +2). Admin project builds 0 errors. Task #201 captures the exporter-wiring follow-up — OpenTelemetry.Extensions.Hosting + OTLP vs Prometheus + /metrics endpoint decision, driven by fleet-ops infra direction.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-19 23:16:09 -04:00

157 lines
6.6 KiB
C#

using Microsoft.AspNetCore.SignalR;
using Microsoft.EntityFrameworkCore;
using Microsoft.Extensions.DependencyInjection;
using Microsoft.Extensions.Logging.Abstractions;
using Shouldly;
using Xunit;
using ZB.MOM.WW.OtOpcUa.Admin.Hubs;
using ZB.MOM.WW.OtOpcUa.Admin.Services;
using ZB.MOM.WW.OtOpcUa.Configuration;
using ZB.MOM.WW.OtOpcUa.Configuration.Entities;
using ZB.MOM.WW.OtOpcUa.Configuration.Enums;
namespace ZB.MOM.WW.OtOpcUa.Admin.Tests;
[Trait("Category", "Integration")]
public sealed class FleetStatusPollerTests : IDisposable
{
private const string DefaultServer = "localhost,14330";
private const string DefaultSaPassword = "OtOpcUaDev_2026!";
private readonly string _databaseName = $"OtOpcUaPollerTest_{Guid.NewGuid():N}";
private readonly string _connectionString;
private readonly ServiceProvider _sp;
public FleetStatusPollerTests()
{
var server = Environment.GetEnvironmentVariable("OTOPCUA_CONFIG_TEST_SERVER") ?? DefaultServer;
var password = Environment.GetEnvironmentVariable("OTOPCUA_CONFIG_TEST_SA_PASSWORD") ?? DefaultSaPassword;
_connectionString =
$"Server={server};Database={_databaseName};User Id=sa;Password={password};TrustServerCertificate=True;Encrypt=False;";
var services = new ServiceCollection();
services.AddLogging();
services.AddSignalR();
services.AddDbContext<OtOpcUaConfigDbContext>(o => o.UseSqlServer(_connectionString));
_sp = services.BuildServiceProvider();
using var scope = _sp.CreateScope();
scope.ServiceProvider.GetRequiredService<OtOpcUaConfigDbContext>().Database.Migrate();
}
public void Dispose()
{
_sp.Dispose();
using var conn = new Microsoft.Data.SqlClient.SqlConnection(
new Microsoft.Data.SqlClient.SqlConnectionStringBuilder(_connectionString)
{ InitialCatalog = "master" }.ConnectionString);
conn.Open();
using var cmd = conn.CreateCommand();
cmd.CommandText = $@"
IF DB_ID(N'{_databaseName}') IS NOT NULL
BEGIN
ALTER DATABASE [{_databaseName}] SET SINGLE_USER WITH ROLLBACK IMMEDIATE;
DROP DATABASE [{_databaseName}];
END";
cmd.ExecuteNonQuery();
}
[Fact]
public async Task Poller_detects_new_apply_state_and_pushes_to_fleet_hub()
{
// Seed a cluster + node + credential + generation + apply state.
using (var scope = _sp.CreateScope())
{
var db = scope.ServiceProvider.GetRequiredService<OtOpcUaConfigDbContext>();
db.ServerClusters.Add(new ServerCluster
{
ClusterId = "p-1", Name = "Poll test", Enterprise = "zb", Site = "dev",
NodeCount = 1, RedundancyMode = RedundancyMode.None, Enabled = true, CreatedBy = "t",
});
db.ClusterNodes.Add(new ClusterNode
{
NodeId = "p-1-a", ClusterId = "p-1", RedundancyRole = RedundancyRole.Primary,
Host = "localhost", OpcUaPort = 4840, DashboardPort = 5001,
ApplicationUri = "urn:p1:test", ServiceLevelBase = 200, Enabled = true, CreatedBy = "t",
});
var gen = new ConfigGeneration
{
ClusterId = "p-1", Status = GenerationStatus.Published, CreatedBy = "t",
PublishedBy = "t", PublishedAt = DateTime.UtcNow,
};
db.ConfigGenerations.Add(gen);
await db.SaveChangesAsync();
db.ClusterNodeGenerationStates.Add(new ClusterNodeGenerationState
{
NodeId = "p-1-a", CurrentGenerationId = gen.GenerationId,
LastAppliedStatus = NodeApplyStatus.Applied,
LastAppliedAt = DateTime.UtcNow, LastSeenAt = DateTime.UtcNow,
});
await db.SaveChangesAsync();
}
// Recording hub contexts — capture what would be pushed to clients.
var recorder = new RecordingHubClients();
var fleetHub = new RecordingHubContext<FleetStatusHub>(recorder);
var alertHub = new RecordingHubContext<AlertHub>(new RecordingHubClients());
var poller = new FleetStatusPoller(
_sp.GetRequiredService<IServiceScopeFactory>(),
fleetHub, alertHub, NullLogger<FleetStatusPoller>.Instance, new RedundancyMetrics());
await poller.PollOnceAsync(CancellationToken.None);
var match = recorder.SentMessages.FirstOrDefault(m =>
m.Method == "NodeStateChanged" &&
m.Args.Length > 0 &&
m.Args[0] is NodeStateChangedMessage msg &&
msg.NodeId == "p-1-a");
match.ShouldNotBeNull("poller should have pushed a NodeStateChanged for p-1-a");
}
[Fact]
public async Task Poller_raises_alert_on_transition_into_Failed()
{
using (var scope = _sp.CreateScope())
{
var db = scope.ServiceProvider.GetRequiredService<OtOpcUaConfigDbContext>();
db.ServerClusters.Add(new ServerCluster
{
ClusterId = "p-2", Name = "Fail test", Enterprise = "zb", Site = "dev",
NodeCount = 1, RedundancyMode = RedundancyMode.None, Enabled = true, CreatedBy = "t",
});
db.ClusterNodes.Add(new ClusterNode
{
NodeId = "p-2-a", ClusterId = "p-2", RedundancyRole = RedundancyRole.Primary,
Host = "localhost", OpcUaPort = 4840, DashboardPort = 5001,
ApplicationUri = "urn:p2:test", ServiceLevelBase = 200, Enabled = true, CreatedBy = "t",
});
db.ClusterNodeGenerationStates.Add(new ClusterNodeGenerationState
{
NodeId = "p-2-a",
LastAppliedStatus = NodeApplyStatus.Failed,
LastAppliedError = "simulated",
LastAppliedAt = DateTime.UtcNow, LastSeenAt = DateTime.UtcNow,
});
await db.SaveChangesAsync();
}
var alerts = new RecordingHubClients();
var alertHub = new RecordingHubContext<AlertHub>(alerts);
var fleetHub = new RecordingHubContext<FleetStatusHub>(new RecordingHubClients());
var poller = new FleetStatusPoller(
_sp.GetRequiredService<IServiceScopeFactory>(),
fleetHub, alertHub, NullLogger<FleetStatusPoller>.Instance, new RedundancyMetrics());
await poller.PollOnceAsync(CancellationToken.None);
var alertMatch = alerts.SentMessages.FirstOrDefault(m =>
m.Method == "AlertRaised" &&
m.Args.Length > 0 &&
m.Args[0] is AlertMessage alert && alert.NodeId == "p-2-a" && alert.Severity == "error");
alertMatch.ShouldNotBeNull("poller should have raised AlertRaised for p-2-a");
}
}