OpenTelemetry redundancy metrics + RoleChanged SignalR push. Closes instrumentation + live-push slices of task #198; the exporter wiring (OTLP vs Prometheus package decision) is split to new task #201 because the collector/scrape-endpoint choice is a fleet-ops decision that deserves its own PR rather than hardcoded here. New RedundancyMetrics class (Singleton-registered in DI) owning a System.Diagnostics.Metrics.Meter("ZB.MOM.WW.OtOpcUa.Redundancy", "1.0.0"). Three ObservableGauge instruments — otopcua.redundancy.primary_count / secondary_count / stale_count — all tagged by cluster.id, populated by SetClusterCounts(clusterId, primary, secondary, stale) which the poller calls at the tail of every tick; ObservableGauge callbacks snapshot the last value set under a lock so the reader (OTel collector, dotnet-counters) sees consistent tuples. One Counter — otopcua.redundancy.role_transition — tagged cluster.id, node.id, from_role, to_role; ideal for tracking "how often does Cluster-X failover" + "which node transitions most" aggregate queries. In-box Metrics API means zero NuGet dep here — the exporter PR adds OpenTelemetry.Extensions.Hosting + OpenTelemetry.Exporter.OpenTelemetryProtocol or OpenTelemetry.Exporter.Prometheus.AspNetCore to actually ship the data somewhere. FleetStatusPoller extended with role-change detection. Its PollOnceAsync now pulls ClusterNode rows alongside the existing ClusterNodeGenerationState scan, and a new PollRolesAsync walks every node comparing RedundancyRole to the _lastRole cache. On change: records the transition to RedundancyMetrics + emits a RoleChanged SignalR message to both FleetStatusHub.GroupName(cluster) + FleetStatusHub.FleetGroup so cluster-scoped + fleet-wide subscribers both see it. First observation per node is a bootstrap (cache fill) + NOT a transition — avoids spurious churn on service startup or pod restart. 
UpdateClusterGauges groups nodes by cluster + sets the three gauge values, using ClusterNodeService.StaleThreshold (shared 30s convention) for staleness so the /hosts page + the gauge agree. RoleChangedMessage record lives alongside NodeStateChangedMessage in FleetStatusPoller.cs. RedundancyTab.razor subscribes to the fleet-status hub on first parameters-set, filters RoleChanged events to the current cluster, reloads the node list + paints a blue info banner ("Role changed on node-a: Primary → Secondary at HH:mm:ss UTC") so operators see the transition without needing to poll-refresh the page. IAsyncDisposable closes the connection on tab swap-away. Two new RedundancyMetricsTests covering RecordRoleTransition tag emission (cluster.id + node.id + from_role + to_role all flow through the MeterListener callback) + ObservableGauge snapshot for two clusters (assert primary_count=1 for c1, stale_count=1 for c2). Existing FleetStatusPollerTests ctor-line updated to pass a RedundancyMetrics instance; all tests still pass. Full Admin.Tests suite 87/87 passing (was 85, +2). Admin project builds 0 errors. Task #201 captures the exporter-wiring follow-up — OpenTelemetry.Extensions.Hosting + OTLP vs Prometheus + /metrics endpoint decision, driven by fleet-ops infra direction.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -1,7 +1,9 @@
|
||||
using Microsoft.AspNetCore.SignalR;
|
||||
using Microsoft.EntityFrameworkCore;
|
||||
using ZB.MOM.WW.OtOpcUa.Admin.Services;
|
||||
using ZB.MOM.WW.OtOpcUa.Configuration;
|
||||
using ZB.MOM.WW.OtOpcUa.Configuration.Entities;
|
||||
using ZB.MOM.WW.OtOpcUa.Configuration.Enums;
|
||||
|
||||
namespace ZB.MOM.WW.OtOpcUa.Admin.Hubs;
|
||||
|
||||
@@ -14,11 +16,13 @@ public sealed class FleetStatusPoller(
|
||||
IServiceScopeFactory scopeFactory,
|
||||
IHubContext<FleetStatusHub> fleetHub,
|
||||
IHubContext<AlertHub> alertHub,
|
||||
ILogger<FleetStatusPoller> logger) : BackgroundService
|
||||
ILogger<FleetStatusPoller> logger,
|
||||
RedundancyMetrics redundancyMetrics) : BackgroundService
|
||||
{
|
||||
public TimeSpan PollInterval { get; init; } = TimeSpan.FromSeconds(5);
|
||||
|
||||
private readonly Dictionary<string, NodeStateSnapshot> _last = new();
|
||||
private readonly Dictionary<string, RedundancyRole> _lastRole = new(StringComparer.Ordinal);
|
||||
|
||||
protected override async Task ExecuteAsync(CancellationToken stoppingToken)
|
||||
{
|
||||
@@ -42,6 +46,10 @@ public sealed class FleetStatusPoller(
|
||||
using var scope = scopeFactory.CreateScope();
|
||||
var db = scope.ServiceProvider.GetRequiredService<OtOpcUaConfigDbContext>();
|
||||
|
||||
var nodes = await db.ClusterNodes.AsNoTracking().ToListAsync(ct);
|
||||
await PollRolesAsync(nodes, ct);
|
||||
UpdateClusterGauges(nodes);
|
||||
|
||||
var rows = await db.ClusterNodeGenerationStates.AsNoTracking()
|
||||
.Join(db.ClusterNodes.AsNoTracking(), s => s.NodeId, n => n.NodeId, (s, n) => new { s, n.ClusterId })
|
||||
.ToListAsync(ct);
|
||||
@@ -85,9 +93,63 @@ public sealed class FleetStatusPoller(
|
||||
}
|
||||
|
||||
/// <summary>Exposed for tests — forces a snapshot reset so stub data re-seeds.</summary>
internal void ResetCache()
{
    // Wipe both per-node caches: last-seen redundancy roles and generation-state
    // snapshots. The next poll then treats every node as a first observation.
    _lastRole.Clear();
    _last.Clear();
}
|
||||
|
||||
/// <summary>
/// Diffs each node's <c>RedundancyRole</c> against the value cached on the previous
/// poll. On a change it records the transition via <c>redundancyMetrics</c> and pushes
/// a <see cref="RoleChangedMessage"/> to both the cluster-scoped SignalR group and the
/// fleet-wide group. The first sighting of a node only seeds the cache — no metric,
/// no message — so service startup does not produce phantom transitions.
/// </summary>
private async Task PollRolesAsync(IReadOnlyList<ClusterNode> nodes, CancellationToken ct)
{
    foreach (var node in nodes)
    {
        var known = _lastRole.TryGetValue(node.NodeId, out var previousRole);
        if (known && previousRole == node.RedundancyRole)
        {
            continue; // role unchanged since the last tick
        }

        _lastRole[node.NodeId] = node.RedundancyRole;

        // Bootstrap case: a node we have never observed gets cached but reported nowhere.
        if (!known)
        {
            continue;
        }

        var fromRole = previousRole.ToString();
        var toRole = node.RedundancyRole.ToString();

        redundancyMetrics.RecordRoleTransition(
            clusterId: node.ClusterId, nodeId: node.NodeId,
            fromRole: fromRole, toRole: toRole);

        var message = new RoleChangedMessage(
            ClusterId: node.ClusterId, NodeId: node.NodeId,
            FromRole: fromRole, ToRole: toRole,
            ObservedAtUtc: DateTime.UtcNow);

        // Cluster-scoped subscribers first, then the fleet-wide audience; a client
        // joined to both groups will receive the message twice (by design).
        await fleetHub.Clients.Group(FleetStatusHub.GroupName(node.ClusterId))
            .SendAsync("RoleChanged", message, ct);
        await fleetHub.Clients.Group(FleetStatusHub.FleetGroup)
            .SendAsync("RoleChanged", message, ct);
    }
}
|
||||
|
||||
/// <summary>
/// Re-publishes the per-cluster redundancy gauge values (primary / secondary / stale
/// counts) from the latest <c>ClusterNode</c> snapshot. Staleness is judged against
/// <c>ClusterNodeService.StaleThreshold</c> — the same cutoff the /hosts page uses —
/// so the gauges and the UI agree. A node with no <c>LastSeenAt</c> counts as stale.
/// </summary>
private void UpdateClusterGauges(IReadOnlyList<ClusterNode> nodes)
{
    var cutoff = DateTime.UtcNow - Services.ClusterNodeService.StaleThreshold;

    foreach (var cluster in nodes.GroupBy(n => n.ClusterId))
    {
        // Single pass per cluster: tally roles and staleness together.
        int primaries = 0, secondaries = 0, staleNodes = 0;
        foreach (var node in cluster)
        {
            if (node.RedundancyRole == RedundancyRole.Primary)
            {
                primaries++;
            }
            else if (node.RedundancyRole == RedundancyRole.Secondary)
            {
                secondaries++;
            }

            if (node.LastSeenAt is null || node.LastSeenAt.Value < cutoff)
            {
                staleNodes++;
            }
        }

        redundancyMetrics.SetClusterCounts(cluster.Key, primaries, secondaries, staleNodes);
    }
}
|
||||
|
||||
/// <summary>
/// Per-node cache entry held in <c>_last</c>; presumably diffed against the next
/// poll's rows to detect generation-state changes — TODO confirm against the
/// (not-visible-here) PollOnceAsync body.
/// </summary>
private readonly record struct NodeStateSnapshot(
    string NodeId, string ClusterId, long? GenerationId,
    string? Status, string? Error, DateTime? AppliedAt, DateTime? SeenAt);
|
||||
}
|
||||
|
||||
/// <summary>
/// Pushed by <see cref="FleetStatusPoller"/> when it observes a change in
/// <see cref="ClusterNode.RedundancyRole"/>. Consumed by the Admin RedundancyTab to trigger
/// an instant reload instead of waiting for the next on-parameter-set poll.
/// Sent to both the cluster-scoped group and the fleet-wide group under the
/// "RoleChanged" method name.
/// </summary>
/// <param name="ClusterId">Cluster owning the node whose role changed.</param>
/// <param name="NodeId">Node that transitioned.</param>
/// <param name="FromRole">Previous role, as the enum's string form.</param>
/// <param name="ToRole">New role, as the enum's string form.</param>
/// <param name="ObservedAtUtc">UTC timestamp taken when the poller detected the change
/// (detection time, not the time the role actually flipped).</param>
public sealed record RoleChangedMessage(
    string ClusterId,
    string NodeId,
    string FromRole,
    string ToRole,
    DateTime ObservedAtUtc);
|
||||
|
||||
Reference in New Issue
Block a user