feat: wire all health metrics and add instance counts to dashboard

Wired ISiteHealthCollector calls for script errors (ScriptExecutionActor),
alarm eval errors (AlarmActor), dead letters (DeadLetterMonitorActor), and
S&F buffer depth placeholder. Added instance count tracking (deployed/
enabled/disabled) to SiteHealthReport via DeploymentManagerActor. Updated
Health Dashboard UI to show instance counts per site. All metrics flow
through the existing health report pipeline via ClusterClient.
This commit is contained in:
Joseph Doherty
2026-03-18 00:57:49 -04:00
parent 88b5f6cb54
commit f165ca2774
18 changed files with 151 additions and 28 deletions

View File

@@ -4,6 +4,7 @@ using Microsoft.Extensions.Logging;
using ScadaLink.Commons.Messages.Streaming;
using ScadaLink.Commons.Types.Enums;
using ScadaLink.Commons.Types.Flattening;
using ScadaLink.HealthMonitoring;
using ScadaLink.SiteRuntime.Scripts;
using System.Text.Json;
@@ -34,6 +35,7 @@ public class AlarmActor : ReceiveActor
private readonly SharedScriptLibrary _sharedScriptLibrary;
private readonly SiteRuntimeOptions _options;
private readonly ILogger _logger;
private readonly ISiteHealthCollector? _healthCollector;
private AlarmState _currentState = AlarmState.Normal;
private readonly AlarmTriggerType _triggerType;
@@ -56,7 +58,8 @@ public class AlarmActor : ReceiveActor
Script<object?>? onTriggerCompiledScript,
SharedScriptLibrary sharedScriptLibrary,
SiteRuntimeOptions options,
ILogger logger)
ILogger logger,
ISiteHealthCollector? healthCollector = null)
{
_alarmName = alarmName;
_instanceName = instanceName;
@@ -64,6 +67,7 @@ public class AlarmActor : ReceiveActor
_sharedScriptLibrary = sharedScriptLibrary;
_options = options;
_logger = logger;
_healthCollector = healthCollector;
_priority = alarmConfig.PriorityLevel;
_onTriggerScriptName = alarmConfig.OnTriggerScriptCanonicalName;
_onTriggerCompiledScript = onTriggerCompiledScript;
@@ -164,6 +168,7 @@ public class AlarmActor : ReceiveActor
}
catch (Exception ex)
{
_healthCollector?.IncrementAlarmError();
// Alarm evaluation errors logged, actor continues
_logger.LogError(ex,
"Alarm {Alarm} evaluation error on {Instance}",

View File

@@ -5,6 +5,7 @@ using ScadaLink.Commons.Messages.DebugView;
using ScadaLink.Commons.Messages.Deployment;
using ScadaLink.Commons.Messages.Lifecycle;
using ScadaLink.Commons.Types.Enums;
using ScadaLink.HealthMonitoring;
using ScadaLink.SiteRuntime.Persistence;
using ScadaLink.SiteRuntime.Scripts;
using ScadaLink.SiteRuntime.Streaming;
@@ -30,7 +31,9 @@ public class DeploymentManagerActor : ReceiveActor, IWithTimers
private readonly SiteRuntimeOptions _options;
private readonly ILogger<DeploymentManagerActor> _logger;
private readonly IActorRef? _dclManager;
private readonly ISiteHealthCollector? _healthCollector;
private readonly Dictionary<string, IActorRef> _instanceActors = new();
private int _totalDeployedCount;
public ITimerScheduler Timers { get; set; } = null!;
@@ -41,7 +44,8 @@ public class DeploymentManagerActor : ReceiveActor, IWithTimers
SiteStreamManager? streamManager,
SiteRuntimeOptions options,
ILogger<DeploymentManagerActor> logger,
IActorRef? dclManager = null)
IActorRef? dclManager = null,
ISiteHealthCollector? healthCollector = null)
{
_storage = storage;
_compilationService = compilationService;
@@ -49,6 +53,7 @@ public class DeploymentManagerActor : ReceiveActor, IWithTimers
_streamManager = streamManager;
_options = options;
_dclManager = dclManager;
_healthCollector = healthCollector;
_logger = logger;
// Lifecycle commands
@@ -123,9 +128,11 @@ public class DeploymentManagerActor : ReceiveActor, IWithTimers
}
var enabledConfigs = msg.Configs.Where(c => c.IsEnabled).ToList();
_totalDeployedCount = msg.Configs.Count;
_logger.LogInformation(
"Loaded {Total} deployed configs ({Enabled} enabled) from SQLite",
msg.Configs.Count, enabledConfigs.Count);
UpdateInstanceCounts();
if (enabledConfigs.Count == 0)
return;
@@ -200,6 +207,8 @@ public class DeploymentManagerActor : ReceiveActor, IWithTimers
// Create the Instance Actor immediately (no existing actor to replace)
CreateInstanceActor(instanceName, command.FlattenedConfigurationJson);
_totalDeployedCount++;
UpdateInstanceCounts();
// Persist to SQLite and clear static overrides asynchronously
var sender = Sender;
@@ -257,6 +266,8 @@ public class DeploymentManagerActor : ReceiveActor, IWithTimers
_instanceActors.Remove(instanceName);
}
UpdateInstanceCounts();
var sender = Sender;
_storage.SetInstanceEnabledAsync(instanceName, false).ContinueWith(t =>
{
@@ -313,6 +324,7 @@ public class DeploymentManagerActor : ReceiveActor, IWithTimers
{
CreateInstanceActor(instanceName, result.Config.ConfigJson);
}
UpdateInstanceCounts();
result.OriginalSender.Tell(new InstanceLifecycleResponse(
result.Command.CommandId, instanceName, true, null, DateTimeOffset.UtcNow));
@@ -333,6 +345,8 @@ public class DeploymentManagerActor : ReceiveActor, IWithTimers
Context.Stop(actor);
_instanceActors.Remove(instanceName);
}
_totalDeployedCount = Math.Max(0, _totalDeployedCount - 1);
UpdateInstanceCounts();
var sender = Sender;
_storage.RemoveDeployedConfigAsync(instanceName).ContinueWith(t =>
@@ -536,7 +550,8 @@ public class DeploymentManagerActor : ReceiveActor, IWithTimers
_streamManager,
_options,
loggerFactory.CreateLogger<InstanceActor>(),
_dclManager));
_dclManager,
_healthCollector));
var actorRef = Context.ActorOf(props, instanceName);
_instanceActors[instanceName] = actorRef;
@@ -549,6 +564,18 @@ public class DeploymentManagerActor : ReceiveActor, IWithTimers
/// </summary>
internal int InstanceActorCount => _instanceActors.Count;
/// <summary>
/// Pushes the current instance counts to the health collector, if one was supplied.
/// Deployed = <c>_totalDeployedCount</c>, enabled = number of running instance actors,
/// disabled = deployed minus enabled, clamped so it is never negative.
/// </summary>
private void UpdateInstanceCounts()
{
    var deployed = _totalDeployedCount;
    var enabled = _instanceActors.Count;

    // _totalDeployedCount and _instanceActors are updated separately, so guard against
    // the running-actor count exceeding the deployed total and producing a negative value.
    var disabled = Math.Max(0, deployed - enabled);

    _healthCollector?.SetInstanceCounts(
        deployed: deployed,
        enabled: enabled,
        disabled: disabled);
}
// ── Internal messages ──
/// <summary>
/// Internal message carrying a list of deployed instance configs and an optional error string.
/// NOTE(review): presumably delivered to this actor once the startup load from storage
/// completes, with <c>Error</c> set on failure — confirm against the sender.
/// </summary>
internal record StartupConfigsLoaded(List<DeployedInstance> Configs, string? Error);

View File

@@ -8,6 +8,7 @@ using ScadaLink.Commons.Messages.ScriptExecution;
using ScadaLink.Commons.Messages.Streaming;
using ScadaLink.Commons.Types.Enums;
using ScadaLink.Commons.Types.Flattening;
using ScadaLink.HealthMonitoring;
using ScadaLink.SiteRuntime.Persistence;
using ScadaLink.SiteRuntime.Scripts;
using ScadaLink.SiteRuntime.Streaming;
@@ -37,6 +38,7 @@ public class InstanceActor : ReceiveActor
private readonly SiteStreamManager? _streamManager;
private readonly SiteRuntimeOptions _options;
private readonly ILogger _logger;
private readonly ISiteHealthCollector? _healthCollector;
private readonly Dictionary<string, object?> _attributes = new();
private readonly Dictionary<string, string> _attributeQualities = new();
private readonly Dictionary<string, AlarmState> _alarmStates = new();
@@ -61,7 +63,8 @@ public class InstanceActor : ReceiveActor
SiteStreamManager? streamManager,
SiteRuntimeOptions options,
ILogger logger,
IActorRef? dclManager = null)
IActorRef? dclManager = null,
ISiteHealthCollector? healthCollector = null)
{
_instanceUniqueName = instanceUniqueName;
_storage = storage;
@@ -71,6 +74,7 @@ public class InstanceActor : ReceiveActor
_options = options;
_logger = logger;
_dclManager = dclManager;
_healthCollector = healthCollector;
// Deserialize the flattened configuration
_configuration = JsonSerializer.Deserialize<FlattenedConfiguration>(configJson);
@@ -474,7 +478,8 @@ public class InstanceActor : ReceiveActor
script,
_sharedScriptLibrary,
_options,
_logger));
_logger,
_healthCollector));
var actorRef = Context.ActorOf(props, $"script-{script.CanonicalName}");
_scriptActors[script.CanonicalName] = actorRef;
@@ -516,7 +521,8 @@ public class InstanceActor : ReceiveActor
onTriggerScript,
_sharedScriptLibrary,
_options,
_logger));
_logger,
_healthCollector));
var actorRef = Context.ActorOf(props, $"alarm-{alarm.CanonicalName}");
_alarmActors[alarm.CanonicalName] = actorRef;

View File

@@ -4,6 +4,7 @@ using Microsoft.Extensions.Logging;
using ScadaLink.Commons.Messages.ScriptExecution;
using ScadaLink.Commons.Messages.Streaming;
using ScadaLink.Commons.Types.Flattening;
using ScadaLink.HealthMonitoring;
using ScadaLink.SiteRuntime.Scripts;
using System.Text.Json;
@@ -29,6 +30,7 @@ public class ScriptActor : ReceiveActor, IWithTimers
private readonly SharedScriptLibrary _sharedScriptLibrary;
private readonly SiteRuntimeOptions _options;
private readonly ILogger _logger;
private readonly ISiteHealthCollector? _healthCollector;
private Script<object?>? _compiledScript;
private ScriptTriggerConfig? _triggerConfig;
@@ -46,7 +48,8 @@ public class ScriptActor : ReceiveActor, IWithTimers
ResolvedScript scriptConfig,
SharedScriptLibrary sharedScriptLibrary,
SiteRuntimeOptions options,
ILogger logger)
ILogger logger,
ISiteHealthCollector? healthCollector = null)
{
_scriptName = scriptName;
_instanceName = instanceName;
@@ -55,6 +58,7 @@ public class ScriptActor : ReceiveActor, IWithTimers
_sharedScriptLibrary = sharedScriptLibrary;
_options = options;
_logger = logger;
_healthCollector = healthCollector;
_minTimeBetweenRuns = scriptConfig.MinTimeBetweenRuns;
// Parse trigger configuration
@@ -207,7 +211,8 @@ public class ScriptActor : ReceiveActor, IWithTimers
_options,
replyTo,
correlationId,
_logger));
_logger,
_healthCollector));
Context.ActorOf(props, executionId);
}

View File

@@ -2,6 +2,7 @@ using Akka.Actor;
using Microsoft.CodeAnalysis.Scripting;
using Microsoft.Extensions.Logging;
using ScadaLink.Commons.Messages.ScriptExecution;
using ScadaLink.HealthMonitoring;
using ScadaLink.SiteRuntime.Scripts;
namespace ScadaLink.SiteRuntime.Actors;
@@ -28,7 +29,8 @@ public class ScriptExecutionActor : ReceiveActor
SiteRuntimeOptions options,
IActorRef replyTo,
string correlationId,
ILogger logger)
ILogger logger,
ISiteHealthCollector? healthCollector = null)
{
// Immediately begin execution
var self = Self;
@@ -37,7 +39,7 @@ public class ScriptExecutionActor : ReceiveActor
ExecuteScript(
scriptName, instanceName, compiledScript, parameters, callDepth,
instanceActor, sharedScriptLibrary, options, replyTo, correlationId,
self, parent, logger);
self, parent, logger, healthCollector);
}
private static void ExecuteScript(
@@ -53,7 +55,8 @@ public class ScriptExecutionActor : ReceiveActor
string correlationId,
IActorRef self,
IActorRef parent,
ILogger logger)
ILogger logger,
ISiteHealthCollector? healthCollector)
{
var timeout = TimeSpan.FromSeconds(options.ScriptExecutionTimeoutSeconds);
@@ -93,6 +96,7 @@ public class ScriptExecutionActor : ReceiveActor
}
catch (OperationCanceledException)
{
healthCollector?.IncrementScriptError();
var errorMsg = $"Script '{scriptName}' on instance '{instanceName}' timed out after {timeout.TotalSeconds}s";
logger.LogWarning(errorMsg);
@@ -105,6 +109,7 @@ public class ScriptExecutionActor : ReceiveActor
}
catch (Exception ex)
{
healthCollector?.IncrementScriptError();
// WP-32: Failures logged to site event log; script NOT disabled after failure
var errorMsg = $"Script '{scriptName}' on instance '{instanceName}' failed: {ex.Message}";
logger.LogError(ex, "Script execution failed: {Script} on {Instance}", scriptName, instanceName);

View File

@@ -22,6 +22,7 @@
<ItemGroup>
<ProjectReference Include="../ScadaLink.Commons/ScadaLink.Commons.csproj" />
<ProjectReference Include="../ScadaLink.HealthMonitoring/ScadaLink.HealthMonitoring.csproj" />
</ItemGroup>
</Project>