feat(siteruntime): per-node CertStore actor + trust broadcast to both site nodes (T17)

This commit is contained in:
Joseph Doherty
2026-06-18 03:13:48 -04:00
parent 303385fd98
commit c8d9303031
6 changed files with 526 additions and 0 deletions
@@ -0,0 +1,159 @@
using System.Security.Cryptography.X509Certificates;
using Akka.Actor;
using Akka.Event;
using ZB.MOM.WW.ScadaBridge.Commons.Messages.Management;
using ZB.MOM.WW.ScadaBridge.DataConnectionLayer;
namespace ZB.MOM.WW.ScadaBridge.SiteRuntime.Actors;
/// <summary>
/// Per-node OPC UA certificate-store actor (T17 / D6). Runs on EVERY site
/// node (NOT a singleton) at a well-known name so the Deployment Manager
/// singleton can address it on each node via <see cref="ActorSelection"/>.
///
/// Cert trust is site-local: the trusted-peer PKI store is a directory on each
/// node's file system. A trust/remove decision must therefore reach BOTH site
/// nodes or the two stores diverge across failover (D6). The singleton fans the
/// per-node <see cref="WriteCertToLocalStore"/> / <see cref="RemoveCertFromLocalStore"/>
/// message out; this actor performs the actual file write/delete/enumerate.
///
/// Every request is answered with a <see cref="LocalCertOpAck"/>. Failures are
/// caught, logged and reported through the ack rather than thrown, so a bad
/// request does not restart the actor.
///
/// NOTE on the script-trust forbidden-IO rule: that rule applies only to USER
/// scripts. This is a framework/DCL actor doing legitimate PKI store file I/O,
/// exactly as the underlying OPC UA stack does when it persists rejected certs.
/// </summary>
public class CertStoreActor : ReceiveActor
{
    /// <summary>
    /// Well-known actor name. The Deployment Manager singleton addresses each
    /// site node's instance via <c>{member.Address}/user/{WellKnownName}</c>.
    /// </summary>
    public const string WellKnownName = "cert-store";

    private readonly ILoggingAdapter _log = Context.GetLogger();
    private readonly string _trustedStoreDir;
    private readonly string _rejectedStoreDir;

    /// <summary>
    /// Initializes the actor with the deployment-wide OPC UA options used to
    /// resolve the trusted-peer and rejected store directories.
    /// </summary>
    /// <param name="opcUaGlobalOptions">Deployment-wide OPC UA options (store paths).</param>
    public CertStoreActor(OpcUaGlobalOptions opcUaGlobalOptions)
    {
        // Resolve store directories with the SAME fallback logic
        // RealOpcUaClient.ResolveStorePath uses, so a cert trusted here lands in
        // the exact store the OPC UA client validates against. That helper is a
        // private static method on RealOpcUaClient; the tiny logic is replicated
        // here rather than widening its visibility.
        _trustedStoreDir = ResolveStorePath(opcUaGlobalOptions.TrustedPeerStorePath, "trusted");
        _rejectedStoreDir = ResolveStorePath(opcUaGlobalOptions.RejectedCertificateStorePath, "rejected");

        Receive<WriteCertToLocalStore>(HandleWrite);
        Receive<RemoveCertFromLocalStore>(HandleRemove);
        Receive<ListLocalCerts>(_ => HandleList());
    }

    /// <summary>
    /// Replicates <c>RealOpcUaClient.ResolveStorePath</c>: an empty configured
    /// path falls back to <c>%TEMP%/ScadaBridge/pki/&lt;leaf&gt;</c>.
    /// </summary>
    /// <param name="configured">The configured store path; may be null, empty or whitespace.</param>
    /// <param name="fallbackLeaf">Leaf directory name used when no path is configured.</param>
    /// <returns>The configured path, or the temp-directory fallback.</returns>
    private static string ResolveStorePath(string configured, string fallbackLeaf) =>
        string.IsNullOrWhiteSpace(configured)
            ? Path.Combine(Path.GetTempPath(), "ScadaBridge", "pki", fallbackLeaf)
            : configured;

    /// <summary>
    /// Decodes the Base64 DER payload and writes it to the trusted store as
    /// <c>&lt;thumbprint&gt;.der</c>, creating the store directory if needed.
    /// Replies to the sender with a success or failure <see cref="LocalCertOpAck"/>.
    /// </summary>
    private void HandleWrite(WriteCertToLocalStore msg)
    {
        try
        {
            // Validate the thumbprint and decode the payload BEFORE touching the
            // file system, so a malformed request has no side effects.
            var fileName = FileNameFor(msg.Thumbprint);
            var der = Convert.FromBase64String(msg.DerBase64);

            Directory.CreateDirectory(_trustedStoreDir);
            var path = Path.Combine(_trustedStoreDir, fileName);
            File.WriteAllBytes(path, der);
            _log.Info("Trusted server certificate {Thumbprint} written to {Path}", msg.Thumbprint, path);
            Sender.Tell(new LocalCertOpAck(true, null, null));
        }
        catch (Exception ex)
        {
            _log.Warning(ex, "Failed to write trusted server certificate {Thumbprint}", msg.Thumbprint);
            Sender.Tell(new LocalCertOpAck(false, ex.Message, null));
        }
    }

    /// <summary>
    /// Deletes <c>&lt;thumbprint&gt;.der</c> from the trusted store. A missing
    /// file is treated as success (the remove is a no-op). Replies to the sender
    /// with a success or failure <see cref="LocalCertOpAck"/>.
    /// </summary>
    private void HandleRemove(RemoveCertFromLocalStore msg)
    {
        try
        {
            var path = Path.Combine(_trustedStoreDir, FileNameFor(msg.Thumbprint));
            if (File.Exists(path))
            {
                File.Delete(path);
                _log.Info("Trusted server certificate {Thumbprint} removed from {Path}", msg.Thumbprint, path);
            }
            else
            {
                _log.Info("Trusted server certificate {Thumbprint} not present; remove is a no-op", msg.Thumbprint);
            }

            Sender.Tell(new LocalCertOpAck(true, null, null));
        }
        catch (Exception ex)
        {
            _log.Warning(ex, "Failed to remove trusted server certificate {Thumbprint}", msg.Thumbprint);
            Sender.Tell(new LocalCertOpAck(false, ex.Message, null));
        }
    }

    /// <summary>
    /// Enumerates the trusted store followed by the rejected store and replies
    /// with a <see cref="LocalCertOpAck"/> carrying the combined list.
    /// </summary>
    private void HandleList()
    {
        try
        {
            // AddRange drains the lazy enumerators inside the try block, so any
            // directory-enumeration failure is reported through the ack.
            var certs = new List<TrustedCertInfo>();
            certs.AddRange(EnumerateStore(_trustedStoreDir, rejected: false));
            certs.AddRange(EnumerateStore(_rejectedStoreDir, rejected: true));
            Sender.Tell(new LocalCertOpAck(true, null, certs));
        }
        catch (Exception ex)
        {
            _log.Warning(ex, "Failed to list certificates from PKI stores");
            Sender.Tell(new LocalCertOpAck(false, ex.Message, null));
        }
    }

    /// <summary>
    /// Lazily yields one <see cref="TrustedCertInfo"/> per readable <c>.der</c> /
    /// <c>.crt</c> file in <paramref name="storeDir"/>. Yields nothing when the
    /// directory does not exist; unreadable files are logged and skipped.
    /// </summary>
    /// <param name="storeDir">Directory to scan.</param>
    /// <param name="rejected">Flag copied into every yielded entry.</param>
    private IEnumerable<TrustedCertInfo> EnumerateStore(string storeDir, bool rejected)
    {
        if (!Directory.Exists(storeDir))
        {
            yield break;
        }

        foreach (var file in Directory.EnumerateFiles(storeDir)
                     .Where(f => f.EndsWith(".der", StringComparison.OrdinalIgnoreCase)
                                 || f.EndsWith(".crt", StringComparison.OrdinalIgnoreCase)))
        {
            TrustedCertInfo? info = null;
            try
            {
                // X509Certificate2 is IDisposable; release it as soon as the
                // metadata has been copied out.
                using var cert = X509CertificateLoader.LoadCertificate(File.ReadAllBytes(file));
                info = new TrustedCertInfo(
                    cert.Thumbprint,
                    cert.Subject,
                    cert.Issuer,
                    cert.NotBefore.ToUniversalTime(),
                    cert.NotAfter.ToUniversalTime(),
                    rejected);
            }
            catch (Exception ex)
            {
                // A malformed file in the store should not abort the whole listing.
                _log.Warning(ex, "Skipping unreadable certificate file {File}", file);
            }

            // The yield sits outside the try/catch because C# forbids yielding
            // from a try block that has a catch clause.
            if (info is not null)
            {
                yield return info;
            }
        }
    }

    /// <summary>
    /// Builds the store file name for a certificate thumbprint.
    /// </summary>
    /// <param name="thumbprint">Certificate thumbprint; must be non-empty ASCII hex.</param>
    /// <returns><c>&lt;thumbprint&gt;.der</c>.</returns>
    /// <exception cref="ArgumentException">
    /// The thumbprint is null, empty, or contains a non-hex character.
    /// </exception>
    private static string FileNameFor(string thumbprint)
    {
        // The thumbprint arrives in a message and becomes part of a file path.
        // Restricting it to hex digits rules out "..", path separators and rooted
        // paths, so a write/delete can never escape the trusted store directory.
        if (string.IsNullOrEmpty(thumbprint) || !thumbprint.All(char.IsAsciiHexDigit))
        {
            throw new ArgumentException(
                "Certificate thumbprint must be a non-empty hexadecimal string.",
                nameof(thumbprint));
        }

        return $"{thumbprint}.der";
    }
}
@@ -1,4 +1,5 @@
using Akka.Actor;
using Akka.Cluster;
using Microsoft.Extensions.DependencyInjection;
using Microsoft.Extensions.Logging;
using ZB.MOM.WW.ScadaBridge.Commons.Messages.Artifacts;
@@ -167,6 +168,15 @@ public class DeploymentManagerActor : ReceiveActor, IWithTimers
Receive<ReadTagValuesCommand>(msg =>
Context.ActorSelection("/user/dcl-manager").Tell(msg, Sender));
// T17 / D6 — OPC UA server-certificate trust. Trust is site-local: the
// trusted-peer PKI store is per-node, so a trust/remove MUST reach BOTH
// site nodes (node-a + node-b) or they diverge across failover. This
// singleton fans the corresponding per-node message out to the
// CertStoreActor on EVERY site node; list is answered from this node.
Receive<TrustServerCertCommand>(HandleTrustServerCert);
Receive<RemoveServerCertCommand>(HandleRemoveServerCert);
Receive<ListServerCertsCommand>(HandleListServerCerts);
// Internal startup messages
Receive<StartupConfigsLoaded>(HandleStartupConfigsLoaded);
Receive<SharedScriptsLoaded>(HandleSharedScriptsLoaded);
@@ -701,6 +711,120 @@ public class DeploymentManagerActor : ReceiveActor, IWithTimers
}).PipeTo(sender);
}
// ── T17 / D6 — OPC UA server-certificate trust ──
/// <summary>
/// The base cluster role every site node carries (in addition to its
/// per-site role <c>site-{SiteId}</c>). Used to enumerate the site nodes a
/// trust/remove must reach so node-a and node-b PKI stores stay in lock-step
/// across failover (D6). Matches the role string set in <c>NodeOptions.Role</c>
/// for site hosts (see <c>AkkaHostedService.BuildRoles</c>).
/// <see cref="BroadcastToSiteCertStores"/> selects its targets by testing each
/// Up cluster member for this role.
/// NOTE(review): the role assignment itself lives outside this file; keep this
/// literal in sync with it.
/// </summary>
private const string SiteClusterRole = "Site";
/// <summary>
/// The per-node ask timeout for a cert broadcast. A node that does not ack
/// within this window is reported as a partial failure (the command is not
/// failed wholesale). The same five-second window is reused by
/// <see cref="HandleListServerCerts"/> for its ask to the local cert store.
/// </summary>
private static readonly TimeSpan CertBroadcastTimeout = TimeSpan.FromSeconds(5);
/// <summary>
/// T17 / D6: turns a trust command into a per-node
/// <see cref="WriteCertToLocalStore"/> message and broadcasts it to the
/// <see cref="CertStoreActor"/> on every Up site node, so each node's PKI store
/// receives the certificate.
/// </summary>
/// <param name="command">Trust request carrying the DER payload, thumbprint and connection name.</param>
private void HandleTrustServerCert(TrustServerCertCommand command)
{
    var writeToStore = new WriteCertToLocalStore(command.DerBase64, command.Thumbprint);
    var description = $"trust cert {command.Thumbprint} (connection {command.ConnectionName})";

    BroadcastToSiteCertStores(writeToStore, description);
}
/// <summary>
/// T17 / D6: turns a remove command into a per-node
/// <see cref="RemoveCertFromLocalStore"/> message and broadcasts it to the
/// <see cref="CertStoreActor"/> on every Up site node, so the certificate
/// leaves each node's PKI store.
/// </summary>
/// <param name="command">Remove request carrying the thumbprint to delete.</param>
private void HandleRemoveServerCert(RemoveServerCertCommand command)
{
    var removeFromStore = new RemoveCertFromLocalStore(command.Thumbprint);
    var description = $"remove cert {command.Thumbprint}";

    BroadcastToSiteCertStores(removeFromStore, description);
}
/// <summary>
/// T17: answers a list request from the LOCAL <see cref="CertStoreActor"/>
/// (the one on this actor's own node) and pipes the outcome back to the
/// original sender as a <see cref="CertTrustResult"/>. The result therefore
/// reflects this node's stores only; a failed or timed-out ask is reported as
/// an unsuccessful result rather than a fault.
/// </summary>
/// <param name="command">The list request (carries no data used here).</param>
private void HandleListServerCerts(ListServerCertsCommand command)
{
    // Capture Sender now: it is not valid inside the task continuation.
    var replyTo = Sender;
    var localCertStore = Context.ActorSelection($"/user/{CertStoreActor.WellKnownName}");
    var listing = localCertStore.Ask<LocalCertOpAck>(new ListLocalCerts(), CertBroadcastTimeout);

    listing.ContinueWith(completed =>
    {
        if (!completed.IsCompletedSuccessfully)
        {
            // Covers both a faulted ask and a cancelled one (null Exception).
            var failure = completed.Exception?.GetBaseException().Message ?? "cert store did not respond";
            _logger.LogWarning("Local cert store list failed: {Error}", failure);
            return new CertTrustResult(false, failure, null);
        }

        var localAck = completed.Result;
        return new CertTrustResult(localAck.Success, localAck.Error, localAck.Certs);
    }).PipeTo(replyTo);
}
/// <summary>
/// Fans <paramref name="localMessage"/> out to the <see cref="CertStoreActor"/>
/// on every Up site node, asks each with a short timeout, and aggregates the
/// acks into one <see cref="CertTrustResult"/>. A node that fails to ack is a
/// partial failure (Success=false, first error reported) — it never throws,
/// so a single unreachable standby cannot stall the singleton.
/// </summary>
/// <param name="localMessage">The per-node message to send to each cert store.</param>
/// <param name="description">Human-readable description of the operation, used in log output.</param>
private void BroadcastToSiteCertStores(object localMessage, string description)
{
    // Capture Sender now: it is not valid inside the task continuations.
    var sender = Sender;
    var cluster = Cluster.Get(Context.System);
    var targets = cluster.State.Members
        .Where(m => m.Status == MemberStatus.Up && m.HasRole(SiteClusterRole))
        .Select(m => m.Address)
        .ToList();

    if (targets.Count == 0)
    {
        _logger.LogWarning("No Up site nodes found to {Description}; nothing trusted", description);
        sender.Tell(new CertTrustResult(false, "no site nodes available", null));
        return;
    }

    _logger.LogInformation("Broadcasting cert op to {Count} site node(s): {Description}",
        targets.Count, description);

    // ToArray() forces every ask to be issued here, while Context is still valid.
    var asks = targets.Select(address =>
    {
        var path = new RootActorPath(address) / "user" / CertStoreActor.WellKnownName;
        return Context.ActorSelection(path)
            .Ask<LocalCertOpAck>(localMessage, CertBroadcastTimeout)
            .ContinueWith(t =>
            {
                if (t.IsCompletedSuccessfully)
                {
                    return t.Result;
                }

                // A cancelled (rather than faulted) ask has no Exception; fall
                // back to a fixed reason so the message is never truncated.
                var reason = t.Exception?.GetBaseException().Message ?? "cert store did not respond";
                return new LocalCertOpAck(false, $"node {address} did not ack: {reason}", null);
            });
    }).ToArray();

    Task.WhenAll(asks).ContinueWith(t =>
    {
        // ContinueWith on WhenAll over per-task error-trapping completions
        // never faults, so t.Result is always populated.
        var acks = t.Result;
        var allSucceeded = acks.All(a => a.Success);
        var firstError = acks.FirstOrDefault(a => !a.Success)?.Error;
        if (!allSucceeded)
        {
            _logger.LogWarning("Cert broadcast partial/total failure ({Description}): {Error}",
                description, firstError);
        }

        return new CertTrustResult(allSucceeded, firstError, null);
    }).PipeTo(sender);
}
// ── DCL connection management ──
/// <summary>