feat(siteruntime): per-node CertStore actor + trust broadcast to both site nodes (T17)

This commit is contained in:
Joseph Doherty
2026-06-18 03:13:48 -04:00
parent 303385fd98
commit c8d9303031
6 changed files with 526 additions and 0 deletions
@@ -0,0 +1,92 @@
namespace ZB.MOM.WW.ScadaBridge.Commons.Messages.Management;
// ─────────────────────────────────────────────────────────────────────────────
// T17 / D6 — OPC UA server-certificate trust management.
//
// Cert trust is SITE-LOCAL: there is no central persistence of trusted server
// certificates. The trusted-peer PKI store lives on each site node's file
// system, so a trust/remove decision must reach BOTH site nodes (node-a and
// node-b) or the two PKI stores diverge across failover (Decision D6).
//
// The public commands below are handled by the site Deployment Manager
// singleton (active node only). For trust/remove it broadcasts the
// corresponding per-node internal message to the per-node CertStoreActor on
// EVERY site cluster node; for list it answers from the local node.
// ─────────────────────────────────────────────────────────────────────────────
/// <summary>
/// Trust an OPC UA server certificate at every site node. Carries the raw
/// DER bytes (base64-encoded) so the certificate can be written into each
/// node's trusted-peer PKI store. The thumbprint is the store filename key.
/// </summary>
/// <remarks>
/// The Deployment Manager forwards <paramref name="DerBase64"/> and
/// <paramref name="Thumbprint"/> unchanged in a <see cref="WriteCertToLocalStore"/>
/// to each targeted site node and replies to the sender with a single
/// aggregated <see cref="CertTrustResult"/>. <paramref name="ConnectionName"/>
/// is only included in the broadcast's log description.
/// </remarks>
/// <param name="ConnectionName">The data-connection the certificate was captured from (diagnostics / correlation only).</param>
/// <param name="DerBase64">The server certificate's DER encoding, base64-encoded.</param>
/// <param name="Thumbprint">The certificate thumbprint — used as the store filename key.</param>
public record TrustServerCertCommand(string ConnectionName, string DerBase64, string Thumbprint);
/// <summary>
/// Remove a previously-trusted OPC UA server certificate from every site
/// node's trusted-peer PKI store, identified by thumbprint.
/// </summary>
/// <remarks>
/// The Deployment Manager forwards the thumbprint in a
/// <see cref="RemoveCertFromLocalStore"/> to each targeted site node and
/// replies with a single aggregated <see cref="CertTrustResult"/>. A node
/// whose store does not contain the certificate still acks success, so
/// removing an absent certificate is not an error.
/// </remarks>
/// <param name="Thumbprint">The thumbprint of the certificate to remove.</param>
public record RemoveServerCertCommand(string Thumbprint);
/// <summary>
/// List the certificates currently present in this site's trusted-peer and
/// rejected PKI stores. Answered from the local node (the Deployment Manager
/// singleton's own node).
/// </summary>
/// <remarks>
/// The reply is a <see cref="CertTrustResult"/> whose <c>Certs</c> carries the
/// listing. Only the singleton's own node is queried; the other site node's
/// stores are not consulted for this command.
/// </remarks>
public record ListServerCertsCommand();
/// <summary>
/// Read-only projection of a certificate found in a site PKI store.
/// </summary>
/// <remarks>
/// Values are read from the certificate file itself (not from its file name)
/// when a store directory is enumerated; the validity dates are converted to
/// UTC before being placed in this record.
/// </remarks>
/// <param name="Thumbprint">The certificate thumbprint.</param>
/// <param name="Subject">The certificate subject distinguished name.</param>
/// <param name="Issuer">The certificate issuer distinguished name.</param>
/// <param name="NotBeforeUtc">Validity start (UTC).</param>
/// <param name="NotAfterUtc">Validity end (UTC).</param>
/// <param name="Rejected">True if the certificate is in the rejected store; false if trusted.</param>
public record TrustedCertInfo(
string Thumbprint,
string Subject,
string Issuer,
DateTime NotBeforeUtc,
DateTime NotAfterUtc,
bool Rejected);
/// <summary>
/// Aggregate result of a cert-trust command. For trust/remove, <paramref name="Success"/>
/// is true only when every targeted site node (each cluster member that is Up
/// and carries the site role) acked success; a node that reports an error or
/// does not ack within the timeout makes the whole result unsuccessful, as does
/// finding no targetable node at all. <paramref name="Error"/> carries the
/// first failed node's error (or a "did not ack" note for a node that timed out).
/// For list, <paramref name="Certs"/> carries the local node's store contents.
/// </summary>
/// <param name="Success">True when the operation succeeded on all targeted nodes.</param>
/// <param name="Error">First error encountered, or null on success.</param>
/// <param name="Certs">Listed certificates (list command only), otherwise null.</param>
public record CertTrustResult(bool Success, string? Error, IReadOnlyList<TrustedCertInfo>? Certs);
// ── Per-node internal messages (CertStoreActor wire protocol) ──
// Sent by the Deployment Manager singleton to each site node's CertStoreActor,
// or used directly in tests. Not part of the public management surface.
/// <summary>
/// Per-node: decode <paramref name="DerBase64"/> and write it into the local trusted-peer store as <c>&lt;Thumbprint&gt;.der</c>.
/// The store directory is created if it does not exist. The receiving actor
/// replies with a <see cref="LocalCertOpAck"/>; a decode or file-system error
/// is reported in the ack rather than thrown.
/// </summary>
/// <param name="DerBase64">The certificate's DER encoding, base64-encoded.</param>
/// <param name="Thumbprint">The thumbprint used as the store filename key.</param>
public record WriteCertToLocalStore(string DerBase64, string Thumbprint);
/// <summary>
/// Per-node: delete the trusted-peer store file matching <paramref name="Thumbprint"/>
/// (the file named <c>&lt;Thumbprint&gt;.der</c>). If no such file exists the
/// operation is a no-op and is still acked as success via <see cref="LocalCertOpAck"/>.
/// </summary>
/// <param name="Thumbprint">The thumbprint of the certificate to remove.</param>
public record RemoveCertFromLocalStore(string Thumbprint);
/// <summary>
/// Per-node: enumerate the local trusted-peer and rejected stores. Only files
/// ending in <c>.der</c> or <c>.crt</c> are considered, and a file that cannot
/// be parsed as a certificate is skipped rather than failing the listing. The
/// result is returned in <see cref="LocalCertOpAck"/>'s <c>Certs</c>.
/// </summary>
public record ListLocalCerts();
/// <summary>
/// Per-node ack for a <see cref="WriteCertToLocalStore"/>, <see cref="RemoveCertFromLocalStore"/>
/// or <see cref="ListLocalCerts"/> operation. The Deployment Manager also
/// synthesizes a failed ack (Success = false) for a node that does not answer
/// within the broadcast timeout, so a failed ack does not necessarily
/// originate from the node itself.
/// </summary>
/// <param name="Success">True if the local store operation succeeded.</param>
/// <param name="Error">The error message on failure, otherwise null.</param>
/// <param name="Certs">Listed certificates (list only), otherwise null.</param>
public record LocalCertOpAck(bool Success, string? Error, IReadOnlyList<TrustedCertInfo>? Certs);
@@ -800,6 +800,21 @@ akka {{
_logger.LogInformation("Data Connection Layer manager actor created");
}
// T17 / D6 — per-node OPC UA certificate-store actor. Created on EVERY
// site node (NOT a singleton) at a well-known name so the Deployment
// Manager singleton can fan a trust/remove out to BOTH nodes' PKI stores
// (node-a + node-b) and keep them in lock-step across failover. It needs
// the same deployment-wide OpcUaGlobalOptions the DCL manager uses so a
// trusted cert lands in the exact store RealOpcUaClient validates against.
var certStoreOpcUaOptions = _serviceProvider
.GetService<IOptions<ZB.MOM.WW.ScadaBridge.DataConnectionLayer.OpcUaGlobalOptions>>()?.Value
?? new ZB.MOM.WW.ScadaBridge.DataConnectionLayer.OpcUaGlobalOptions();
_actorSystem!.ActorOf(
Props.Create(() => new ZB.MOM.WW.ScadaBridge.SiteRuntime.Actors.CertStoreActor(certStoreOpcUaOptions)),
ZB.MOM.WW.ScadaBridge.SiteRuntime.Actors.CertStoreActor.WellKnownName);
_logger.LogInformation("Per-node CertStoreActor created at well-known name '{Name}' (T17/D6)",
ZB.MOM.WW.ScadaBridge.SiteRuntime.Actors.CertStoreActor.WellKnownName);
// Resolve the health collector for the Deployment Manager
var siteHealthCollector = _serviceProvider.GetService<ZB.MOM.WW.ScadaBridge.HealthMonitoring.ISiteHealthCollector>();
siteHealthCollector?.SetNodeHostname(_nodeOptions.NodeHostname);
@@ -0,0 +1,159 @@
using System.Security.Cryptography.X509Certificates;
using Akka.Actor;
using Akka.Event;
using ZB.MOM.WW.ScadaBridge.Commons.Messages.Management;
using ZB.MOM.WW.ScadaBridge.DataConnectionLayer;
namespace ZB.MOM.WW.ScadaBridge.SiteRuntime.Actors;
/// <summary>
/// Per-node OPC UA certificate-store actor (T17 / D6). Runs on EVERY site
/// node (NOT a singleton) at a well-known name so the Deployment Manager
/// singleton can address it on each node via <see cref="ActorSelection"/>.
///
/// Cert trust is site-local: the trusted-peer PKI store is a directory on each
/// node's file system. A trust/remove decision must therefore reach BOTH site
/// nodes or the two stores diverge across failover (D6). The singleton fans the
/// per-node <see cref="WriteCertToLocalStore"/> / <see cref="RemoveCertFromLocalStore"/>
/// message out; this actor performs the actual file write/delete/enumerate.
///
/// Every request is answered with a <see cref="LocalCertOpAck"/>; failures are
/// reported in the ack (and logged) rather than thrown, so a bad request never
/// restarts the actor.
///
/// NOTE on the script-trust forbidden-IO rule: that rule applies only to USER
/// scripts. This is a framework/DCL actor doing legitimate PKI store file I/O,
/// exactly as the underlying OPC UA stack does when it persists rejected certs.
///
/// NOTE(review): the supplied thumbprint is used only as the file name; it is
/// not cross-checked against the thumbprint of the DER payload. Confirm the
/// caller guarantees the two match.
/// </summary>
public class CertStoreActor : ReceiveActor
{
    /// <summary>
    /// Well-known actor name. The Deployment Manager singleton addresses each
    /// site node's instance via <c>{member.Address}/user/{WellKnownName}</c>.
    /// </summary>
    public const string WellKnownName = "cert-store";

    /// <summary>
    /// Characters that must never appear in a thumbprint used as a file-name
    /// stem. Both directory separators are listed explicitly because
    /// <see cref="Path.GetInvalidFileNameChars"/> is platform-dependent and
    /// does not include the backslash on non-Windows hosts.
    /// </summary>
    private static readonly char[] UnsafeThumbprintChars =
        Path.GetInvalidFileNameChars().Concat(new[] { '/', '\\' }).Distinct().ToArray();

    private readonly ILoggingAdapter _log = Context.GetLogger();
    private readonly string _trustedStoreDir;
    private readonly string _rejectedStoreDir;

    /// <summary>
    /// Initializes the actor with the deployment-wide OPC UA options used to
    /// resolve the trusted-peer and rejected store directories.
    /// </summary>
    /// <param name="opcUaGlobalOptions">Deployment-wide OPC UA options (store paths).</param>
    public CertStoreActor(OpcUaGlobalOptions opcUaGlobalOptions)
    {
        // Resolve store directories with the SAME fallback logic
        // RealOpcUaClient.ResolveStorePath uses, so a cert trusted here lands in
        // the exact store the OPC UA client validates against. That helper is a
        // private static method on RealOpcUaClient; the tiny logic is replicated
        // here rather than widening its visibility.
        _trustedStoreDir = ResolveStorePath(opcUaGlobalOptions.TrustedPeerStorePath, "trusted");
        _rejectedStoreDir = ResolveStorePath(opcUaGlobalOptions.RejectedCertificateStorePath, "rejected");

        Receive<WriteCertToLocalStore>(HandleWrite);
        Receive<RemoveCertFromLocalStore>(HandleRemove);
        Receive<ListLocalCerts>(_ => HandleList());
    }

    /// <summary>
    /// Replicates <c>RealOpcUaClient.ResolveStorePath</c>: an empty configured
    /// path falls back to <c>%TEMP%/ScadaBridge/pki/&lt;leaf&gt;</c>.
    /// </summary>
    private static string ResolveStorePath(string configured, string fallbackLeaf) =>
        string.IsNullOrWhiteSpace(configured)
            ? Path.Combine(Path.GetTempPath(), "ScadaBridge", "pki", fallbackLeaf)
            : configured;

    /// <summary>
    /// Returns true when <paramref name="thumbprint"/> can be used verbatim as
    /// a file-name stem inside a store directory: it is non-blank and contains
    /// no directory separator or other character that is invalid in a file
    /// name. This is what keeps a remote message from steering a write/delete
    /// outside the PKI store (e.g. <c>../../x</c>).
    /// </summary>
    private static bool IsSafeThumbprint(string? thumbprint) =>
        !string.IsNullOrWhiteSpace(thumbprint)
        && thumbprint.IndexOfAny(UnsafeThumbprintChars) < 0;

    /// <summary>
    /// Logs and acks (Success = false) a request whose thumbprint failed
    /// <see cref="IsSafeThumbprint"/>. No file-system access is performed.
    /// </summary>
    private void RejectUnsafeThumbprint(string operation, string? thumbprint)
    {
        _log.Warning(
            "Refusing to {Operation} server certificate: thumbprint '{Thumbprint}' is not a safe store file name",
            operation,
            thumbprint ?? "<null>");
        Sender.Tell(new LocalCertOpAck(
            false,
            "thumbprint is empty or contains characters that are not valid in a file name",
            null));
    }

    /// <summary>
    /// Decodes the base64 DER payload and writes it to
    /// <c>&lt;trusted store&gt;/&lt;Thumbprint&gt;.der</c>, creating the store
    /// directory if needed and overwriting an existing file of the same name.
    /// </summary>
    private void HandleWrite(WriteCertToLocalStore msg)
    {
        try
        {
            if (!IsSafeThumbprint(msg.Thumbprint))
            {
                RejectUnsafeThumbprint("write", msg.Thumbprint);
                return;
            }

            Directory.CreateDirectory(_trustedStoreDir);
            var der = Convert.FromBase64String(msg.DerBase64);
            var path = Path.Combine(_trustedStoreDir, FileNameFor(msg.Thumbprint));
            File.WriteAllBytes(path, der);
            _log.Info("Trusted server certificate {Thumbprint} written to {Path}", msg.Thumbprint, path);
            Sender.Tell(new LocalCertOpAck(true, null, null));
        }
        catch (Exception ex)
        {
            _log.Warning(ex, "Failed to write trusted server certificate {Thumbprint}", msg.Thumbprint);
            Sender.Tell(new LocalCertOpAck(false, ex.Message, null));
        }
    }

    /// <summary>
    /// Deletes <c>&lt;trusted store&gt;/&lt;Thumbprint&gt;.der</c>. A missing
    /// file is a successful no-op.
    /// </summary>
    /// <remarks>
    /// NOTE(review): only the exact <c>&lt;Thumbprint&gt;.der</c> file name is
    /// matched. A certificate present in the store under a different file name
    /// (the listing also reports <c>.crt</c> files) is not removed by this
    /// handler — confirm that is acceptable.
    /// </remarks>
    private void HandleRemove(RemoveCertFromLocalStore msg)
    {
        try
        {
            if (!IsSafeThumbprint(msg.Thumbprint))
            {
                RejectUnsafeThumbprint("remove", msg.Thumbprint);
                return;
            }

            var path = Path.Combine(_trustedStoreDir, FileNameFor(msg.Thumbprint));
            if (File.Exists(path))
            {
                File.Delete(path);
                _log.Info("Trusted server certificate {Thumbprint} removed from {Path}", msg.Thumbprint, path);
            }
            else
            {
                _log.Info("Trusted server certificate {Thumbprint} not present; remove is a no-op", msg.Thumbprint);
            }

            Sender.Tell(new LocalCertOpAck(true, null, null));
        }
        catch (Exception ex)
        {
            _log.Warning(ex, "Failed to remove trusted server certificate {Thumbprint}", msg.Thumbprint);
            Sender.Tell(new LocalCertOpAck(false, ex.Message, null));
        }
    }

    /// <summary>
    /// Replies with the contents of the trusted store followed by the contents
    /// of the rejected store.
    /// </summary>
    private void HandleList()
    {
        try
        {
            var certs = new List<TrustedCertInfo>();
            // AddRange drains the lazy enumerations here, inside the try, so a
            // directory-level I/O failure is reported in the ack.
            certs.AddRange(EnumerateStore(_trustedStoreDir, rejected: false));
            certs.AddRange(EnumerateStore(_rejectedStoreDir, rejected: true));
            Sender.Tell(new LocalCertOpAck(true, null, certs));
        }
        catch (Exception ex)
        {
            _log.Warning(ex, "Failed to list certificates from PKI stores");
            Sender.Tell(new LocalCertOpAck(false, ex.Message, null));
        }
    }

    /// <summary>
    /// Lazily yields one <see cref="TrustedCertInfo"/> per parseable
    /// <c>.der</c> / <c>.crt</c> file in <paramref name="storeDir"/>. A missing
    /// directory yields nothing; an unreadable file is logged and skipped.
    /// </summary>
    /// <param name="storeDir">The store directory to scan (top level only).</param>
    /// <param name="rejected">Value stamped on each yielded entry's <c>Rejected</c> flag.</param>
    private IEnumerable<TrustedCertInfo> EnumerateStore(string storeDir, bool rejected)
    {
        if (!Directory.Exists(storeDir))
        {
            yield break;
        }

        foreach (var file in Directory.EnumerateFiles(storeDir)
                     .Where(f => f.EndsWith(".der", StringComparison.OrdinalIgnoreCase)
                                 || f.EndsWith(".crt", StringComparison.OrdinalIgnoreCase)))
        {
            TrustedCertInfo? info = null;
            try
            {
                // Dispose the certificate once its fields are copied out; the
                // projection holds only strings and dates.
                using var cert = X509CertificateLoader.LoadCertificate(File.ReadAllBytes(file));
                info = new TrustedCertInfo(
                    cert.Thumbprint,
                    cert.Subject,
                    cert.Issuer,
                    cert.NotBefore.ToUniversalTime(),
                    cert.NotAfter.ToUniversalTime(),
                    rejected);
            }
            catch (Exception ex)
            {
                // A malformed file in the store should not abort the whole listing.
                _log.Warning(ex, "Skipping unreadable certificate file {File}", file);
            }

            // Yield outside the try: C# forbids yield inside a try that has a catch.
            if (info is not null)
            {
                yield return info;
            }
        }
    }

    /// <summary>Store file name for a (validated) thumbprint.</summary>
    private static string FileNameFor(string thumbprint) => $"{thumbprint}.der";
}
@@ -1,4 +1,5 @@
using Akka.Actor;
using Akka.Cluster;
using Microsoft.Extensions.DependencyInjection;
using Microsoft.Extensions.Logging;
using ZB.MOM.WW.ScadaBridge.Commons.Messages.Artifacts;
@@ -167,6 +168,15 @@ public class DeploymentManagerActor : ReceiveActor, IWithTimers
Receive<ReadTagValuesCommand>(msg =>
Context.ActorSelection("/user/dcl-manager").Tell(msg, Sender));
// T17 / D6 — OPC UA server-certificate trust. Trust is site-local: the
// trusted-peer PKI store is per-node, so a trust/remove MUST reach BOTH
// site nodes (node-a + node-b) or they diverge across failover. This
// singleton fans the corresponding per-node message out to the
// CertStoreActor on EVERY site node; list is answered from this node.
Receive<TrustServerCertCommand>(HandleTrustServerCert);
Receive<RemoveServerCertCommand>(HandleRemoveServerCert);
Receive<ListServerCertsCommand>(HandleListServerCerts);
// Internal startup messages
Receive<StartupConfigsLoaded>(HandleStartupConfigsLoaded);
Receive<SharedScriptsLoaded>(HandleSharedScriptsLoaded);
@@ -701,6 +711,120 @@ public class DeploymentManagerActor : ReceiveActor, IWithTimers
}).PipeTo(sender);
}
// ── T17 / D6 — OPC UA server-certificate trust ──
/// <summary>
/// The base cluster role every site node carries (in addition to its
/// per-site role <c>site-{SiteId}</c>). Used to enumerate the site nodes a
/// trust/remove must reach so node-a and node-b PKI stores stay in lock-step
/// across failover (D6). Matches the role string set in <c>NodeOptions.Role</c>
/// for site hosts (see <c>AkkaHostedService.BuildRoles</c>).
/// </summary>
/// <remarks>
/// NOTE(review): the broadcast filters on this base role only, not on the
/// per-site role. That is correct only if the cluster this actor sees contains
/// the nodes of a single site — confirm against the cluster topology.
/// </remarks>
private const string SiteClusterRole = "Site";
/// <summary>
/// The per-node ask timeout for a cert broadcast. A node that does not ack
/// within this window is reported as a partial failure (the command is not
/// failed wholesale). The same timeout is also applied to the local-node ask
/// that serves the list command.
/// </summary>
private static readonly TimeSpan CertBroadcastTimeout = TimeSpan.FromSeconds(5);
/// <summary>
/// T17 / D6: turns a trust command into the per-node write message and fans
/// it out to the <see cref="CertStoreActor"/> on every Up site node, so each
/// node's PKI store receives the certificate.
/// </summary>
/// <param name="command">The trust request; its DER payload and thumbprint are forwarded as-is.</param>
private void HandleTrustServerCert(TrustServerCertCommand command)
{
    var perNodeWrite = new WriteCertToLocalStore(command.DerBase64, command.Thumbprint);
    var description = $"trust cert {command.Thumbprint} (connection {command.ConnectionName})";

    BroadcastToSiteCertStores(perNodeWrite, description);
}
/// <summary>
/// T17 / D6: turns a remove command into the per-node remove message and fans
/// it out to the <see cref="CertStoreActor"/> on every Up site node, so the
/// certificate leaves each node's PKI store.
/// </summary>
/// <param name="command">The remove request; only its thumbprint is forwarded.</param>
private void HandleRemoveServerCert(RemoveServerCertCommand command)
{
    var perNodeRemove = new RemoveCertFromLocalStore(command.Thumbprint);
    var description = $"remove cert {command.Thumbprint}";

    BroadcastToSiteCertStores(perNodeRemove, description);
}
/// <summary>
/// T17: answers a list request from the LOCAL <see cref="CertStoreActor"/>
/// only (the node this singleton is running on). The local ack is mapped
/// one-to-one onto a <see cref="CertTrustResult"/>; if the ask does not
/// complete successfully, a failed result carrying the failure text is sent
/// instead. The reply is piped back to the original sender.
/// </summary>
/// <param name="command">The list request (carries no data).</param>
private void HandleListServerCerts(ListServerCertsCommand command)
{
    // Capture before going async: Sender is only valid while the message is being handled.
    var replyTo = Sender;
    var localStore = Context.ActorSelection($"/user/{CertStoreActor.WellKnownName}");
    var pending = localStore.Ask<LocalCertOpAck>(new ListLocalCerts(), CertBroadcastTimeout);

    pending.ContinueWith(done =>
    {
        if (!done.IsCompletedSuccessfully)
        {
            var failure = done.Exception?.GetBaseException().Message ?? "cert store did not respond";
            _logger.LogWarning("Local cert store list failed: {Error}", failure);
            return new CertTrustResult(false, failure, null);
        }

        var localAck = done.Result;
        return new CertTrustResult(localAck.Success, localAck.Error, localAck.Certs);
    }).PipeTo(replyTo);
}
/// <summary>
/// Fans <paramref name="localMessage"/> out to the <see cref="CertStoreActor"/>
/// on every Up site node, asks each with a short timeout, and aggregates the
/// acks into one <see cref="CertTrustResult"/>. A node that fails to ack is a
/// partial failure (Success=false, first error reported) — it never throws,
/// so a single unreachable standby cannot stall the singleton.
/// </summary>
/// <param name="localMessage">The per-node message sent unchanged to every targeted node.</param>
/// <param name="description">Human-readable description of the operation, used in log entries only.</param>
private void BroadcastToSiteCertStores(object localMessage, string description)
{
    // Capture before going async: Sender is only valid while the message is being handled.
    var sender = Sender;
    var cluster = Cluster.Get(Context.System);
    var targets = cluster.State.Members
        .Where(m => m.Status == MemberStatus.Up && m.HasRole(SiteClusterRole))
        .Select(m => m.Address)
        .ToList();

    if (targets.Count == 0)
    {
        // Worded neutrally: this path serves both trust and remove broadcasts.
        _logger.LogWarning("No Up site nodes found to {Description}; no PKI store was changed", description);
        sender.Tell(new CertTrustResult(false, "no site nodes available", null));
        return;
    }

    _logger.LogInformation("Broadcasting cert op to {Count} site node(s): {Description}",
        targets.Count, description);

    // ToArray forces the asks to be issued now, while the actor context is still valid.
    var asks = targets.Select(address =>
    {
        var path = new RootActorPath(address) / "user" / CertStoreActor.WellKnownName;
        return Context.ActorSelection(path)
            .Ask<LocalCertOpAck>(localMessage, CertBroadcastTimeout)
            .ContinueWith(t =>
            {
                if (t.IsCompletedSuccessfully)
                {
                    return t.Result;
                }

                // A cancelled task has no Exception; still give the operator a reason.
                var reason = t.Exception?.GetBaseException().Message
                             ?? "the request timed out or was cancelled";
                return new LocalCertOpAck(false, $"node {address} did not ack: {reason}", null);
            });
    }).ToArray();

    Task.WhenAll(asks).ContinueWith(t =>
    {
        // ContinueWith on WhenAll over per-task error-trapping completions
        // never faults, so t.Result is always populated.
        var acks = t.Result;
        var firstFailure = acks.FirstOrDefault(a => !a.Success);
        if (firstFailure is null)
        {
            return new CertTrustResult(true, null, null);
        }

        // Never return a failed result without an explanation.
        var firstError = firstFailure.Error ?? "a site node reported failure without an error message";
        _logger.LogWarning("Cert broadcast partial/total failure ({Description}): {Error}",
            description, firstError);
        return new CertTrustResult(false, firstError, null);
    }).PipeTo(sender);
}
// ── DCL connection management ──
/// <summary>
@@ -35,6 +35,13 @@
<ItemGroup>
<ProjectReference Include="../ZB.MOM.WW.ScadaBridge.Commons/ZB.MOM.WW.ScadaBridge.Commons.csproj" />
<ProjectReference Include="../ZB.MOM.WW.ScadaBridge.Communication/ZB.MOM.WW.ScadaBridge.Communication.csproj" />
<!--
T17 / D6: CertStoreActor resolves the OPC UA trusted-peer / rejected store
paths from OpcUaGlobalOptions (DataConnectionLayer) so a trusted server cert
lands in the SAME PKI store RealOpcUaClient validates against. DCL does not
reference SiteRuntime, so this edge introduces no cycle.
-->
<ProjectReference Include="../ZB.MOM.WW.ScadaBridge.DataConnectionLayer/ZB.MOM.WW.ScadaBridge.DataConnectionLayer.csproj" />
<ProjectReference Include="../ZB.MOM.WW.ScadaBridge.ScriptAnalysis/ZB.MOM.WW.ScadaBridge.ScriptAnalysis.csproj" />
<ProjectReference Include="../ZB.MOM.WW.ScadaBridge.HealthMonitoring/ZB.MOM.WW.ScadaBridge.HealthMonitoring.csproj" />
<ProjectReference Include="../ZB.MOM.WW.ScadaBridge.SiteEventLogging/ZB.MOM.WW.ScadaBridge.SiteEventLogging.csproj" />