fix(review): full code-review remediation — 5 High + Medium/Low across 16 modules

Remediation from the full per-module code review at 4307c381 (findings recorded
separately in code-reviews/).

Highs fixed:
- DeploymentManager-025/SiteRuntime-031: stop broadcasting notification lists + SMTP
  configs (incl. credentials) to sites; site purges already-persisted rows on apply
  (enforces the central-only delivery design; clears plaintext SMTP creds at rest).
- DataConnectionLayer-023: guard the native-alarm subscribe path against the
  mid-flight-unsubscribe adapter-feed leak (mirrors the DCL-021 tag-path fix).
- SiteEventLogging-024: normalize From/To query bounds to UTC (the -016 fix the
  audit trail claimed but never committed).
- KpiHistory-001: add an in-flight guard to the recorder sample tick.
- ScriptAnalysis-001: harden the trust analyzer's TPA-absent fallback (resolve
  forbidden anchors in the minimal reference set; warn on degraded mode) — anchors
  added to validation references only, never the compile gate.
(InboundAPI-026 left to the feat/ipsen-movein effort per owner decision.)

Medium/Low: DM-026 deterministic deploy-status tiebreaker; SR-027/028/029/030
native-alarm leak/phantom-active/delete-during-redeploy fixes; AL-013/014/016;
TE-024 (folder-mutation audit rows now persisted)/025; SF-025 gauge-provider
clear-on-stop; ESG-025/026; SEC-023/024/025; SCA-007/008/009; plus doc/test
accuracy COM-023/024, HOST-025/026, HM-024/025, NS-027/028.

Full-solution build 0 warnings; ~3560 tests across 18 touched suites green.
This commit is contained in:
Joseph Doherty
2026-06-20 17:55:12 -04:00
parent 4307c38117
commit fd618cf1dc
52 changed files with 2239 additions and 313 deletions
@@ -528,7 +528,28 @@ public class DeploymentManagerActor : ReceiveActor, IWithTimers
{
var instanceName = command.InstanceUniqueName;
if (_instanceActors.TryGetValue(instanceName, out var actor))
// SiteRuntime-029: a disable arriving mid-redeploy must cancel the buffered
// redeploy. Otherwise HandleTerminated re-creates the Instance Actor and
// re-stores its config with isEnabled: true when the predecessor terminates,
// silently reverting the operator's disable back to enabled. Mirror the
// last-write-wins handling in HandleDeploy/HandleDelete: drop the pending
// command (so HandleTerminated returns early), clear the shadow, and tell the
// displaced deployer it was superseded. The disable itself still persists
// is_enabled = false below, which becomes the durable state.
if (_terminatingActorsByName.TryGetValue(instanceName, out var terminatingRef))
{
if (_pendingRedeploys.Remove(terminatingRef, out var pending))
{
pending.OriginalSender.Tell(new DeploymentStatusResponse(
pending.Command.DeploymentId,
instanceName,
DeploymentStatus.Failed,
$"superseded by disable of {instanceName} before redeploy finished terminating",
DateTimeOffset.UtcNow));
}
_terminatingActorsByName.Remove(instanceName);
}
else if (_instanceActors.TryGetValue(instanceName, out var actor))
{
Context.Stop(actor);
_instanceActors.Remove(instanceName);
@@ -628,12 +649,45 @@ public class DeploymentManagerActor : ReceiveActor, IWithTimers
{
var instanceName = command.InstanceUniqueName;
if (_instanceActors.TryGetValue(instanceName, out var actor))
// SiteRuntime-029: a delete arriving while a redeploy is still terminating must
// be authoritative over the mid-redeploy bookkeeping. HandleDeploy already
// removed the instance from _instanceActors and buffered a PendingRedeploy
// keyed by the terminating ref. If we fall straight through to the
// _instanceActors miss + unconditional decrement, the buffered redeploy is
// left intact — so when Terminated fires, HandleTerminated calls
// ApplyDeployment(isRedeploy: true) and RESURRECTS the just-deleted instance,
// with the counter now inconsistent. Cancel the pending redeploy first.
var wasPresent = false;
if (_terminatingActorsByName.TryGetValue(instanceName, out var terminatingRef))
{
// Drop the buffered command so HandleTerminated's _pendingRedeploys.Remove
// misses and it returns early (no resurrection). Clear the shadow too.
if (_pendingRedeploys.Remove(terminatingRef, out var pending))
{
pending.OriginalSender.Tell(new DeploymentStatusResponse(
pending.Command.DeploymentId,
instanceName,
DeploymentStatus.Failed,
$"superseded by delete of {instanceName} before redeploy finished terminating",
DateTimeOffset.UtcNow));
}
_terminatingActorsByName.Remove(instanceName);
// The terminating predecessor is already being stopped by HandleDeploy;
// no Context.Stop needed here.
wasPresent = true;
}
else if (_instanceActors.TryGetValue(instanceName, out var actor))
{
Context.Stop(actor);
_instanceActors.Remove(instanceName);
wasPresent = true;
}
_totalDeployedCount = Math.Max(0, _totalDeployedCount - 1);
// SiteRuntime-029: only decrement when the instance was actually present
// (live in _instanceActors OR mid-redeploy in _terminatingActorsByName).
// A delete for a wholly-unknown instance must not drive the count negative.
if (wasPresent)
_totalDeployedCount = Math.Max(0, _totalDeployedCount - 1);
UpdateInstanceCounts();
var sender = Sender;
@@ -1379,14 +1433,15 @@ public class DeploymentManagerActor : ReceiveActor, IWithTimers
}
}
// WP-33: Store notification lists
if (command.NotificationLists != null)
{
foreach (var nl in command.NotificationLists)
{
await _storage.StoreNotificationListAsync(nl.Name, nl.RecipientEmails);
}
}
// DeploymentManager-025 / SiteRuntime-031: notification lists and SMTP
// configuration are central-only — sites store-and-forward notifications
// to central and never deliver over SMTP. Central no longer ships these
// (the DeployArtifactsCommand fields stay for additive compatibility but
// are always null), so the site neither persists them nor reads them.
// Purge any rows a prior (pre-fix) build may have written — including the
// plaintext SMTP password — so existing exposure is cleared, not just
// future writes. Purge is idempotent and runs on every artifact apply.
await _storage.PurgeCentralOnlyNotificationConfigAsync();
// Store data connection definitions (OPC UA endpoints, etc.)
if (command.DataConnections != null)
@@ -1413,16 +1468,8 @@ public class DeploymentManagerActor : ReceiveActor, IWithTimers
self.Tell(new ApplyArtifactDataConnectionsToDcl(command.DataConnections));
}
// Store SMTP configurations
if (command.SmtpConfigurations != null)
{
foreach (var smtp in command.SmtpConfigurations)
{
await _storage.StoreSmtpConfigurationAsync(
smtp.Name, smtp.Server, smtp.Port, smtp.AuthMode,
smtp.FromAddress, smtp.Username, smtp.Password, smtp.OAuthConfig);
}
}
// DeploymentManager-025 / SiteRuntime-031: SMTP configuration is
// central-only and is never stored on a site (see the purge above).
// Replicate artifacts to standby node
_replicationActor?.Tell(new ReplicateArtifacts(command));
@@ -12,6 +12,7 @@ using ZB.MOM.WW.ScadaBridge.Commons.Types.Enums;
using ZB.MOM.WW.ScadaBridge.Commons.Types.Flattening;
using ZB.MOM.WW.ScadaBridge.HealthMonitoring;
using ZB.MOM.WW.ScadaBridge.SiteEventLogging;
using ZB.MOM.WW.ScadaBridge.SiteRuntime.Messages;
using ZB.MOM.WW.ScadaBridge.SiteRuntime.Persistence;
using ZB.MOM.WW.ScadaBridge.SiteRuntime.Scripts;
using ZB.MOM.WW.ScadaBridge.SiteRuntime.Streaming;
@@ -209,6 +210,12 @@ public class InstanceActor : ReceiveActor
// WP-16: Handle alarm state changes from Alarm Actors (Tell pattern)
Receive<AlarmStateChanged>(HandleAlarmStateChanged);
// SiteRuntime-027: a NativeAlarmActor tells us when one of its native
// conditions has left the mirror for good (snapshot-swap removal, retention
// drop, or cap eviction) so we can evict the stale _latestAlarmEvents key
// and not leak per-instance memory / bloat DebugView snapshots.
Receive<NativeAlarmDropped>(HandleNativeAlarmDropped);
// WP-25: Debug view subscribe/unsubscribe (Ask pattern for snapshot)
Receive<SubscribeDebugViewRequest>(HandleSubscribeDebugView);
Receive<UnsubscribeDebugViewRequest>(HandleUnsubscribeDebugView);
@@ -1016,6 +1023,25 @@ public class InstanceActor : ReceiveActor
_streamManager?.PublishAlarmStateChanged(changed);
}
/// <summary>
/// SiteRuntime-027: evicts a native condition's key from the alarm-state maps once
/// the owning <see cref="NativeAlarmActor"/> has dropped it from its mirror (after
/// emitting the condition's final return-to-normal). Without this the
/// <c>_latestAlarmEvents</c> map grows without bound on a source that mints a fresh
/// <c>SourceReference</c> per occurrence (one permanently-retained Normal entry per
/// distinct condition the instance has ever seen), leaking per-instance memory and
/// bloating every DebugView snapshot.
///
/// Native-only by construction: the key is a native condition's <c>SourceReference</c>.
/// Computed-alarm keys (configuration-bounded) are never sent here and never removed.
/// </summary>
private void HandleNativeAlarmDropped(NativeAlarmDropped dropped)
{
_latestAlarmEvents.Remove(dropped.SourceReference);
_alarmStates.Remove(dropped.SourceReference);
_alarmTimestamps.Remove(dropped.SourceReference);
}
/// <summary>
/// WP-25: Debug view subscribe — returns snapshot and begins streaming.
/// </summary>
@@ -8,6 +8,7 @@ using ZB.MOM.WW.ScadaBridge.Commons.Types.Alarms;
using ZB.MOM.WW.ScadaBridge.Commons.Types.Enums;
using ZB.MOM.WW.ScadaBridge.Commons.Types.Flattening;
using ZB.MOM.WW.ScadaBridge.SiteEventLogging;
using ZB.MOM.WW.ScadaBridge.SiteRuntime.Messages;
using ZB.MOM.WW.ScadaBridge.SiteRuntime.Persistence;
namespace ZB.MOM.WW.ScadaBridge.SiteRuntime.Actors;
@@ -204,6 +205,10 @@ public class NativeAlarmActor : ReceiveActor
{
Emit(prior, prior.Condition with { Active = false });
PersistDelete(sourceRef);
// SiteRuntime-027: this condition is gone for good — tell the parent
// to evict its _latestAlarmEvents key so it does not retain a stale
// (Normal) entry forever.
NotifyParentDropped(sourceRef);
}
}
@@ -244,6 +249,9 @@ public class NativeAlarmActor : ReceiveActor
{
_alarms.Remove(t.SourceReference);
PersistDelete(t.SourceReference);
// SiteRuntime-027: evict the parent's _latestAlarmEvents key for the
// now-resolved condition so it does not leak.
NotifyParentDropped(t.SourceReference);
}
EnforceCap();
@@ -289,19 +297,48 @@ public class NativeAlarmActor : ReceiveActor
var overflow = _alarms.Values
.OrderBy(a => a.TransitionTime)
.Take(_alarms.Count - cap)
.Select(a => a.SourceReference)
.ToList();
foreach (var sourceRef in overflow)
foreach (var evicted in overflow)
{
var sourceRef = evicted.SourceReference;
// SiteRuntime-028: the sibling drop paths (ApplySnapshotSwap, the
// ApplyLiveTransition retention drop) always emit a return-to-normal
// before the condition leaves the mirror. EnforceCap previously dropped
// a condition whose last-emitted state could still be Active, with no
// compensating emit — so the Instance Actor (and central's stream / the
// operator Alarm Summary) kept showing it Active forever, a phantom
// stuck alarm the mirror could never clear. Emit the return-to-normal
// for any still-active evicted condition (mirroring ApplySnapshotSwap)
// before removing it.
if (evicted.Condition.Active)
{
Emit(evicted, evicted.Condition with { Active = false });
}
_alarms.Remove(sourceRef);
PersistDelete(sourceRef);
// SiteRuntime-027: this condition is gone for good — evict the parent's
// _latestAlarmEvents key so it does not retain a stale entry.
NotifyParentDropped(sourceRef);
_logger.LogWarning(
"Native alarm cap {Cap} exceeded for {Source} on {Instance}; dropped oldest mirrored alarm {Ref}",
cap, _source.CanonicalName, _instanceName, sourceRef);
}
}
/// <summary>
/// SiteRuntime-027: signals the parent Instance Actor that a native condition has
/// left the mirror for good so it can evict the matching <c>_latestAlarmEvents</c>
/// key. Always sent AFTER the condition's final return-to-normal
/// <see cref="AlarmStateChanged"/> emit, so the stream still sees the clear.
/// </summary>
private void NotifyParentDropped(string sourceReference)
{
_instanceActor.Tell(new NativeAlarmDropped(sourceReference));
}
/// <summary>
/// Builds and tells the parent an enriched <see cref="AlarmStateChanged"/> for a condition.
/// </summary>
@@ -189,18 +189,16 @@ public class SiteReplicationActor : ReceiveActor
foreach (var db in command.DatabaseConnections)
await _storage.StoreDatabaseConnectionAsync(db.Name, db.ConnectionString, db.MaxRetries, db.RetryDelay);
if (command.NotificationLists != null)
foreach (var nl in command.NotificationLists)
await _storage.StoreNotificationListAsync(nl.Name, nl.RecipientEmails);
// DeploymentManager-025 / SiteRuntime-031: notification lists and SMTP
// configuration are central-only and are never persisted on a site.
// Mirror the primary apply path: purge any pre-fix rows (including the
// plaintext SMTP password) instead of writing the command's
// (now-always-null) NotificationLists/SmtpConfigurations.
await _storage.PurgeCentralOnlyNotificationConfigAsync();
if (command.DataConnections != null)
foreach (var dc in command.DataConnections)
await _storage.StoreDataConnectionDefinitionAsync(dc.Name, dc.Protocol, dc.PrimaryConfigurationJson, dc.BackupConfigurationJson, dc.FailoverRetryCount);
if (command.SmtpConfigurations != null)
foreach (var smtp in command.SmtpConfigurations)
await _storage.StoreSmtpConfigurationAsync(smtp.Name, smtp.Server, smtp.Port, smtp.AuthMode,
smtp.FromAddress, smtp.Username, smtp.Password, smtp.OAuthConfig);
}
catch (Exception ex)
{
@@ -0,0 +1,23 @@
namespace ZB.MOM.WW.ScadaBridge.SiteRuntime.Messages;
/// <summary>
/// SiteRuntime-027: terminal-drop signal sent from a <c>NativeAlarmActor</c> to its
/// parent <c>InstanceActor</c> when a native condition leaves the mirror for good —
/// the snapshot-swap removal, the live-transition retention drop
/// (<c>inactive &amp;&amp; acknowledged</c>), and the cap eviction. The parent removes the
/// condition's key (<see cref="SourceReference"/>) from its <c>_latestAlarmEvents</c>
/// map so the per-instance map and every DebugView snapshot do not accumulate one
/// permanently-retained (Normal) entry per distinct native condition the instance has
/// ever seen.
///
/// The actor still emits the condition's return-to-normal <c>AlarmStateChanged</c>
/// (so central/UI see it clear) immediately BEFORE this drop signal; only the
/// stale-key retention in <c>_latestAlarmEvents</c> is what this evicts. Computed-alarm
/// keys are configuration-bounded and are never dropped this way.
/// </summary>
/// <param name="SourceReference">
/// The native condition's source reference — the same value used as the
/// <c>AlarmStateChanged.AlarmName</c> key for native alarms, so the parent can remove
/// the matching <c>_latestAlarmEvents</c> entry.
/// </param>
public sealed record NativeAlarmDropped(string SourceReference);
@@ -623,6 +623,30 @@ public class SiteStorageService
// ── WP-33: Notification List CRUD ──
/// <summary>
/// DeploymentManager-025 / SiteRuntime-031: notification delivery is central-only.
/// Sites store-and-forward notifications to the central cluster and never deliver
/// over SMTP, so notification lists and SMTP configuration must never live on a
/// site. This purges every row from the site-local <c>notification_lists</c> and
/// <c>smtp_configurations</c> tables, clearing any rows a prior (now-corrected)
/// build may have shipped — most importantly the plaintext SMTP password. It is
/// idempotent and is invoked on every artifact apply / deploy so existing exposure
/// is cleared, not just future writes. The tables themselves are retained (the
/// schema is harmless once empty); only their contents are removed.
/// </summary>
/// <returns>A task that completes when both tables have been emptied.</returns>
public async Task PurgeCentralOnlyNotificationConfigAsync()
{
await using var connection = new SqliteConnection(_connectionString);
await connection.OpenAsync();
await using var command = connection.CreateCommand();
command.CommandText = @"
DELETE FROM notification_lists;
DELETE FROM smtp_configurations;";
await command.ExecuteNonQueryAsync();
}
/// <summary>
/// Stores or updates a notification list.
/// </summary>