fix(review): full code-review remediation — 5 High + Medium/Low across 16 modules
Remediation from the full per-module code review at 4307c381 (findings recorded
separately in code-reviews/).
Highs fixed:
- DeploymentManager-025/SiteRuntime-031: stop broadcasting notification lists + SMTP
configs (incl. credentials) to sites; site purges already-persisted rows on apply
(enforces the central-only delivery design; clears plaintext SMTP creds at rest).
- DataConnectionLayer-023: guard the native-alarm subscribe path against the
mid-flight-unsubscribe adapter-feed leak (mirrors the DCL-021 tag-path fix).
- SiteEventLogging-024: normalize From/To query bounds to UTC (the -016 fix the
audit trail claimed but never committed).
- KpiHistory-001: add an in-flight guard to the recorder sample tick.
- ScriptAnalysis-001: harden the trust analyzer's TPA-absent fallback (resolve
forbidden anchors in the minimal reference set; warn on degraded mode) — anchors
added to validation references only, never the compile gate.
(InboundAPI-026 left to the feat/ipsen-movein effort per owner decision.)
Medium/Low: DM-026 deterministic deploy-status tiebreaker; SR-027/028/029/030
native-alarm leak/phantom-active/delete-during-redeploy fixes; AL-013/014/016;
TE-024 (folder-mutation audit rows now persisted)/025; SF-025 gauge-provider
clear-on-stop; ESG-025/026; SEC-023/024/025; SCA-007/008/009; plus doc/test
accuracy COM-023/024, HOST-025/026, HM-024/025, NS-027/028.
Full-solution build 0 warnings; ~3560 tests across 18 touched suites green.
This commit is contained in:
@@ -528,7 +528,28 @@ public class DeploymentManagerActor : ReceiveActor, IWithTimers
|
||||
{
|
||||
var instanceName = command.InstanceUniqueName;
|
||||
|
||||
if (_instanceActors.TryGetValue(instanceName, out var actor))
|
||||
// SiteRuntime-029: a disable arriving mid-redeploy must cancel the buffered
|
||||
// redeploy. Otherwise HandleTerminated re-creates the Instance Actor and
|
||||
// re-stores its config with isEnabled: true when the predecessor terminates,
|
||||
// silently reverting the operator's disable back to enabled. Mirror the
|
||||
// last-write-wins handling in HandleDeploy/HandleDelete: drop the pending
|
||||
// command (so HandleTerminated returns early), clear the shadow, and tell the
|
||||
// displaced deployer it was superseded. The disable itself still persists
|
||||
// is_enabled = false below, which becomes the durable state.
|
||||
if (_terminatingActorsByName.TryGetValue(instanceName, out var terminatingRef))
|
||||
{
|
||||
if (_pendingRedeploys.Remove(terminatingRef, out var pending))
|
||||
{
|
||||
pending.OriginalSender.Tell(new DeploymentStatusResponse(
|
||||
pending.Command.DeploymentId,
|
||||
instanceName,
|
||||
DeploymentStatus.Failed,
|
||||
$"superseded by disable of {instanceName} before redeploy finished terminating",
|
||||
DateTimeOffset.UtcNow));
|
||||
}
|
||||
_terminatingActorsByName.Remove(instanceName);
|
||||
}
|
||||
else if (_instanceActors.TryGetValue(instanceName, out var actor))
|
||||
{
|
||||
Context.Stop(actor);
|
||||
_instanceActors.Remove(instanceName);
|
||||
@@ -628,12 +649,45 @@ public class DeploymentManagerActor : ReceiveActor, IWithTimers
|
||||
{
|
||||
var instanceName = command.InstanceUniqueName;
|
||||
|
||||
if (_instanceActors.TryGetValue(instanceName, out var actor))
|
||||
// SiteRuntime-029: a delete arriving while a redeploy is still terminating must
|
||||
// be authoritative over the mid-redeploy bookkeeping. HandleDeploy already
|
||||
// removed the instance from _instanceActors and buffered a PendingRedeploy
|
||||
// keyed by the terminating ref. If we fall straight through to the
|
||||
// _instanceActors miss + unconditional decrement, the buffered redeploy is
|
||||
// left intact — so when Terminated fires, HandleTerminated calls
|
||||
// ApplyDeployment(isRedeploy: true) and RESURRECTS the just-deleted instance,
|
||||
// with the counter now inconsistent. Cancel the pending redeploy first.
|
||||
var wasPresent = false;
|
||||
if (_terminatingActorsByName.TryGetValue(instanceName, out var terminatingRef))
|
||||
{
|
||||
// Drop the buffered command so HandleTerminated's _pendingRedeploys.Remove
|
||||
// misses and it returns early (no resurrection). Clear the shadow too.
|
||||
if (_pendingRedeploys.Remove(terminatingRef, out var pending))
|
||||
{
|
||||
pending.OriginalSender.Tell(new DeploymentStatusResponse(
|
||||
pending.Command.DeploymentId,
|
||||
instanceName,
|
||||
DeploymentStatus.Failed,
|
||||
$"superseded by delete of {instanceName} before redeploy finished terminating",
|
||||
DateTimeOffset.UtcNow));
|
||||
}
|
||||
_terminatingActorsByName.Remove(instanceName);
|
||||
// The terminating predecessor is already being stopped by HandleDeploy;
|
||||
// no Context.Stop needed here.
|
||||
wasPresent = true;
|
||||
}
|
||||
else if (_instanceActors.TryGetValue(instanceName, out var actor))
|
||||
{
|
||||
Context.Stop(actor);
|
||||
_instanceActors.Remove(instanceName);
|
||||
wasPresent = true;
|
||||
}
|
||||
_totalDeployedCount = Math.Max(0, _totalDeployedCount - 1);
|
||||
|
||||
// SiteRuntime-029: only decrement when the instance was actually present
|
||||
// (live in _instanceActors OR mid-redeploy in _terminatingActorsByName).
|
||||
// A delete for a wholly-unknown instance must not drive the count negative.
|
||||
if (wasPresent)
|
||||
_totalDeployedCount = Math.Max(0, _totalDeployedCount - 1);
|
||||
UpdateInstanceCounts();
|
||||
|
||||
var sender = Sender;
|
||||
@@ -1379,14 +1433,15 @@ public class DeploymentManagerActor : ReceiveActor, IWithTimers
|
||||
}
|
||||
}
|
||||
|
||||
// WP-33: Store notification lists
|
||||
if (command.NotificationLists != null)
|
||||
{
|
||||
foreach (var nl in command.NotificationLists)
|
||||
{
|
||||
await _storage.StoreNotificationListAsync(nl.Name, nl.RecipientEmails);
|
||||
}
|
||||
}
|
||||
// DeploymentManager-025 / SiteRuntime-031: notification lists and SMTP
|
||||
// configuration are central-only — sites store-and-forward notifications
|
||||
// to central and never deliver over SMTP. Central no longer ships these
|
||||
// (the DeployArtifactsCommand fields stay for additive compatibility but
|
||||
// are always null), so the site neither persists them nor reads them.
|
||||
// Purge any rows a prior (pre-fix) build may have written — including the
|
||||
// plaintext SMTP password — so existing exposure is cleared, not just
|
||||
// future writes. Purge is idempotent and runs on every artifact apply.
|
||||
await _storage.PurgeCentralOnlyNotificationConfigAsync();
|
||||
|
||||
// Store data connection definitions (OPC UA endpoints, etc.)
|
||||
if (command.DataConnections != null)
|
||||
@@ -1413,16 +1468,8 @@ public class DeploymentManagerActor : ReceiveActor, IWithTimers
|
||||
self.Tell(new ApplyArtifactDataConnectionsToDcl(command.DataConnections));
|
||||
}
|
||||
|
||||
// Store SMTP configurations
|
||||
if (command.SmtpConfigurations != null)
|
||||
{
|
||||
foreach (var smtp in command.SmtpConfigurations)
|
||||
{
|
||||
await _storage.StoreSmtpConfigurationAsync(
|
||||
smtp.Name, smtp.Server, smtp.Port, smtp.AuthMode,
|
||||
smtp.FromAddress, smtp.Username, smtp.Password, smtp.OAuthConfig);
|
||||
}
|
||||
}
|
||||
// DeploymentManager-025 / SiteRuntime-031: SMTP configuration is
|
||||
// central-only and is never stored on a site (see the purge above).
|
||||
|
||||
// Replicate artifacts to standby node
|
||||
_replicationActor?.Tell(new ReplicateArtifacts(command));
|
||||
|
||||
@@ -12,6 +12,7 @@ using ZB.MOM.WW.ScadaBridge.Commons.Types.Enums;
|
||||
using ZB.MOM.WW.ScadaBridge.Commons.Types.Flattening;
|
||||
using ZB.MOM.WW.ScadaBridge.HealthMonitoring;
|
||||
using ZB.MOM.WW.ScadaBridge.SiteEventLogging;
|
||||
using ZB.MOM.WW.ScadaBridge.SiteRuntime.Messages;
|
||||
using ZB.MOM.WW.ScadaBridge.SiteRuntime.Persistence;
|
||||
using ZB.MOM.WW.ScadaBridge.SiteRuntime.Scripts;
|
||||
using ZB.MOM.WW.ScadaBridge.SiteRuntime.Streaming;
|
||||
@@ -209,6 +210,12 @@ public class InstanceActor : ReceiveActor
|
||||
// WP-16: Handle alarm state changes from Alarm Actors (Tell pattern)
|
||||
Receive<AlarmStateChanged>(HandleAlarmStateChanged);
|
||||
|
||||
// SiteRuntime-027: a NativeAlarmActor tells us when one of its native
|
||||
// conditions has left the mirror for good (snapshot-swap removal, retention
|
||||
// drop, or cap eviction) so we can evict the stale _latestAlarmEvents key
|
||||
// and not leak per-instance memory / bloat DebugView snapshots.
|
||||
Receive<NativeAlarmDropped>(HandleNativeAlarmDropped);
|
||||
|
||||
// WP-25: Debug view subscribe/unsubscribe (Ask pattern for snapshot)
|
||||
Receive<SubscribeDebugViewRequest>(HandleSubscribeDebugView);
|
||||
Receive<UnsubscribeDebugViewRequest>(HandleUnsubscribeDebugView);
|
||||
@@ -1016,6 +1023,25 @@ public class InstanceActor : ReceiveActor
|
||||
_streamManager?.PublishAlarmStateChanged(changed);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// SiteRuntime-027: evicts a native condition's key from the alarm-state maps once
|
||||
/// the owning <see cref="NativeAlarmActor"/> has dropped it from its mirror (after
|
||||
/// emitting the condition's final return-to-normal). Without this the
|
||||
/// <c>_latestAlarmEvents</c> map grows without bound on a source that mints a fresh
|
||||
/// <c>SourceReference</c> per occurrence (one permanently-retained Normal entry per
|
||||
/// distinct condition the instance has ever seen), leaking per-instance memory and
|
||||
/// bloating every DebugView snapshot.
|
||||
///
|
||||
/// Native-only by construction: the key is a native condition's <c>SourceReference</c>.
|
||||
/// Computed-alarm keys (configuration-bounded) are never sent here and never removed.
|
||||
/// </summary>
|
||||
private void HandleNativeAlarmDropped(NativeAlarmDropped dropped)
|
||||
{
|
||||
_latestAlarmEvents.Remove(dropped.SourceReference);
|
||||
_alarmStates.Remove(dropped.SourceReference);
|
||||
_alarmTimestamps.Remove(dropped.SourceReference);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// WP-25: Debug view subscribe — returns snapshot and begins streaming.
|
||||
/// </summary>
|
||||
|
||||
@@ -8,6 +8,7 @@ using ZB.MOM.WW.ScadaBridge.Commons.Types.Alarms;
|
||||
using ZB.MOM.WW.ScadaBridge.Commons.Types.Enums;
|
||||
using ZB.MOM.WW.ScadaBridge.Commons.Types.Flattening;
|
||||
using ZB.MOM.WW.ScadaBridge.SiteEventLogging;
|
||||
using ZB.MOM.WW.ScadaBridge.SiteRuntime.Messages;
|
||||
using ZB.MOM.WW.ScadaBridge.SiteRuntime.Persistence;
|
||||
|
||||
namespace ZB.MOM.WW.ScadaBridge.SiteRuntime.Actors;
|
||||
@@ -204,6 +205,10 @@ public class NativeAlarmActor : ReceiveActor
|
||||
{
|
||||
Emit(prior, prior.Condition with { Active = false });
|
||||
PersistDelete(sourceRef);
|
||||
// SiteRuntime-027: this condition is gone for good — tell the parent
|
||||
// to evict its _latestAlarmEvents key so it does not retain a stale
|
||||
// (Normal) entry forever.
|
||||
NotifyParentDropped(sourceRef);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -244,6 +249,9 @@ public class NativeAlarmActor : ReceiveActor
|
||||
{
|
||||
_alarms.Remove(t.SourceReference);
|
||||
PersistDelete(t.SourceReference);
|
||||
// SiteRuntime-027: evict the parent's _latestAlarmEvents key for the
|
||||
// now-resolved condition so it does not leak.
|
||||
NotifyParentDropped(t.SourceReference);
|
||||
}
|
||||
|
||||
EnforceCap();
|
||||
@@ -289,19 +297,48 @@ public class NativeAlarmActor : ReceiveActor
|
||||
var overflow = _alarms.Values
|
||||
.OrderBy(a => a.TransitionTime)
|
||||
.Take(_alarms.Count - cap)
|
||||
.Select(a => a.SourceReference)
|
||||
.ToList();
|
||||
|
||||
foreach (var sourceRef in overflow)
|
||||
foreach (var evicted in overflow)
|
||||
{
|
||||
var sourceRef = evicted.SourceReference;
|
||||
|
||||
// SiteRuntime-028: the sibling drop paths (ApplySnapshotSwap, the
|
||||
// ApplyLiveTransition retention drop) always emit a return-to-normal
|
||||
// before the condition leaves the mirror. EnforceCap previously dropped
|
||||
// a condition whose last-emitted state could still be Active, with no
|
||||
// compensating emit — so the Instance Actor (and central's stream / the
|
||||
// operator Alarm Summary) kept showing it Active forever, a phantom
|
||||
// stuck alarm the mirror could never clear. Emit the return-to-normal
|
||||
// for any still-active evicted condition (mirroring ApplySnapshotSwap)
|
||||
// before removing it.
|
||||
if (evicted.Condition.Active)
|
||||
{
|
||||
Emit(evicted, evicted.Condition with { Active = false });
|
||||
}
|
||||
|
||||
_alarms.Remove(sourceRef);
|
||||
PersistDelete(sourceRef);
|
||||
// SiteRuntime-027: this condition is gone for good — evict the parent's
|
||||
// _latestAlarmEvents key so it does not retain a stale entry.
|
||||
NotifyParentDropped(sourceRef);
|
||||
_logger.LogWarning(
|
||||
"Native alarm cap {Cap} exceeded for {Source} on {Instance}; dropped oldest mirrored alarm {Ref}",
|
||||
cap, _source.CanonicalName, _instanceName, sourceRef);
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// SiteRuntime-027: signals the parent Instance Actor that a native condition has
|
||||
/// left the mirror for good so it can evict the matching <c>_latestAlarmEvents</c>
|
||||
/// key. Always sent AFTER the condition's final return-to-normal
|
||||
/// <see cref="AlarmStateChanged"/> emit, so the stream still sees the clear.
|
||||
/// </summary>
|
||||
private void NotifyParentDropped(string sourceReference)
|
||||
{
|
||||
_instanceActor.Tell(new NativeAlarmDropped(sourceReference));
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Builds and tells the parent an enriched <see cref="AlarmStateChanged"/> for a condition.
|
||||
/// </summary>
|
||||
|
||||
@@ -189,18 +189,16 @@ public class SiteReplicationActor : ReceiveActor
|
||||
foreach (var db in command.DatabaseConnections)
|
||||
await _storage.StoreDatabaseConnectionAsync(db.Name, db.ConnectionString, db.MaxRetries, db.RetryDelay);
|
||||
|
||||
if (command.NotificationLists != null)
|
||||
foreach (var nl in command.NotificationLists)
|
||||
await _storage.StoreNotificationListAsync(nl.Name, nl.RecipientEmails);
|
||||
// DeploymentManager-025 / SiteRuntime-031: notification lists and SMTP
|
||||
// configuration are central-only and are never persisted on a site.
|
||||
// Mirror the primary apply path: purge any pre-fix rows (including the
|
||||
// plaintext SMTP password) instead of writing the command's
|
||||
// (now-always-null) NotificationLists/SmtpConfigurations.
|
||||
await _storage.PurgeCentralOnlyNotificationConfigAsync();
|
||||
|
||||
if (command.DataConnections != null)
|
||||
foreach (var dc in command.DataConnections)
|
||||
await _storage.StoreDataConnectionDefinitionAsync(dc.Name, dc.Protocol, dc.PrimaryConfigurationJson, dc.BackupConfigurationJson, dc.FailoverRetryCount);
|
||||
|
||||
if (command.SmtpConfigurations != null)
|
||||
foreach (var smtp in command.SmtpConfigurations)
|
||||
await _storage.StoreSmtpConfigurationAsync(smtp.Name, smtp.Server, smtp.Port, smtp.AuthMode,
|
||||
smtp.FromAddress, smtp.Username, smtp.Password, smtp.OAuthConfig);
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
|
||||
@@ -0,0 +1,23 @@
|
||||
namespace ZB.MOM.WW.ScadaBridge.SiteRuntime.Messages;
|
||||
|
||||
/// <summary>
|
||||
/// SiteRuntime-027: terminal-drop signal sent from a <c>NativeAlarmActor</c> to its
|
||||
/// parent <c>InstanceActor</c> when a native condition leaves the mirror for good —
|
||||
/// the snapshot-swap removal, the live-transition retention drop
|
||||
/// (<c>inactive && acknowledged</c>), and the cap eviction. The parent removes the
|
||||
/// condition's key (<see cref="SourceReference"/>) from its <c>_latestAlarmEvents</c>
|
||||
/// map so the per-instance map and every DebugView snapshot do not accumulate one
|
||||
/// permanently-retained (Normal) entry per distinct native condition the instance has
|
||||
/// ever seen.
|
||||
///
|
||||
/// The actor still emits the condition's return-to-normal <c>AlarmStateChanged</c>
|
||||
/// (so central/UI see it clear) immediately BEFORE this drop signal; only the
|
||||
/// stale-key retention in <c>_latestAlarmEvents</c> is what this evicts. Computed-alarm
|
||||
/// keys are configuration-bounded and are never dropped this way.
|
||||
/// </summary>
|
||||
/// <param name="SourceReference">
|
||||
/// The native condition's source reference — the same value used as the
|
||||
/// <c>AlarmStateChanged.AlarmName</c> key for native alarms, so the parent can remove
|
||||
/// the matching <c>_latestAlarmEvents</c> entry.
|
||||
/// </param>
|
||||
public sealed record NativeAlarmDropped(string SourceReference);
|
||||
@@ -623,6 +623,30 @@ public class SiteStorageService
|
||||
|
||||
// ── WP-33: Notification List CRUD ──
|
||||
|
||||
/// <summary>
|
||||
/// DeploymentManager-025 / SiteRuntime-031: notification delivery is central-only.
|
||||
/// Sites store-and-forward notifications to the central cluster and never deliver
|
||||
/// over SMTP, so notification lists and SMTP configuration must never live on a
|
||||
/// site. This purges every row from the site-local <c>notification_lists</c> and
|
||||
/// <c>smtp_configurations</c> tables, clearing any rows a prior (now-corrected)
|
||||
/// build may have shipped — most importantly the plaintext SMTP password. It is
|
||||
/// idempotent and is invoked on every artifact apply / deploy so existing exposure
|
||||
/// is cleared, not just future writes. The tables themselves are retained (the
|
||||
/// schema is harmless once empty); only their contents are removed.
|
||||
/// </summary>
|
||||
/// <returns>A task that completes when both tables have been emptied.</returns>
|
||||
public async Task PurgeCentralOnlyNotificationConfigAsync()
|
||||
{
|
||||
await using var connection = new SqliteConnection(_connectionString);
|
||||
await connection.OpenAsync();
|
||||
|
||||
await using var command = connection.CreateCommand();
|
||||
command.CommandText = @"
|
||||
DELETE FROM notification_lists;
|
||||
DELETE FROM smtp_configurations;";
|
||||
await command.ExecuteNonQueryAsync();
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Stores or updates a notification list.
|
||||
/// </summary>
|
||||
|
||||
Reference in New Issue
Block a user