fix(review): full code-review remediation — 5 High + Medium/Low across 16 modules
Remediation from the full per-module code review at 4307c381 (findings recorded
separately in code-reviews/).
Highs fixed:
- DeploymentManager-025/SiteRuntime-031: stop broadcasting notification lists + SMTP
configs (incl. credentials) to sites; site purges already-persisted rows on apply
(enforces the central-only delivery design; clears plaintext SMTP creds at rest).
- DataConnectionLayer-023: guard the native-alarm subscribe path against the
mid-flight-unsubscribe adapter-feed leak (mirrors the DCL-021 tag-path fix).
- SiteEventLogging-024: normalize From/To query bounds to UTC (the SiteEventLogging-016
fix that the audit trail claimed was applied but that was never actually committed).
- KpiHistory-001: add an in-flight guard to the recorder sample tick.
- ScriptAnalysis-001: harden the trust analyzer's TPA-absent fallback (resolve
forbidden anchors in the minimal reference set; warn on degraded mode) — anchors
added to validation references only, never the compile gate.
(InboundAPI-026 left to the feat/ipsen-movein effort per owner decision.)
Medium/Low: DM-026 deterministic deploy-status tiebreaker; SR-027/028/029/030
native-alarm leak/phantom-active/delete-during-redeploy fixes; AL-013/014/016;
TE-024 (folder-mutation audit rows now persisted)/025; SF-025 gauge-provider
clear-on-stop; ESG-025/026; SEC-023/024/025; SCA-007/008/009; plus doc/test
accuracy COM-023/024, HOST-025/026, HM-024/025, NS-027/028.
Full-solution build: 0 warnings; ~3560 tests across the 18 touched suites are green.
This commit is contained in:
@@ -32,10 +32,17 @@ namespace ZB.MOM.WW.ScadaBridge.SiteCallAudit;
|
||||
/// the daily terminal-row purge scheduler (Piece B —
|
||||
/// <see cref="OnPurgeTickAsync"/>, which invokes
|
||||
/// <see cref="ISiteCallAuditRepository.PurgeTerminalAsync"/> on a timer). Both
|
||||
/// background timers are started in <see cref="PreStart"/> and gate on the
|
||||
/// reconciliation collaborators (<see cref="IPullSiteCallsClient"/> +
|
||||
/// <see cref="ISiteEnumerator"/>) being available — the repo-only test ctor
|
||||
/// injects neither, so neither timer runs there.
|
||||
/// background timers are started in <see cref="PreStart"/>, but on independent
|
||||
/// preconditions (SiteCallAudit-007). The purge timer is armed whenever
|
||||
/// background timers are enabled (it needs only the repository, which every
|
||||
/// production / reconciliation ctor always has) — it is NOT gated on the
|
||||
/// reconciliation collaborators, so a host that registers Site Call Audit
|
||||
/// without the reconciliation client still purges and the central
|
||||
/// <c>SiteCalls</c> table cannot grow unbounded. The reconciliation timer
|
||||
/// additionally requires its collaborators (<see cref="IPullSiteCallsClient"/> +
|
||||
/// <see cref="ISiteEnumerator"/>) and logs a Warning when it cannot arm. The
|
||||
/// repo-only MSSQL test ctor disables both timers (background timers off) so the
|
||||
/// read/upsert tests see no scheduled side effects.
|
||||
/// </para>
|
||||
/// <para>
|
||||
/// Per CLAUDE.md "audit-write failure NEVER aborts the user-facing action" —
|
||||
@@ -68,6 +75,16 @@ public class SiteCallAuditActor : ReceiveActor
|
||||
/// <summary>Maximum page size honoured by a <see cref="SiteCallQueryRequest"/>.</summary>
|
||||
private const int MaxPageSize = 200;
|
||||
|
||||
/// <summary>
|
||||
/// SiteCallAudit-009: hard ceiling on the number of <c>PullSiteCalls</c> RPCs
|
||||
/// issued for a single site within ONE reconciliation tick when the site keeps
|
||||
/// reporting <see cref="PullSiteCallsResponse.MoreAvailable"/>. Bounds the
|
||||
/// within-tick continuation drain so a misbehaving site (or a pathological
|
||||
/// single-timestamp saturation that pins the cursor) can never spin the
|
||||
/// dispatcher unbounded; the remaining backlog drains on the next tick.
|
||||
/// </summary>
|
||||
private const int MaxReconciliationPagesPerTick = 50;
|
||||
|
||||
private readonly IServiceProvider? _serviceProvider;
|
||||
private readonly ISiteCallAuditRepository? _injectedRepository;
|
||||
private readonly SiteCallAuditOptions _options;
|
||||
@@ -81,12 +98,25 @@ public class SiteCallAuditActor : ReceiveActor
|
||||
/// singletons registered by <c>AddAuditLogCentralReconciliationClient</c>);
|
||||
/// in the test path they are injected directly. They are <c>null</c> when
|
||||
/// the actor was built via the repo-only test ctor — in that case the
|
||||
/// reconciliation tick is NOT started (see <see cref="StartReconciliationTimer"/>);
|
||||
/// the purge tick gates on the same collaborators (see <see cref="StartPurgeTimer"/>).
|
||||
/// reconciliation tick is NOT started (see <see cref="StartReconciliationTimer"/>).
|
||||
/// The purge tick, by contrast, does NOT depend on these collaborators
|
||||
/// (SiteCallAudit-007): it needs only the repository, so it is armed by
|
||||
/// <see cref="_backgroundTimersEnabled"/> alone (see <see cref="StartPurgeTimer"/>).
|
||||
/// </summary>
|
||||
private readonly IPullSiteCallsClient? _pullClient;
|
||||
private readonly ISiteEnumerator? _siteEnumerator;
|
||||
|
||||
/// <summary>
|
||||
/// Master switch for the two background schedulers (reconciliation + purge),
|
||||
/// set <c>true</c> by the production and reconciliation ctors and <c>false</c>
|
||||
/// by the repo-only MSSQL test ctor. SiteCallAudit-007: the purge timer is
|
||||
/// gated on THIS flag rather than on the reconciliation collaborators, so a
|
||||
/// host that omits the reconciliation client still purges (no unbounded
|
||||
/// central <c>SiteCalls</c> growth) while the MSSQL read/upsert tests stay
|
||||
/// free of any scheduled side effects.
|
||||
/// </summary>
|
||||
private readonly bool _backgroundTimersEnabled;
|
||||
|
||||
/// <summary>
|
||||
/// Per-site reconciliation watermark — the highest
|
||||
/// <see cref="SiteCall.UpdatedAtUtc"/> seen for that site on a previous
|
||||
@@ -123,9 +153,11 @@ public class SiteCallAuditActor : ReceiveActor
|
||||
/// An optional <paramref name="options"/> lets a test pin the stuck/KPI
|
||||
/// windows; when omitted the production defaults apply.
|
||||
/// <para>
|
||||
/// This ctor injects NO reconciliation client/enumerator, so the
|
||||
/// reconciliation tick is gated off (see <see cref="StartReconciliationTimer"/>)
|
||||
/// — the MSSQL-backed read/upsert tests must not fire phantom pulls.
|
||||
/// This ctor disables BOTH background timers (sets
|
||||
/// <see cref="_backgroundTimersEnabled"/> to <c>false</c>) and injects no
|
||||
/// reconciliation client/enumerator, so neither the reconciliation tick nor
|
||||
/// the purge tick fires — the MSSQL-backed read/upsert tests must see no
|
||||
/// scheduled side effects (no phantom pulls, no background purge).
|
||||
/// </para>
|
||||
/// </summary>
|
||||
/// <param name="repository">Concrete repository instance to use for all messages.</param>
|
||||
@@ -143,6 +175,10 @@ public class SiteCallAuditActor : ReceiveActor
|
||||
_logger = logger;
|
||||
_options = options ?? new SiteCallAuditOptions();
|
||||
|
||||
// Repo-only MSSQL test ctor: keep BOTH background timers off so the
|
||||
// read/upsert tests see no scheduled side effects (SiteCallAudit-007).
|
||||
_backgroundTimersEnabled = false;
|
||||
|
||||
RegisterHandlers();
|
||||
}
|
||||
|
||||
@@ -150,10 +186,10 @@ public class SiteCallAuditActor : ReceiveActor
|
||||
/// Test-mode constructor for the reconciliation tick (Piece A) — injects a
|
||||
/// concrete repository PLUS the two reconciliation collaborators directly,
|
||||
/// so the per-site self-heal pull is unit-testable in-memory without a DI
|
||||
/// container or a live gRPC channel. Because the client + enumerator are
|
||||
/// present, the reconciliation tick IS started; the purge tick is also
|
||||
/// started (both gate on the collaborators being available — see
|
||||
/// <see cref="StartReconciliationTimer"/> / <see cref="StartPurgeTimer"/>).
|
||||
/// container or a live gRPC channel. Background timers are enabled, so the
|
||||
/// purge tick starts (it needs only the repository) and the reconciliation
|
||||
/// tick starts too because the client + enumerator are present — see
|
||||
/// <see cref="StartReconciliationTimer"/> / <see cref="StartPurgeTimer"/>.
|
||||
/// </summary>
|
||||
/// <param name="repository">Concrete repository instance used for upserts and purges.</param>
|
||||
/// <param name="siteEnumerator">Enumerates the sites to reconcile each tick.</param>
|
||||
@@ -186,6 +222,11 @@ public class SiteCallAuditActor : ReceiveActor
|
||||
_logger = logger;
|
||||
_options = options ?? new SiteCallAuditOptions();
|
||||
|
||||
// Reconciliation test ctor: collaborators present, so both timers arm
|
||||
// (the reconciliation tick uses the collaborators; the purge tick needs
|
||||
// only the repository).
|
||||
_backgroundTimersEnabled = true;
|
||||
|
||||
RegisterHandlers();
|
||||
}
|
||||
|
||||
@@ -223,6 +264,12 @@ public class SiteCallAuditActor : ReceiveActor
|
||||
_pullClient = serviceProvider.GetService<IPullSiteCallsClient>();
|
||||
_siteEnumerator = serviceProvider.GetService<ISiteEnumerator>();
|
||||
|
||||
// Production path: background timers run. The purge tick is armed
|
||||
// unconditionally here (it needs only the repository); the reconciliation
|
||||
// tick additionally requires its collaborators and logs a Warning if
|
||||
// they were not registered (SiteCallAudit-007).
|
||||
_backgroundTimersEnabled = true;
|
||||
|
||||
RegisterHandlers();
|
||||
}
|
||||
|
||||
@@ -275,16 +322,30 @@ public class SiteCallAuditActor : ReceiveActor
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Starts the periodic reconciliation tick — but ONLY when both the pull
|
||||
/// client and the site enumerator are available. The repo-only test ctor
|
||||
/// injects neither, so the tick is gated off there (the MSSQL read/upsert
|
||||
/// tests must not fire phantom pulls); the reconciliation test ctor and the
|
||||
/// production ctor (which resolves both from the SP) start it.
|
||||
/// Starts the periodic reconciliation tick — but ONLY when background timers
|
||||
/// are enabled AND both the pull client and the site enumerator are
|
||||
/// available. The repo-only test ctor disables background timers, so the tick
|
||||
/// is gated off there (the MSSQL read/upsert tests must not fire phantom
|
||||
/// pulls); the reconciliation test ctor and the production ctor (which
|
||||
/// resolves both from the SP) start it. SiteCallAudit-007: when background
|
||||
/// timers are enabled but the collaborators were not registered, log a
|
||||
/// Warning so a misconfigured host surfaces the missing self-heal rather than
|
||||
/// silently skipping it.
|
||||
/// </summary>
|
||||
private void StartReconciliationTimer()
|
||||
{
|
||||
if (!_backgroundTimersEnabled)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
if (_pullClient is null || _siteEnumerator is null)
|
||||
{
|
||||
_logger.LogWarning(
|
||||
"SiteCallAudit reconciliation timer not started — the reconciliation "
|
||||
+ "collaborators (IPullSiteCallsClient / ISiteEnumerator) were not registered; "
|
||||
+ "lost cached-call telemetry will not self-heal until they are wired up. "
|
||||
+ "The daily terminal-row purge still runs.");
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -298,15 +359,20 @@ public class SiteCallAuditActor : ReceiveActor
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Starts the daily purge tick — gated on the same collaborator presence as
|
||||
/// the reconciliation tick. The purge itself only needs the repository, but
|
||||
/// gating both schedulers together keeps the repo-only test ctor (no
|
||||
/// client/enumerator) free of BOTH background timers, so the MSSQL read/
|
||||
/// upsert tests see no scheduled side effects.
|
||||
/// Starts the daily purge tick. SiteCallAudit-007: the purge needs ONLY the
|
||||
/// repository — never the reconciliation collaborators — so it is gated on
|
||||
/// <see cref="_backgroundTimersEnabled"/> alone, NOT on
|
||||
/// <see cref="_pullClient"/> / <see cref="_siteEnumerator"/>. This decouples
|
||||
/// the daily terminal-row purge from the reconciliation client: a host that
|
||||
/// registers Site Call Audit without the reconciliation client still purges,
|
||||
/// so the central <c>SiteCalls</c> table can never grow unbounded just
|
||||
/// because the self-heal puller is absent. Only the repo-only MSSQL test ctor
|
||||
/// (background timers off) skips it, keeping the read/upsert tests free of
|
||||
/// scheduled side effects.
|
||||
/// </summary>
|
||||
private void StartPurgeTimer()
|
||||
{
|
||||
if (_pullClient is null || _siteEnumerator is null)
|
||||
if (!_backgroundTimersEnabled)
|
||||
{
|
||||
return;
|
||||
}
|
||||
@@ -501,37 +567,103 @@ public class SiteCallAuditActor : ReceiveActor
|
||||
/// deduplicated by the idempotent monotonic upsert — the same inclusive-boundary
|
||||
/// contract as <c>SiteAuditReconciliationActor</c>'s cursor.
|
||||
/// </para>
|
||||
/// <para>
|
||||
/// <b>SiteCallAudit-009: consumes <see cref="PullSiteCallsResponse.MoreAvailable"/>
|
||||
/// to guarantee forward progress.</b> Whereas the prior implementation ignored
|
||||
/// the flag entirely and relied solely on the tick cadence, this method now
|
||||
/// continues pulling within the same tick while the site reports
|
||||
/// <c>MoreAvailable=true</c>, bounded by
|
||||
/// <see cref="MaxReconciliationPagesPerTick"/>. This closes the
|
||||
/// single-timestamp-saturation edge: if a backlog larger than
|
||||
/// <see cref="SiteCallAuditOptions.ReconciliationBatchSize"/> all shares one
|
||||
/// exact <see cref="SiteCall.UpdatedAtUtc"/>, the inclusive max-timestamp cursor
|
||||
/// cannot advance, so the previous code re-pulled the identical window forever
|
||||
/// across ticks and never drained the tail. Here, a saturated batch whose
|
||||
/// observed max timestamp did NOT advance past <c>since</c> is detected as a
|
||||
/// no-progress pin: the loop stops and logs a Warning (the same observability
|
||||
/// intent as the sibling's stalled signal, without its EventStream state
|
||||
/// machine), so the pathological site surfaces rather than spinning silently.
|
||||
/// This diverges from <c>SiteAuditReconciliationActor</c>, which reads
|
||||
/// <c>MoreAvailable</c> to drive a <c>SiteAuditTelemetryStalledChanged</c>
|
||||
/// stalled-detection state machine instead of a within-tick continuation drain.
|
||||
/// </para>
|
||||
/// </remarks>
|
||||
private async Task ReconcileSiteAsync(
|
||||
SiteEntry site, IPullSiteCallsClient client, ISiteCallAuditRepository repository)
|
||||
{
|
||||
var since = _reconciliationCursors.TryGetValue(site.SiteId, out var c) ? c : DateTime.MinValue;
|
||||
var response = await client
|
||||
.PullAsync(site.SiteId, since, _options.ReconciliationBatchSize, CancellationToken.None)
|
||||
.ConfigureAwait(false);
|
||||
var cursor = _reconciliationCursors.TryGetValue(site.SiteId, out var c) ? c : DateTime.MinValue;
|
||||
|
||||
var maxUpdated = since;
|
||||
var nowUtc = DateTime.UtcNow;
|
||||
foreach (var row in response.SiteCalls)
|
||||
// SiteCallAudit-009: drain within the tick while the site keeps reporting
|
||||
// MoreAvailable, bounded by MaxReconciliationPagesPerTick so a misbehaving
|
||||
// site can never spin the dispatcher. Each page advances the in-flight
|
||||
// cursor; a saturated page that fails to advance the cursor is the
|
||||
// single-timestamp no-progress pin — break and surface it.
|
||||
for (var page = 0; page < MaxReconciliationPagesPerTick; page++)
|
||||
{
|
||||
// IngestedAtUtc is the "central ingested (or last refreshed) this
|
||||
// row" stamp — owned by the central actor, exactly as OnUpsertAsync
|
||||
// does for the telemetry path. Monotonic UpsertAsync makes a row
|
||||
// already present (from a prior push) a silent no-op.
|
||||
var siteCall = row with { IngestedAtUtc = nowUtc };
|
||||
await repository.UpsertAsync(siteCall).ConfigureAwait(false);
|
||||
var since = cursor;
|
||||
var response = await client
|
||||
.PullAsync(site.SiteId, since, _options.ReconciliationBatchSize, CancellationToken.None)
|
||||
.ConfigureAwait(false);
|
||||
|
||||
if (row.UpdatedAtUtc > maxUpdated)
|
||||
var maxUpdated = since;
|
||||
var nowUtc = DateTime.UtcNow;
|
||||
foreach (var row in response.SiteCalls)
|
||||
{
|
||||
maxUpdated = row.UpdatedAtUtc;
|
||||
// IngestedAtUtc is the "central ingested (or last refreshed) this
|
||||
// row" stamp — owned by the central actor, exactly as OnUpsertAsync
|
||||
// does for the telemetry path. Monotonic UpsertAsync makes a row
|
||||
// already present (from a prior push) a silent no-op.
|
||||
var siteCall = row with { IngestedAtUtc = nowUtc };
|
||||
await repository.UpsertAsync(siteCall).ConfigureAwait(false);
|
||||
|
||||
if (row.UpdatedAtUtc > maxUpdated)
|
||||
{
|
||||
maxUpdated = row.UpdatedAtUtc;
|
||||
}
|
||||
}
|
||||
|
||||
// Persist the advanced cursor after every page so a fault on a later
|
||||
// page (caught per-site upstream) still keeps the rows already drained.
|
||||
cursor = maxUpdated;
|
||||
_reconciliationCursors[site.SiteId] = cursor;
|
||||
|
||||
if (!response.MoreAvailable)
|
||||
{
|
||||
// Backlog fully drained for this site this tick.
|
||||
return;
|
||||
}
|
||||
|
||||
if (maxUpdated <= since)
|
||||
{
|
||||
// No-progress pin: the site saturated the batch yet the max
|
||||
// observed UpdatedAtUtc did not advance past the inclusive cursor
|
||||
// (a burst of > batch-size rows sharing one exact timestamp).
|
||||
// Continuing would re-pull the identical window forever, so stop
|
||||
// and surface it — the inclusive max-timestamp cursor cannot make
|
||||
// progress on this input without a composite (timestamp,id)
|
||||
// keyset, which the pull contract does not yet support.
|
||||
_logger.LogWarning(
|
||||
"SiteCallAudit reconciliation for site {SiteId} cannot make progress: a saturated "
|
||||
+ "batch of more than {BatchSize} rows shares a single UpdatedAtUtc ({CursorUtc:o}), "
|
||||
+ "so the inclusive cursor is pinned. The backlog tail beyond the batch ceiling will "
|
||||
+ "not reconcile until those rows' timestamps differ.",
|
||||
site.SiteId,
|
||||
_options.ReconciliationBatchSize,
|
||||
since);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
// Advance the cursor to the newest row seen. A MoreAvailable response
|
||||
// means the site saturated the batch; the next tick continues draining
|
||||
// from the advanced cursor (no immediate re-pull loop — the natural
|
||||
// tick cadence drains the backlog, matching SiteAuditReconciliationActor).
|
||||
_reconciliationCursors[site.SiteId] = maxUpdated;
|
||||
// Hit the within-tick page ceiling while MoreAvailable was still true:
|
||||
// the cursor advanced each page (so the backlog IS draining), there is
|
||||
// simply more than MaxReconciliationPagesPerTick × batch-size of it. The
|
||||
// next tick resumes from the advanced cursor.
|
||||
_logger.LogInformation(
|
||||
"SiteCallAudit reconciliation for site {SiteId} hit the per-tick page ceiling "
|
||||
+ "({MaxPages} pages); the cursor advanced each page and the remaining backlog "
|
||||
+ "drains on the next tick.",
|
||||
site.SiteId,
|
||||
MaxReconciliationPagesPerTick);
|
||||
}
|
||||
|
||||
// ── Piece B: daily terminal-row purge scheduler ──
|
||||
|
||||
Reference in New Issue
Block a user