fix(review): full code-review remediation — 5 High + Medium/Low across 16 modules
Remediation from the full per-module code review at 4307c381 (findings recorded
separately in code-reviews/).
Highs fixed:
- DeploymentManager-025/SiteRuntime-031: stop broadcasting notification lists + SMTP
configs (incl. credentials) to sites; site purges already-persisted rows on apply
(enforces the central-only delivery design; clears plaintext SMTP creds at rest).
- DataConnectionLayer-023: guard the native-alarm subscribe path against the
mid-flight-unsubscribe adapter-feed leak (mirrors the DCL-021 tag-path fix).
- SiteEventLogging-024: normalize From/To query bounds to UTC (the -016 fix the
audit trail claimed but never committed).
- KpiHistory-001: add an in-flight guard to the recorder sample tick.
- ScriptAnalysis-001: harden the trust analyzer's TPA-absent fallback (resolve
forbidden anchors in the minimal reference set; warn on degraded mode) — anchors
added to validation references only, never the compile gate.
(InboundAPI-026 left to the feat/ipsen-movein effort per owner decision.)
Medium/Low: DM-026 deterministic deploy-status tiebreaker; SR-027/028/029/030
native-alarm leak/phantom-active/delete-during-redeploy fixes; AL-013/014/016;
TE-024 (folder-mutation audit rows now persisted)/025; SF-025 gauge-provider
clear-on-stop; ESG-025/026; SEC-023/024/025; SCA-007/008/009; plus doc/test
accuracy COM-023/024, HOST-025/026, HM-024/025, NS-027/028.
Full-solution build 0 warnings; ~3560 tests across 18 touched suites green.
This commit is contained in:
@@ -11,12 +11,14 @@ using ZB.MOM.WW.ScadaBridge.ConfigurationDatabase;
|
||||
namespace ZB.MOM.WW.ScadaBridge.AuditLog.Central;
|
||||
|
||||
/// <summary>
|
||||
/// Central-side singleton (per Bundle E wiring) that ingests batches of
|
||||
/// Central-side cluster singleton that ingests batches of
|
||||
/// <see cref="AuditEvent"/> rows pushed from sites via the
|
||||
/// <c>IngestAuditEvents</c> gRPC RPC. Each row is stamped with the central-side
|
||||
/// the central-side IngestedAtUtc (in DetailsJson) and inserted idempotently via
|
||||
/// <see cref="IAuditLogRepository.InsertIfNotExistsAsync"/> — duplicates are
|
||||
/// silently swallowed (first-write-wins per Bundle A's hardening).
|
||||
/// ingest timestamp into DetailsJson (there is no promoted IngestedAtUtc
|
||||
/// column — the value is a DetailsJson field set via
|
||||
/// <see cref="AuditRowProjection.WithIngestedAtUtc"/>) and inserted idempotently
|
||||
/// via <see cref="IAuditLogRepository.InsertIfNotExistsAsync"/> — duplicates are
|
||||
/// silently swallowed (first-write-wins).
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// <para>
|
||||
@@ -25,23 +27,24 @@ namespace ZB.MOM.WW.ScadaBridge.AuditLog.Central;
|
||||
/// consistent and the site is free to flip its local row to <c>Forwarded</c>.
|
||||
/// </para>
|
||||
/// <para>
|
||||
/// Per Bundle D's brief, audit-write failures must NEVER abort the user-facing
|
||||
/// action. The actor wraps each repository call in its own try/catch so a
|
||||
/// single bad row cannot cause the rest of the batch to be lost — that
|
||||
/// per-row catch is what keeps this actor alive across handler throws, not
|
||||
/// the supervisor strategy. The <see cref="SupervisorStrategy"/> override
|
||||
/// returns the Akka default decider (Restart for most exceptions) and
|
||||
/// governs children only; this actor has no children today, so the override
|
||||
/// is a forward-compat placeholder.
|
||||
/// Audit-write failures must NEVER abort the user-facing action. The actor
|
||||
/// wraps each repository call in its own try/catch so a single bad row cannot
|
||||
/// cause the rest of the batch to be lost, and it guards scope/repository
|
||||
/// resolution so a transient DI fault cannot restart the singleton — those
|
||||
/// catches are what keep this actor alive across handler throws, not the
|
||||
/// supervisor strategy. The <see cref="SupervisorStrategy"/> override returns
|
||||
/// the Akka default decider (Restart for most exceptions) and governs children
|
||||
/// only; this actor has no children today, so the override is a forward-compat
|
||||
/// placeholder.
|
||||
/// </para>
|
||||
/// <para>
|
||||
/// Two constructors exist for a deliberate reason: Bundle D's tests inject a
|
||||
/// Two constructors exist for a deliberate reason: the test ctor injects a
|
||||
/// concrete <see cref="IAuditLogRepository"/> against a per-test MSSQL fixture
|
||||
/// (the only way to verify the IngestedAtUtc stamp + duplicate-key idempotency
|
||||
/// end to end), while Bundle E's host wiring registers the actor as a cluster
|
||||
/// singleton and must therefore resolve the repository — which is a scoped EF
|
||||
/// Core service — from a fresh DI scope per message. Mirroring the Notification
|
||||
/// Outbox actor's pattern.
|
||||
/// (the only way to verify the ingest-timestamp stamp + duplicate-key
|
||||
/// idempotency end to end), while the production host wiring registers the
|
||||
/// actor as a cluster singleton and must therefore resolve the repository —
|
||||
/// which is a scoped EF Core service — from a fresh DI scope per message.
|
||||
/// This mirrors the Notification Outbox actor's pattern.
|
||||
/// </para>
|
||||
/// </remarks>
|
||||
public class AuditLogIngestActor : ReceiveActor
|
||||
@@ -53,7 +56,7 @@ public class AuditLogIngestActor : ReceiveActor
|
||||
/// <summary>
|
||||
/// Test-mode constructor — injects a concrete repository instance whose
|
||||
/// lifetime exceeds the test, so the actor reuses the same instance across
|
||||
/// every message. Used by Bundle D's MSSQL-backed TestKit fixture.
|
||||
/// every message. Used by the MSSQL-backed TestKit fixture.
|
||||
/// </summary>
|
||||
/// <param name="repository">Audit log repository instance shared across all messages.</param>
|
||||
/// <param name="logger">Logger for ingest diagnostics.</param>
|
||||
@@ -116,13 +119,12 @@ public class AuditLogIngestActor : ReceiveActor
|
||||
|
||||
// Resolve the repository for the whole batch — one DbContext per
|
||||
// message, mirroring NotificationOutboxActor. The injected-repository
|
||||
// mode (Bundle D tests) skips the scope entirely.
|
||||
// Bundle C (M5-T6): the IAuditRedactor is also resolved from the
|
||||
// per-message scope when one is available so the row is truncated +
|
||||
// redacted before InsertIfNotExistsAsync. The single-repository test
|
||||
// ctor has no service provider — it falls through with no redactor,
|
||||
// which preserves the small-payload assumptions baked into the
|
||||
// existing D2 fixtures.
|
||||
// mode (test ctor) skips the scope entirely.
|
||||
// The IAuditRedactor is also resolved from the per-message scope when
|
||||
// one is available so the row is truncated + redacted before
|
||||
// InsertIfNotExistsAsync. The single-repository test ctor has no
|
||||
// service provider — it falls through with no redactor, which preserves
|
||||
// the small-payload assumptions baked into the existing fixtures.
|
||||
// AuditLog-003: use CreateAsyncScope + await using so scoped EF Core
|
||||
// services (IAsyncDisposable DbContexts) dispose asynchronously
|
||||
// without blocking on sync Dispose() of pending connection cleanup.
|
||||
@@ -133,15 +135,42 @@ public class AuditLogIngestActor : ReceiveActor
|
||||
}
|
||||
else
|
||||
{
|
||||
await using var scope = _serviceProvider!.CreateAsyncScope();
|
||||
var repository = scope.ServiceProvider.GetRequiredService<IAuditLogRepository>();
|
||||
var redactor = scope.ServiceProvider.GetService<IAuditRedactor>();
|
||||
// M6 Bundle E (T8): central health counter is best-effort —
|
||||
// unregistered (test composition roots) means the per-row catch
|
||||
// simply logs without surfacing on the health dashboard.
|
||||
var failureCounter = scope.ServiceProvider.GetService<ICentralAuditWriteFailureCounter>();
|
||||
await IngestWithRepositoryAsync(repository, redactor, failureCounter, cmd, nowUtc, accepted)
|
||||
.ConfigureAwait(false);
|
||||
// AuditLog-014: guard scope-creation + repository resolution in a
|
||||
// try/catch, mirroring OnCachedTelemetryAsync. A transient DI /
|
||||
// DbContext-factory fault (pooled-context init, SQL-connection
|
||||
// exhaustion, a resolution race during host churn) would otherwise
|
||||
// propagate out of the ReceiveAsync handler, trip the parent's
|
||||
// supervision, and RESTART this central singleton over a transient
|
||||
// fault — dropping the captured reply so the site's Ask times out.
|
||||
// Best-effort audit must never wedge the singleton: log, optionally
|
||||
// bump the failure counter, and still reply with whatever was
|
||||
// accepted (empty on an up-front scope-resolution throw) so the
|
||||
// site keeps its rows Pending and retries on the next drain.
|
||||
try
|
||||
{
|
||||
await using var scope = _serviceProvider!.CreateAsyncScope();
|
||||
var repository = scope.ServiceProvider.GetRequiredService<IAuditLogRepository>();
|
||||
var redactor = scope.ServiceProvider.GetService<IAuditRedactor>();
|
||||
// M6 Bundle E (T8): central health counter is best-effort —
|
||||
// unregistered (test composition roots) means the per-row catch
|
||||
// simply logs without surfacing on the health dashboard.
|
||||
var failureCounter = scope.ServiceProvider.GetService<ICentralAuditWriteFailureCounter>();
|
||||
await IngestWithRepositoryAsync(repository, redactor, failureCounter, cmd, nowUtc, accepted)
|
||||
.ConfigureAwait(false);
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
// Scope creation or a required-service resolution threw before
|
||||
// (or while) processing the batch. Surface a sustained fault on
|
||||
// the dashboard if the counter is registered, but never let the
|
||||
// throw escape the handler and restart the singleton.
|
||||
try { _serviceProvider!.GetService<ICentralAuditWriteFailureCounter>()?.Increment(); }
|
||||
catch { /* counter must never throw — defence in depth */ }
|
||||
_logger.LogError(
|
||||
ex,
|
||||
"Audit event batch ingest failed before/while resolving the repository scope; replying with {Accepted} accepted row(s). The site keeps unaccepted rows Pending and retries on the next drain.",
|
||||
accepted.Count);
|
||||
}
|
||||
}
|
||||
|
||||
replyTo.Tell(new IngestAuditEventsReply(accepted));
|
||||
@@ -159,10 +188,10 @@ public class AuditLogIngestActor : ReceiveActor
|
||||
{
|
||||
try
|
||||
{
|
||||
// Stamp IngestedAtUtc here, not at the site. Bundle A's
|
||||
// repository hardening already swallows duplicate-key races,
|
||||
// so the same id arriving twice (site retry, reconciliation)
|
||||
// is a silent no-op.
|
||||
// Stamp the ingest timestamp here, not at the site. The
|
||||
// repository's duplicate-key hardening already swallows
|
||||
// duplicate-key races, so the same id arriving twice (site
|
||||
// retry, reconciliation) is a silent no-op.
|
||||
// Redact BEFORE the IngestedAtUtc stamp so the redacted
|
||||
// copy carries the central-side ingest timestamp. The redactor
|
||||
// is contract-bound to never throw. AuditLog-008: a null
|
||||
|
||||
@@ -1,3 +1,5 @@
|
||||
using Microsoft.Extensions.Configuration;
|
||||
|
||||
namespace ZB.MOM.WW.ScadaBridge.AuditLog.Central;
|
||||
|
||||
/// <summary>
|
||||
@@ -37,6 +39,14 @@ public sealed class AuditLogPurgeOptions
|
||||
/// a large backlog within a tick. Clamped to a sane minimum in
|
||||
/// <see cref="ChannelPurgeBatchSize"/>.
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// AuditLog-013: the operator-facing config key is <c>ChannelPurgeBatchSize</c>
|
||||
/// (per Component-AuditLog.md), so the binder maps that documented key onto this
|
||||
/// backing property via <see cref="ConfigurationKeyNameAttribute"/>. The unattributed
|
||||
/// property name (<c>ChannelPurgeBatchSizeConfigured</c>) would otherwise have been
|
||||
/// the bind key, silently ignoring the documented section.
|
||||
/// </remarks>
|
||||
[ConfigurationKeyName("ChannelPurgeBatchSize")]
|
||||
public int ChannelPurgeBatchSizeConfigured { get; set; } = 5000;
|
||||
|
||||
/// <summary>
|
||||
|
||||
@@ -91,4 +91,29 @@ public static class ScadaBridgeTelemetry
|
||||
|
||||
Volatile.Write(ref _queueDepthProvider, provider);
|
||||
}
|
||||
|
||||
/// <summary>
/// Deregisters a StoreAndForward queue-depth provider from the process-global slot,
/// but only when the slot still holds the exact <paramref name="provider"/> delegate
/// supplied by the caller (reference-equal compare-and-clear).
/// </summary>
/// <remarks>
/// Counterpart to <see cref="SetQueueDepthProvider"/>. A service instance can call this
/// on graceful stop without clobbering a newer instance that has already re-registered:
/// when the stopping instance passes its superseded delegate, the identity comparison
/// fails and the slot is left untouched. Once the slot is cleared the gauge is expected
/// to fall back to reporting 0.
/// </remarks>
/// <param name="provider">The delegate to remove. A <c>null</c> argument is a no-op, as
/// is any delegate other than the one currently registered.</param>
public static void ClearQueueDepthProvider(Func<long> provider)
{
    if (provider is not null)
    {
        // Atomic compare-and-clear: the slot is nulled only while it still references
        // the caller's own delegate, so a late stop of an old instance cannot wipe out
        // the provider registered by its successor.
        Interlocked.CompareExchange(ref _queueDepthProvider, null, provider);
    }
}
|
||||
}
|
||||
|
||||
@@ -16,8 +16,12 @@ public static class KpiSeriesBucketer
|
||||
/// Empty buckets are omitted — no gap-filling.
|
||||
/// </summary>
|
||||
/// <param name="raw">
|
||||
/// Input series, assumed to be sorted ascending by <see cref="KpiSeriesPoint.BucketStartUtc"/>.
|
||||
/// If not sorted, the point with the largest timestamp within each bucket is selected.
|
||||
/// Input series, which must be sorted ascending by <see cref="KpiSeriesPoint.BucketStartUtc"/>.
|
||||
/// For sorted input the last point in iteration order within a bucket is the one with the
|
||||
/// largest timestamp (the intended last-value-per-bucket result). For unsorted input the
|
||||
/// method still selects the last point in iteration order within each bucket — it does
|
||||
/// <em>not</em> pick the largest-timestamp point — so the result is well-defined but not
|
||||
/// the last-value semantics callers expect; pre-sort the series first.
|
||||
/// If <c>null</c> or empty, an empty list is returned.
|
||||
/// </param>
|
||||
/// <param name="fromUtc">UTC start of the query window (inclusive).</param>
|
||||
@@ -85,8 +89,12 @@ public static class KpiSeriesBucketer
|
||||
if (bucketIndex >= maxPoints)
|
||||
bucketIndex = maxPoints - 1;
|
||||
|
||||
// Keep the point with the highest timestamp in this bucket
|
||||
// (last-value semantics; if ties, keep first encountered — stable).
|
||||
// Keep the last point in iteration order within this bucket. Because the stored
|
||||
// candidate's BucketStartUtc is the bucket-START timestamp (not the raw point's
|
||||
// capture time), the comparison below is true for essentially any in-bucket point,
|
||||
// so each later-in-iteration point overwrites the previous one. For the ascending-
|
||||
// sorted input this method requires, last-in-iteration IS the largest-timestamp
|
||||
// point — i.e. last-value-per-bucket semantics.
|
||||
if (!occupied[bucketIndex] ||
|
||||
point.BucketStartUtc > best[bucketIndex].BucketStartUtc)
|
||||
{
|
||||
|
||||
+8
@@ -54,9 +54,17 @@ public class DeploymentManagerRepository : IDeploymentManagerRepository
|
||||
/// <inheritdoc />
public async Task<DeploymentRecord?> GetCurrentDeploymentStatusAsync(int instanceId, CancellationToken cancellationToken = default)
{
    // DeploymentManager-026: deployment records are insert-only (one row per deploy
    // attempt), so two rows for the same instance can share a DeployedAt value when
    // they are written within one clock tick (a rapid redeploy, or a redeploy right
    // after a timed-out attempt). SQL Server does not define an order between equal
    // sort keys, so without a tiebreaker reconciliation could read the wrong
    // "current" row. Sorting by Id as the secondary key makes the result
    // deterministic: among tied rows the highest Id (the latest insert) is returned.
    var newestFirst = _dbContext.DeploymentRecords
        .Where(record => record.InstanceId == instanceId)
        .OrderByDescending(record => record.DeployedAt)
        .ThenByDescending(record => record.Id);

    return await newestFirst.FirstOrDefaultAsync(cancellationToken);
}
|
||||
|
||||
|
||||
@@ -99,7 +99,12 @@ public class DataConnectionActor : UntypedActor, IWithStash, IWithTimers
|
||||
// routed to subscribers (NativeAlarmActors) by source-object reference.
|
||||
/// <summary>sourceReference → set of subscriber actor refs (NativeAlarmActors), for routing + ref-count.</summary>
|
||||
private readonly Dictionary<string, HashSet<IActorRef>> _alarmSourceSubscribers = new();
|
||||
/// <summary>sourceReference → raw condition filter string passed to the adapter (first subscriber wins).</summary>
|
||||
/// <summary>
|
||||
/// sourceReference → raw condition filter string passed to the adapter (last subscriber wins).
|
||||
/// The shared feed carries a single filter: <see cref="HandleSubscribeAlarms"/> overwrites it
|
||||
/// unconditionally on every subscribe, so co-subscribers to one source reference must agree on
|
||||
/// the condition filter (a second subscriber's filter re-gates the first subscriber's transitions).
|
||||
/// </summary>
|
||||
private readonly Dictionary<string, string?> _alarmSourceFilter = new();
|
||||
/// <summary>
|
||||
/// sourceReference → parsed condition-type predicate (M2.4 / #8). The authoritative
|
||||
@@ -1791,6 +1796,30 @@ public class DataConnectionActor : UntypedActor, IWithStash, IWithTimers
|
||||
{
|
||||
_alarmSubscribesInFlight.Remove(msg.SourceReference);
|
||||
|
||||
// DataConnectionLayer-023: the last (or only) subscriber may have been
|
||||
// unsubscribed while this alarm subscribe was in flight. HandleUnsubscribeAlarms
|
||||
// emptied/removed _alarmSourceSubscribers for the source but could not tear down
|
||||
// the adapter feed because the subscription id was not stored yet. Mirror the
|
||||
// DCL-021 tag-path guard: if no subscriber remains, release the just-created
|
||||
// adapter feed instead of storing an orphaned subscription id that would stream
|
||||
// transitions to nobody for the lifetime of the adapter.
|
||||
if (!_alarmSourceSubscribers.ContainsKey(msg.SourceReference))
|
||||
{
|
||||
if (msg.Success && msg.SubscriptionId != null &&
|
||||
_adapter is IAlarmSubscribableConnection alarmable)
|
||||
{
|
||||
_log.Warning(
|
||||
"[{0}] AlarmSubscribeCompleted arrived for source {1} but the last " +
|
||||
"subscriber unsubscribed while the subscribe was in flight; releasing " +
|
||||
"the orphaned adapter alarm feed.",
|
||||
_connectionName, msg.SourceReference);
|
||||
_ = alarmable.UnsubscribeAlarmsAsync(msg.SubscriptionId);
|
||||
}
|
||||
|
||||
// No live requester remains to receive a response.
|
||||
return;
|
||||
}
|
||||
|
||||
if (msg.Success && msg.SubscriptionId != null)
|
||||
{
|
||||
_alarmSubscriptionIds[msg.SourceReference] = msg.SubscriptionId;
|
||||
@@ -1874,6 +1903,11 @@ public class DataConnectionActor : UntypedActor, IWithStash, IWithTimers
|
||||
_alarmSourceSubscribers.Remove(request.SourceReference);
|
||||
_alarmSourceFilter.Remove(request.SourceReference);
|
||||
_alarmSourceFilterPredicate.Remove(request.SourceReference);
|
||||
// DataConnectionLayer-023: clear the in-flight marker so that if an adapter
|
||||
// subscribe is still in flight for this source, the late AlarmSubscribeCompleted
|
||||
// is recognized as orphaned (its guard checks _alarmSourceSubscribers, now empty)
|
||||
// and the just-created feed is released rather than stored and leaked.
|
||||
_alarmSubscribesInFlight.Remove(request.SourceReference);
|
||||
if (_alarmSubscriptionIds.Remove(request.SourceReference, out var subId) &&
|
||||
_adapter is IAlarmSubscribableConnection alarmable)
|
||||
{
|
||||
|
||||
@@ -277,6 +277,19 @@ public class MxGatewayDataConnection : IDataConnection, IBrowsableDataConnection
|
||||
public async ValueTask DisposeAsync()
|
||||
{
|
||||
_eventLoopCts?.Cancel();
|
||||
// DataConnectionLayer-025: the DataConnectionActor disposes adapters
|
||||
// fire-and-forget on failover/stop without necessarily calling
|
||||
// DisconnectAsync first, so tear down the alarm stream here too — otherwise
|
||||
// the long-running RunAlarmStreamAsync task and its CTS leak on every
|
||||
// MxGateway failover/teardown that goes through DisposeAsync. Mirror the
|
||||
// lock-guarded block already in DisconnectAsync.
|
||||
lock (_alarmLock)
|
||||
{
|
||||
_alarmCts?.Cancel();
|
||||
_alarmCts?.Dispose();
|
||||
_alarmCts = null;
|
||||
_alarmSubCount = 0;
|
||||
}
|
||||
if (_client is not null)
|
||||
await _client.DisposeAsync();
|
||||
GC.SuppressFinalize(this);
|
||||
|
||||
@@ -12,8 +12,16 @@ namespace ZB.MOM.WW.ScadaBridge.DeploymentManager;
|
||||
|
||||
/// <summary>
|
||||
/// WP-7: System-wide artifact deployment.
|
||||
/// Broadcasts artifacts (shared scripts, external systems, notification lists, DB connections,
|
||||
/// data connections, and SMTP configurations) to all sites with per-site tracking.
|
||||
/// Broadcasts artifacts (shared scripts, external systems, DB connections, and
|
||||
/// data connections) to all sites with per-site tracking.
|
||||
///
|
||||
/// Notification lists and SMTP configuration are deliberately NOT shipped to
|
||||
/// sites: notification delivery is central-only (sites store-and-forward to
|
||||
/// central and never talk to SMTP), so no notification artifact or SMTP
|
||||
/// credential is ever distributed to a site. The
|
||||
/// <see cref="DeployArtifactsCommand"/> still carries the
|
||||
/// <c>NotificationLists</c>/<c>SmtpConfigurations</c> fields for additive
|
||||
/// message-contract compatibility, but central never populates them.
|
||||
///
|
||||
/// - Successful sites are NOT rolled back on other failures.
|
||||
/// - Failed sites are retryable individually.
|
||||
@@ -26,7 +34,6 @@ public class ArtifactDeploymentService
|
||||
private readonly IDeploymentManagerRepository _deploymentRepo;
|
||||
private readonly ITemplateEngineRepository _templateRepo;
|
||||
private readonly IExternalSystemRepository _externalSystemRepo;
|
||||
private readonly INotificationRepository _notificationRepo;
|
||||
private readonly CommunicationService _communicationService;
|
||||
private readonly IAuditService _auditService;
|
||||
private readonly DeploymentManagerOptions _options;
|
||||
@@ -39,7 +46,12 @@ public class ArtifactDeploymentService
|
||||
/// <param name="deploymentRepo">Repository for deployment records.</param>
|
||||
/// <param name="templateRepo">Repository for templates.</param>
|
||||
/// <param name="externalSystemRepo">Repository for external systems.</param>
|
||||
/// <param name="notificationRepo">Repository for notifications.</param>
|
||||
/// <param name="notificationRepo">
|
||||
/// DeploymentManager-025: retained on the signature for DI/source compatibility but
|
||||
/// intentionally NOT consumed. Notification lists and SMTP configuration are
|
||||
/// central-only and are never shipped to sites, so the artifact path must not read
|
||||
/// the notification repository at all.
|
||||
/// </param>
|
||||
/// <param name="communicationService">Service for communicating with sites.</param>
|
||||
/// <param name="auditService">Service for audit logging.</param>
|
||||
/// <param name="options">Deployment manager options.</param>
|
||||
@@ -59,7 +71,9 @@ public class ArtifactDeploymentService
|
||||
_deploymentRepo = deploymentRepo;
|
||||
_templateRepo = templateRepo;
|
||||
_externalSystemRepo = externalSystemRepo;
|
||||
_notificationRepo = notificationRepo;
|
||||
// DeploymentManager-025: notificationRepo is deliberately not stored — notification
|
||||
// lists and SMTP configs are central-only and are never fetched for shipping to sites.
|
||||
_ = notificationRepo;
|
||||
_communicationService = communicationService;
|
||||
_auditService = auditService;
|
||||
_options = options.Value;
|
||||
@@ -98,15 +112,19 @@ public class ArtifactDeploymentService
|
||||
/// <summary>
|
||||
/// Builds a per-site <see cref="DeployArtifactsCommand"/> using a previously-fetched
|
||||
/// snapshot of the global artifact sets (shared scripts, external systems + methods,
|
||||
/// DB connections, notification lists, SMTP configurations). Only the per-site
|
||||
/// data-connection query runs here.
|
||||
/// DB connections). Only the per-site data-connection query runs here.
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// DeploymentManager-023: separating the global fetch from the per-site build lets
|
||||
/// <see cref="DeployToAllSitesAsync"/> issue the global queries exactly once across
|
||||
/// the whole multi-site sweep, eliminating the N+1 re-query of shared scripts,
|
||||
/// external systems, methods, DB connections, notification lists, and SMTP
|
||||
/// configurations.
|
||||
/// external systems, methods, and DB connections.
|
||||
///
|
||||
/// DeploymentManager-025: the command's <c>NotificationLists</c> and
|
||||
/// <c>SmtpConfigurations</c> fields are always sent <c>null</c> — notification
|
||||
/// delivery is central-only and no notification artifact or SMTP credential is
|
||||
/// ever distributed to a site. The fields remain on the contract only for
|
||||
/// additive compatibility.
|
||||
/// </remarks>
|
||||
private async Task<DeployArtifactsCommand> BuildDeployArtifactsCommandAsync(
|
||||
int siteId,
|
||||
@@ -125,30 +143,34 @@ public class ArtifactDeploymentService
|
||||
globals.SharedScripts,
|
||||
globals.ExternalSystems,
|
||||
globals.DatabaseConnections,
|
||||
globals.NotificationLists,
|
||||
// DeploymentManager-025: notification lists are central-only — never shipped to sites.
|
||||
NotificationLists: null,
|
||||
dataConnectionArtifacts,
|
||||
globals.SmtpConfigurations,
|
||||
// DeploymentManager-025: SMTP config (incl. credentials) is central-only — never shipped to sites.
|
||||
SmtpConfigurations: null,
|
||||
DateTimeOffset.UtcNow);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Fetches the system-wide artifact sets that are identical across every site —
|
||||
/// shared scripts, external systems (with their methods serialized in), database
|
||||
/// connections, notification lists, and SMTP configurations. Used by
|
||||
/// <see cref="DeployToAllSitesAsync"/> to pre-load once before the per-site loop.
|
||||
/// shared scripts, external systems (with their methods serialized in), and
|
||||
/// database connections. Used by <see cref="DeployToAllSitesAsync"/> to pre-load
|
||||
/// once before the per-site loop.
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// DeploymentManager-023: the per-site artifact build path previously re-issued
|
||||
/// every one of these queries per site (≈ 5·N + M·N round trips for N sites
|
||||
/// every one of these queries per site (≈ 3·N + M·N round trips for N sites
|
||||
/// and M external systems). Hoisting them here drops that to a single fetch.
|
||||
///
|
||||
/// DeploymentManager-025: notification lists and SMTP configurations are NOT
|
||||
/// fetched here. Notification delivery is central-only, so they are never
|
||||
/// shipped to sites — the artifact path must not even read them.
|
||||
/// </remarks>
|
||||
private async Task<GlobalArtifactSnapshot> FetchGlobalArtifactsAsync(CancellationToken cancellationToken)
|
||||
{
|
||||
var sharedScripts = await _templateRepo.GetAllSharedScriptsAsync(cancellationToken);
|
||||
var externalSystems = await _externalSystemRepo.GetAllExternalSystemsAsync(cancellationToken);
|
||||
var dbConnections = await _externalSystemRepo.GetAllDatabaseConnectionsAsync(cancellationToken);
|
||||
var notificationLists = await _notificationRepo.GetAllNotificationListsAsync(cancellationToken);
|
||||
var smtpConfigurations = await _notificationRepo.GetAllSmtpConfigurationsAsync(cancellationToken);
|
||||
|
||||
// Map shared scripts
|
||||
var scriptArtifacts = sharedScripts.Select(s =>
|
||||
@@ -177,35 +199,23 @@ public class ArtifactDeploymentService
|
||||
var dbConnectionArtifacts = dbConnections.Select(d =>
|
||||
new DatabaseConnectionArtifact(d.Name, d.ConnectionString, d.MaxRetries, d.RetryDelay)).ToList();
|
||||
|
||||
// Map notification lists
|
||||
var notificationListArtifacts = notificationLists.Select(nl =>
|
||||
new NotificationListArtifact(nl.Name, nl.Recipients.Where(r => r.EmailAddress is not null).Select(r => r.EmailAddress!).ToList())).ToList();
|
||||
|
||||
// Map SMTP configurations — use Host as the artifact name (matches SQLite PK on site)
|
||||
var smtpArtifacts = smtpConfigurations.Select(smtp =>
|
||||
new SmtpConfigurationArtifact(
|
||||
$"{smtp.Host}:{smtp.Port}", smtp.Host, smtp.Port, smtp.AuthType, smtp.FromAddress,
|
||||
smtp.Credentials, null, smtp.TlsMode)).ToList();
|
||||
|
||||
return new GlobalArtifactSnapshot(
|
||||
scriptArtifacts,
|
||||
externalSystemArtifacts,
|
||||
dbConnectionArtifacts,
|
||||
notificationListArtifacts,
|
||||
smtpArtifacts);
|
||||
dbConnectionArtifacts);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Bag of the global artifact sets that do not vary per site, captured once at
|
||||
/// the start of <see cref="DeployToAllSitesAsync"/> and reused for every per-site
|
||||
/// command build (DeploymentManager-023).
|
||||
/// command build (DeploymentManager-023). Notification lists and SMTP
|
||||
/// configurations are deliberately absent — they are central-only and never
|
||||
/// shipped to sites (DeploymentManager-025).
|
||||
/// </summary>
|
||||
private sealed record GlobalArtifactSnapshot(
|
||||
IReadOnlyList<SharedScriptArtifact> SharedScripts,
|
||||
IReadOnlyList<ExternalSystemArtifact> ExternalSystems,
|
||||
IReadOnlyList<DatabaseConnectionArtifact> DatabaseConnections,
|
||||
IReadOnlyList<NotificationListArtifact> NotificationLists,
|
||||
IReadOnlyList<SmtpConfigurationArtifact> SmtpConfigurations);
|
||||
IReadOnlyList<DatabaseConnectionArtifact> DatabaseConnections);
|
||||
|
||||
/// <summary>
|
||||
/// Deploys artifacts to all sites. Builds a per-site command with that site's data connections.
|
||||
@@ -226,9 +236,10 @@ public class ArtifactDeploymentService
|
||||
var perSiteResults = new Dictionary<string, SiteArtifactResult>();
|
||||
|
||||
// DeploymentManager-023: hoist the system-wide artifact queries (shared scripts,
|
||||
// external systems + methods, DB connections, notification lists, SMTP configs)
|
||||
// OUT of the per-site loop so they run ONCE instead of once per site. Only
|
||||
// data connections legitimately vary per site, so they stay inside the loop.
|
||||
// external systems + methods, DB connections) OUT of the per-site loop so they
|
||||
// run ONCE instead of once per site. Only data connections legitimately vary
|
||||
// per site, so they stay inside the loop. (Notification lists and SMTP config
|
||||
// are central-only and not fetched at all — DeploymentManager-025.)
|
||||
var globals = await FetchGlobalArtifactsAsync(cancellationToken);
|
||||
|
||||
// Build per-site commands sequentially (DbContext is not thread-safe).
|
||||
|
||||
@@ -332,20 +332,30 @@ public class DatabaseGateway : IDatabaseGateway
|
||||
}
|
||||
catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested)
|
||||
{
|
||||
// [2] The caller asked to abandon the work — propagate the cancellation
|
||||
// [1] The caller asked to abandon the work — propagate the cancellation
|
||||
// unchanged; it must never be reclassified as a transient DB error.
|
||||
throw;
|
||||
}
|
||||
catch (SqlException ex)
|
||||
{
|
||||
// Classify by SqlException.Number and rethrow as the strongly-typed
|
||||
// transient / permanent failure the callers branch on. The context
|
||||
// is the connection NAME, never the connection string.
|
||||
// [2] ExternalSystemGateway-025: a caller-token cancellation can surface
|
||||
// from the SQL driver as a SqlException (a mid-flight cancel), not an
|
||||
// OperationCanceledException, so the [1] filter above never sees it.
|
||||
// Re-check the caller's token at the TOP of this block so such a cancel
|
||||
// propagates as OperationCanceledException regardless of the driver's
|
||||
// exception shape — never reclassified as a permanent DB error (the
|
||||
// "-008 cancel-not-reclassified" contract). Version-independent: no need
|
||||
// to match a specific SqlException number.
|
||||
cancellationToken.ThrowIfCancellationRequested();
|
||||
|
||||
// Otherwise classify by SqlException.Number and rethrow as the
|
||||
// strongly-typed transient / permanent failure the callers branch on.
|
||||
// The context is the connection NAME, never the connection string.
|
||||
throw SqlErrorClassifier.Throw(connectionName, ex);
|
||||
}
|
||||
catch (Exception ex) when (SqlErrorClassifier.IsTransient(ex))
|
||||
{
|
||||
// [1] A live outage that did not surface as a SqlException — treat as
|
||||
// [3] A live outage that did not surface as a SqlException — treat as
|
||||
// transient so the caller buffers + retries. The message uses the
|
||||
// connection NAME, never the connection string (credential safety).
|
||||
throw new TransientDatabaseException(
|
||||
|
||||
@@ -53,6 +53,7 @@
|
||||
<ProjectReference Include="../ZB.MOM.WW.ScadaBridge.Transport/ZB.MOM.WW.ScadaBridge.Transport.csproj" />
|
||||
<ProjectReference Include="../ZB.MOM.WW.ScadaBridge.AuditLog/ZB.MOM.WW.ScadaBridge.AuditLog.csproj" />
|
||||
<ProjectReference Include="../ZB.MOM.WW.ScadaBridge.SiteCallAudit/ZB.MOM.WW.ScadaBridge.SiteCallAudit.csproj" />
|
||||
<ProjectReference Include="../ZB.MOM.WW.ScadaBridge.KpiHistory/ZB.MOM.WW.ScadaBridge.KpiHistory.csproj" />
|
||||
<ProjectReference Include="../ZB.MOM.WW.ScadaBridge.CentralUI/ZB.MOM.WW.ScadaBridge.CentralUI.csproj" />
|
||||
<ProjectReference Include="../ZB.MOM.WW.ScadaBridge.Security/ZB.MOM.WW.ScadaBridge.Security.csproj" />
|
||||
<ProjectReference Include="../ZB.MOM.WW.ScadaBridge.HealthMonitoring/ZB.MOM.WW.ScadaBridge.HealthMonitoring.csproj" />
|
||||
|
||||
@@ -68,6 +68,19 @@ public class KpiHistoryRecorderActor : ReceiveActor, IWithTimers
|
||||
/// </summary>
|
||||
private CancellationTokenSource? _shutdownCts;
|
||||
|
||||
/// <summary>
|
||||
/// In-flight guard for the sample loop. Set true at the start of a sample pass and cleared
|
||||
/// when the pass's <see cref="SampleComplete"/> arrives. While true, further
|
||||
/// <see cref="SampleTick"/>s are skipped so passes never overlap — Akka periodic timers
|
||||
/// enqueue (not coalesce) missed ticks, so without this guard a pass running longer than
|
||||
/// <see cref="KpiHistoryOptions.SampleInterval"/> (slow/recovering DB) would let each
|
||||
/// subsequent tick spawn another concurrent pass, amplifying load on the struggling store
|
||||
/// and double-writing samples for overlapping windows. Mirrors the
|
||||
/// <see cref="ZB.MOM.WW.ScadaBridge.NotificationOutbox.NotificationOutboxActor"/> dispatch
|
||||
/// in-flight guard the timer pattern is modelled on.
|
||||
/// </summary>
|
||||
private bool _sampleInFlight;
|
||||
|
||||
/// <summary>Akka timer scheduler, assigned by the actor system via <see cref="IWithTimers"/>.</summary>
|
||||
public ITimerScheduler Timers { get; set; } = null!;
|
||||
|
||||
@@ -87,7 +100,7 @@ public class KpiHistoryRecorderActor : ReceiveActor, IWithTimers
|
||||
_logger = logger ?? throw new ArgumentNullException(nameof(logger));
|
||||
|
||||
Receive<SampleTick>(_ => HandleSampleTick());
|
||||
Receive<SampleComplete>(_ => { }); // best-effort: no actor state to reset on completion
|
||||
Receive<SampleComplete>(_ => _sampleInFlight = false); // lower the in-flight guard (success or fault)
|
||||
Receive<PurgeTick>(_ => HandlePurgeTick());
|
||||
Receive<PurgeComplete>(_ => { }); // best-effort: no actor state to reset on completion
|
||||
}
|
||||
@@ -135,19 +148,31 @@ public class KpiHistoryRecorderActor : ReceiveActor, IWithTimers
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Handles a sample tick: captures the shared <c>capturedAtUtc</c> instant on the actor
|
||||
/// thread, then launches the asynchronous sampling pass off-thread and pipes a
|
||||
/// completion back to <see cref="Self"/> so the mailbox is never blocked while sources
|
||||
/// are collected and the batch is written.
|
||||
/// Handles a sample tick. If a sample pass is already in flight the tick is skipped
|
||||
/// (logged at debug) so passes never overlap; otherwise the in-flight guard is raised,
|
||||
/// the shared <c>capturedAtUtc</c> instant is captured on the actor thread, and the
|
||||
/// asynchronous sampling pass is launched off-thread with a <see cref="SampleComplete"/>
|
||||
/// piped back to <see cref="Self"/> to lower the guard on the actor thread — so the
|
||||
/// mailbox is never blocked while sources are collected and the batch is written.
|
||||
/// </summary>
|
||||
private void HandleSampleTick()
|
||||
{
|
||||
if (_sampleInFlight)
|
||||
{
|
||||
// A prior pass is still awaiting its DB round-trip; coalesce this tick rather than
|
||||
// piling a second concurrent pass onto a slow/recovering store.
|
||||
_logger.LogDebug("KPI sample tick skipped — a sample pass is already in flight.");
|
||||
return;
|
||||
}
|
||||
|
||||
_sampleInFlight = true;
|
||||
var capturedAt = DateTime.UtcNow;
|
||||
var cancellationToken = _shutdownCts?.Token ?? CancellationToken.None;
|
||||
|
||||
// RunSamplePass self-isolates its faults (it never throws), but the failure
|
||||
// projection is kept as a belt-and-braces guard so even a faulted task still
|
||||
// produces a SampleComplete.
|
||||
// produces a SampleComplete that lowers the in-flight guard — otherwise the loop
|
||||
// would wedge permanently.
|
||||
RunSamplePass(capturedAt, cancellationToken).PipeTo(
|
||||
Self,
|
||||
success: () => SampleComplete.Instance,
|
||||
@@ -282,7 +307,10 @@ public class KpiHistoryRecorderActor : ReceiveActor, IWithTimers
|
||||
private SampleTick() { }
|
||||
}
|
||||
|
||||
/// <summary>Piped-back completion of a sampling pass; lets the pass run off the actor thread.</summary>
|
||||
/// <summary>
|
||||
/// Piped-back completion of a sampling pass; lets the pass run off the actor thread and
|
||||
/// lowers the <c>_sampleInFlight</c> guard on the actor thread (fires on success and fault).
|
||||
/// </summary>
|
||||
internal sealed class SampleComplete
|
||||
{
|
||||
public static readonly SampleComplete Instance = new();
|
||||
|
||||
@@ -1887,8 +1887,26 @@ public class ManagementActor : ReceiveActor
|
||||
return await repo.GetAllMappingsAsync();
|
||||
}
|
||||
|
||||
// Security-023 (membership half): an LDAP-group mapping's Role is a free string on the
|
||||
// wire (CLI/API), so reject anything outside the canonical Roles.All set at the single
|
||||
// server-side write path. A non-canonical role never functioned (no policy or authz
|
||||
// check matches it), so rejecting it removes a silent-misconfiguration footgun rather
|
||||
// than changing behaviour. Membership is checked case-insensitively to match the rest
|
||||
// of the actor's role comparisons; the existing case-sensitivity asymmetry between the
|
||||
// UI RequireClaim policies and the ManagementActor check is a separately-deferred change
|
||||
// and is deliberately NOT altered here — the stored value's verbatim casing is preserved.
|
||||
/// <summary>
/// Rejects an LDAP-group mapping role that is not a member of <c>Roles.All</c>.
/// The membership test is ordinal and case-insensitive; the supplied string is
/// not modified.
/// </summary>
/// <param name="role">Role string carried by the create/update role-mapping command.</param>
/// <exception cref="ManagementCommandException">
/// Thrown when <paramref name="role"/> does not match any entry in <c>Roles.All</c>.
/// </exception>
private static void ValidateMappingRole(string role)
{
    var isKnownRole = Roles.All.Contains(role, StringComparer.OrdinalIgnoreCase);
    if (isKnownRole)
    {
        return;
    }

    // Echo the offending value and list the accepted set so the caller can correct it.
    throw new ManagementCommandException(
        $"Role '{role}' is not a recognized role. Valid roles are: {string.Join(", ", Roles.All)}.");
}
|
||||
|
||||
private static async Task<object?> HandleCreateRoleMapping(IServiceProvider sp, CreateRoleMappingCommand cmd, string user)
|
||||
{
|
||||
ValidateMappingRole(cmd.Role);
|
||||
var repo = sp.GetRequiredService<ISecurityRepository>();
|
||||
var mapping = new LdapGroupMapping(cmd.LdapGroupName, cmd.Role);
|
||||
await repo.AddMappingAsync(mapping);
|
||||
@@ -1899,6 +1917,7 @@ public class ManagementActor : ReceiveActor
|
||||
|
||||
private static async Task<object?> HandleUpdateRoleMapping(IServiceProvider sp, UpdateRoleMappingCommand cmd, string user)
|
||||
{
|
||||
ValidateMappingRole(cmd.Role);
|
||||
var repo = sp.GetRequiredService<ISecurityRepository>();
|
||||
var mapping = await repo.GetMappingByIdAsync(cmd.MappingId)
|
||||
?? throw new ManagementCommandException($"RoleMapping with ID {cmd.MappingId} not found.");
|
||||
|
||||
@@ -103,6 +103,19 @@ public static class ScriptTrustPolicy
|
||||
/// scripting options and the semantic-analysis compilation built for trust
|
||||
/// validation, so the validator resolves symbols against exactly the same
|
||||
/// metadata the script is compiled against.
|
||||
///
|
||||
/// <para>
|
||||
/// This is the <b>minimal, runtime-fidelity</b> set. It deliberately does
|
||||
/// NOT reference the assemblies that host the forbidden APIs (e.g.
|
||||
/// <c>System.Diagnostics.Process.dll</c>, <c>System.Net.Sockets.dll</c>) — a
|
||||
/// forbidden type must remain an <i>undefined symbol</i> at compile time so
|
||||
/// the <see cref="RoslynScriptCompiler"/> gate independently rejects it. Do
|
||||
/// not widen this set with forbidden-API anchor assemblies; that second layer
|
||||
/// of defence (a forbidden type fails to bind at execution-time compilation)
|
||||
/// depends on it staying minimal. The trust validator's symbol-resolution
|
||||
/// needs are met separately by <see cref="AnalysisReferences"/> (which adds
|
||||
/// <see cref="ForbiddenAnchorAssemblies"/> on the minimal-fallback path).
|
||||
/// </para>
|
||||
/// </summary>
|
||||
public static readonly IReadOnlyList<Assembly> DefaultAssemblies =
|
||||
[
|
||||
@@ -113,6 +126,42 @@ public static class ScriptTrustPolicy
|
||||
typeof(DynamicJsonElement).Assembly,
|
||||
];
|
||||
|
||||
/// <summary>
|
||||
/// Anchor assemblies that <b>host the forbidden-API types named in
|
||||
/// <see cref="ForbiddenScopes"/></b>. Used ONLY to enrich the <i>trust
|
||||
/// validator's</i> semantic reference set (<see cref="AnalysisReferences"/>)
|
||||
/// when the <c>TRUSTED_PLATFORM_ASSEMBLIES</c> list is unavailable — a
|
||||
/// single-file, AOT, or trimmed host. Without these, the minimal fallback set
|
||||
/// cannot resolve a <i>bare</i> forbidden type that lives inside an
|
||||
/// <i>allowed</i> namespace (the documented case being <c>Process</c> via
|
||||
/// <c>using System.Diagnostics;</c>): the symbol resolves to nothing, Pass 1's
|
||||
/// syntactic fallback ignores dotless identifiers, and Pass 2 never flags a
|
||||
/// bare identifier — so the forbidden reference slips the validator entirely
|
||||
/// (ScriptAnalysis-001). Anchoring these assemblies in the fallback keeps the
|
||||
/// semantic pass authoritative even in the degraded mode.
|
||||
///
|
||||
/// <para>
|
||||
/// These are <b>never</b> added to <see cref="DefaultReferences"/> — see the
|
||||
/// remark on <see cref="DefaultAssemblies"/>. Most forbidden anchor types
|
||||
/// (<c>System.IO.File</c>, <c>System.Threading.Thread</c>,
|
||||
/// <c>System.Reflection.Assembly</c>,
|
||||
/// <c>System.Runtime.InteropServices.Marshal</c>) already live in the same
|
||||
/// assembly as <c>typeof(object)</c> (<c>System.Private.CoreLib</c>), so the
|
||||
/// minimal set already resolves them; the ones that ship in their own
|
||||
/// assemblies — <c>System.Diagnostics.Process</c> and
|
||||
/// <c>System.Net.Sockets.Socket</c> — are listed here explicitly.
|
||||
/// </para>
|
||||
/// </summary>
|
||||
public static readonly IReadOnlyList<Assembly> ForbiddenAnchorAssemblies =
|
||||
[
|
||||
typeof(System.Diagnostics.Process).Assembly,
|
||||
typeof(System.IO.File).Assembly,
|
||||
typeof(System.Threading.Thread).Assembly,
|
||||
typeof(System.Reflection.Assembly).Assembly,
|
||||
typeof(System.Net.Sockets.Socket).Assembly,
|
||||
typeof(System.Runtime.InteropServices.Marshal).Assembly,
|
||||
];
|
||||
|
||||
/// <summary>
|
||||
/// Metadata references for the trust-validation semantic compilation and
|
||||
/// the design-time script compilation.
|
||||
@@ -143,6 +192,20 @@ public static class ScriptTrustPolicy
|
||||
/// </summary>
|
||||
public static readonly IReadOnlyList<MetadataReference> AnalysisReferences = BuildAnalysisReferences();
|
||||
|
||||
/// <summary>
|
||||
/// True when <see cref="AnalysisReferences"/> was built WITHOUT the
|
||||
/// trusted-platform-assemblies (TPA) list — i.e. the semantic pass is running
|
||||
/// against the minimal fallback set (a single-file/AOT/trimmed host) rather
|
||||
/// than the full framework. In this degraded mode the validator still
|
||||
/// resolves the documented forbidden anchors (because
|
||||
/// <see cref="ForbiddenAnchorAssemblies"/> is folded into the fallback), but
|
||||
/// types outside that anchor set may resolve as unknown and rely on the
|
||||
/// syntactic pass / downstream compile gate. A warning is also emitted via
|
||||
/// <see cref="System.Diagnostics.Trace"/> at type initialisation. Consumers
|
||||
/// (and tests) can read this flag to detect / surface the weakened mode.
|
||||
/// </summary>
|
||||
public static bool AnalysisReferencesDegraded { get; private set; }
|
||||
|
||||
private static IReadOnlyList<MetadataReference> BuildAnalysisReferences()
|
||||
{
|
||||
var byPath = new Dictionary<string, MetadataReference>(StringComparer.OrdinalIgnoreCase);
|
||||
@@ -166,23 +229,77 @@ public static class ScriptTrustPolicy
|
||||
}
|
||||
}
|
||||
|
||||
// The TPA list was unavailable (single-file / AOT / trimmed host) — we
|
||||
// are about to fall back to the minimal set, which weakens the semantic
|
||||
// pass. Make the degradation LOUD (not silent): record the flag and emit
|
||||
// a warning so operators/tests can detect the mode (ScriptAnalysis-001).
|
||||
var tpaAvailable = byPath.Count > 0;
|
||||
if (!tpaAvailable)
|
||||
{
|
||||
AnalysisReferencesDegraded = true;
|
||||
System.Diagnostics.Trace.TraceWarning(
|
||||
"ScriptTrustPolicy: TRUSTED_PLATFORM_ASSEMBLIES unavailable; the script-trust " +
|
||||
"semantic pass is running against the MINIMAL fallback reference set (plus the " +
|
||||
"forbidden-API anchor assemblies). Symbol resolution is reduced — forbidden anchors " +
|
||||
"(Process, Socket, File, Thread, Assembly, Marshal) remain caught, but other types " +
|
||||
"may resolve as unknown. This typically indicates a single-file/AOT/trimmed host.");
|
||||
}
|
||||
|
||||
// Ensure app assemblies the script API surface needs are present even if
|
||||
// not in the TPA list (e.g. Commons / DynamicJsonElement).
|
||||
foreach (var asm in DefaultAssemblies)
|
||||
{
|
||||
var loc = asm.Location;
|
||||
if (loc.Length == 0 || byPath.ContainsKey(loc) || !File.Exists(loc))
|
||||
continue;
|
||||
TryAddAssembly(byPath, asm);
|
||||
|
||||
try { byPath[loc] = MetadataReference.CreateFromFile(loc); }
|
||||
catch { /* ignore */ }
|
||||
// On the minimal-fallback path, also fold in the assemblies that HOST the
|
||||
// forbidden-API types so a bare forbidden type inside an allowed namespace
|
||||
// (the documented `Process` via `using System.Diagnostics;` case) still
|
||||
// resolves and is flagged authoritatively by the semantic pass. When the
|
||||
// TPA list is present these are already covered, so this only matters in
|
||||
// the degraded mode (ScriptAnalysis-001). NOTE: these anchors are added
|
||||
// ONLY here, never to DefaultReferences — the compile gate must keep
|
||||
// rejecting forbidden types as undefined symbols.
|
||||
if (!tpaAvailable)
|
||||
{
|
||||
foreach (var asm in ForbiddenAnchorAssemblies)
|
||||
TryAddAssembly(byPath, asm);
|
||||
}
|
||||
|
||||
// Fallback to the minimal set if the TPA list was unavailable (e.g. a
|
||||
// single-file/AOT host) so validation still functions.
|
||||
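// NOTE(review): this CAN be empty — Assembly.Location is "" for assemblies bundled into a
|
||||
// single-file host, so TryAddAssembly skips every entry. Guard against that (and any other pathological host) by falling back to the minimal references; that fallback carries no forbidden-API anchors — confirm this path is covered.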
|
||||
return byPath.Count > 0 ? byPath.Values.ToList() : DefaultReferences;
|
||||
}
|
||||
|
||||
/// <summary>
/// Adds a file-backed metadata reference for <paramref name="asm"/> to
/// <paramref name="byPath"/>, keyed by the assembly's on-disk location.
/// Does nothing when the assembly reports no location, the location is
/// already present in the map, or the file does not exist.
/// </summary>
/// <param name="byPath">Path-keyed reference map that receives the new entry.</param>
/// <param name="asm">Assembly whose location is turned into a metadata reference.</param>
private static void TryAddAssembly(
    Dictionary<string, MetadataReference> byPath, Assembly asm)
{
    var path = asm.Location;

    var canAdd = path.Length != 0
        && !byPath.ContainsKey(path)
        && File.Exists(path);

    if (canAdd)
    {
        try
        {
            byPath[path] = MetadataReference.CreateFromFile(path);
        }
        catch
        {
            /* skip an unreadable assembly rather than fail validation */
        }
    }
}
|
||||
|
||||
/// <summary>
/// Builds the reference set for the minimal TPA-fallback path:
/// <see cref="DefaultAssemblies"/> followed by
/// <see cref="ForbiddenAnchorAssemblies"/>, de-duplicated by file path
/// (case-insensitive). It mirrors what <see cref="BuildAnalysisReferences"/>
/// assembles when <c>TRUSTED_PLATFORM_ASSEMBLIES</c> is unavailable, and is
/// exposed so that degraded mode can be exercised directly (e.g. by the
/// adversarial SA-001 tests) without needing a host that really lacks a TPA list.
/// An assembly with no on-disk location contributes nothing, so the result
/// can be shorter than the two lists combined.
/// </summary>
/// <returns>The metadata references, one per distinct assembly file.</returns>
public static IReadOnlyList<MetadataReference> BuildMinimalFallbackReferences()
{
    var referencesByPath = new Dictionary<string, MetadataReference>(StringComparer.OrdinalIgnoreCase);

    // Defaults first, anchors second — the same order as two back-to-back loops.
    foreach (var assembly in DefaultAssemblies.Concat(ForbiddenAnchorAssemblies))
    {
        TryAddAssembly(referencesByPath, assembly);
    }

    return referencesByPath.Values.ToList();
}
|
||||
|
||||
/// <summary>
|
||||
/// Default namespace imports made available to compiled scripts.
|
||||
/// </summary>
|
||||
|
||||
@@ -55,17 +55,42 @@ public static class ScriptTrustValidator
|
||||
/// </summary>
|
||||
/// <param name="code">The C# script source to analyse.</param>
|
||||
/// <param name="extraReferences">
|
||||
/// Optional additional metadata references to add to
|
||||
/// <see cref="ScriptTrustPolicy.DefaultReferences"/> for semantic
|
||||
/// resolution — e.g. the compile-surface globals assembly so a script
|
||||
/// referencing the API surface resolves cleanly. Forbidden references are
|
||||
/// NOT added here (a script can't reach a forbidden API just because the
|
||||
/// assembly is referenced; the deny-list still applies).
|
||||
/// Optional additional metadata references to <b>widen symbol resolution</b>
|
||||
/// for the semantic pass — e.g. the compile-surface globals assembly so a
|
||||
/// script referencing the API surface resolves cleanly. Extra references can
|
||||
/// ONLY improve resolution; they can NEVER whitelist a forbidden API. The
|
||||
/// verdict is by resolved namespace/type against
|
||||
/// <see cref="ScriptTrustPolicy.ForbiddenScopes"/>, so passing more
|
||||
/// references — even the forbidden-API assemblies themselves — can only make
|
||||
/// the verdict more accurate (more types resolve to their true namespace and
|
||||
/// are judged), never produce a false allow. Callers therefore need not (and
|
||||
/// cannot, for safety purposes) curate this set; the Central UI run gate
|
||||
/// forwards its full compilation reference surface here precisely because
|
||||
/// doing so is safe.
|
||||
/// </param>
|
||||
/// <returns>A list of trust-model violation messages; empty if the script is clean.</returns>
|
||||
public static IReadOnlyList<string> FindViolations(
    string code,
    IEnumerable<MetadataReference>? extraReferences = null)
{
    // Delegate to the explicit-base overload, using the policy's analysis set as the base.
    // NOTE(review): a positional two-argument call that passes a List<MetadataReference>,
    // array or IReadOnlyList<MetadataReference> binds to the
    // (code, baseReferences, extraReferences) overload instead of this one, because
    // IReadOnlyList<T> is a better conversion target than IEnumerable<T>. Such a call would
    // replace the base set rather than widen it — confirm existing callers pass
    // extraReferences by name or as IEnumerable<MetadataReference>.
    return FindViolations(code, ScriptTrustPolicy.AnalysisReferences, extraReferences);
}
|
||||
|
||||
/// <summary>
|
||||
/// Overload that runs the trust analysis against an explicit base reference
|
||||
/// set instead of <see cref="ScriptTrustPolicy.AnalysisReferences"/>. The
|
||||
/// public entry point above always uses the full analysis set; this overload
|
||||
/// exists so tests can pin behaviour against the minimal TPA-fallback set
|
||||
/// (<see cref="ScriptTrustPolicy.BuildMinimalFallbackReferences"/>) and prove
|
||||
/// the degraded mode still catches the documented forbidden anchors
|
||||
/// (ScriptAnalysis-001). The two passes are otherwise identical.
|
||||
/// </summary>
|
||||
/// <param name="code">The C# script source to analyse.</param>
|
||||
/// <param name="baseReferences">The base reference set Pass 1 resolves against.</param>
|
||||
/// <param name="extraReferences">Optional additional references that only widen resolution.</param>
|
||||
/// <returns>A list of trust-model violation messages; empty if the script is clean.</returns>
|
||||
public static IReadOnlyList<string> FindViolations(
|
||||
string code,
|
||||
IReadOnlyList<MetadataReference> baseReferences,
|
||||
IEnumerable<MetadataReference>? extraReferences = null)
|
||||
{
|
||||
if (string.IsNullOrWhiteSpace(code))
|
||||
return Array.Empty<string>();
|
||||
@@ -84,7 +109,7 @@ public static class ScriptTrustValidator
|
||||
// resolves and is judged by its true namespace — closing the
|
||||
// forbidden-type-in-allowed-namespace blind spot (e.g. a bare
|
||||
// System.Diagnostics.Process via `using System.Diagnostics;`).
|
||||
var references = ScriptTrustPolicy.AnalysisReferences.ToList();
|
||||
var references = baseReferences.ToList();
|
||||
if (extraReferences != null)
|
||||
references.AddRange(extraReferences);
|
||||
|
||||
|
||||
@@ -32,10 +32,17 @@ namespace ZB.MOM.WW.ScadaBridge.SiteCallAudit;
|
||||
/// the daily terminal-row purge scheduler (Piece B —
|
||||
/// <see cref="OnPurgeTickAsync"/>, which invokes
|
||||
/// <see cref="ISiteCallAuditRepository.PurgeTerminalAsync"/> on a timer). Both
|
||||
/// background timers are started in <see cref="PreStart"/> and gate on the
|
||||
/// reconciliation collaborators (<see cref="IPullSiteCallsClient"/> +
|
||||
/// <see cref="ISiteEnumerator"/>) being available — the repo-only test ctor
|
||||
/// injects neither, so neither timer runs there.
|
||||
/// background timers are started in <see cref="PreStart"/>, but on independent
|
||||
/// preconditions (SiteCallAudit-007). The purge timer is armed whenever
|
||||
/// background timers are enabled (it needs only the repository, which every
|
||||
/// production / reconciliation ctor always has) — it is NOT gated on the
|
||||
/// reconciliation collaborators, so a host that registers Site Call Audit
|
||||
/// without the reconciliation client still purges and the central
|
||||
/// <c>SiteCalls</c> table cannot grow unbounded. The reconciliation timer
|
||||
/// additionally requires its collaborators (<see cref="IPullSiteCallsClient"/> +
|
||||
/// <see cref="ISiteEnumerator"/>) and logs a Warning when it cannot arm. The
|
||||
/// repo-only MSSQL test ctor disables both timers (background timers off) so the
|
||||
/// read/upsert tests see no scheduled side effects.
|
||||
/// </para>
|
||||
/// <para>
|
||||
/// Per CLAUDE.md "audit-write failure NEVER aborts the user-facing action" —
|
||||
@@ -68,6 +75,16 @@ public class SiteCallAuditActor : ReceiveActor
|
||||
/// <summary>Maximum page size honoured by a <see cref="SiteCallQueryRequest"/>.</summary>
|
||||
private const int MaxPageSize = 200;
|
||||
|
||||
/// <summary>
|
||||
/// SiteCallAudit-009: hard ceiling on the number of <c>PullSiteCalls</c> RPCs
|
||||
/// issued for a single site within ONE reconciliation tick when the site keeps
|
||||
/// reporting <see cref="PullSiteCallsResponse.MoreAvailable"/>. Bounds the
|
||||
/// within-tick continuation drain so a misbehaving site (or a pathological
|
||||
/// single-timestamp saturation that pins the cursor) can never spin the
|
||||
/// dispatcher unbounded; the remaining backlog drains on the next tick.
|
||||
/// </summary>
|
||||
private const int MaxReconciliationPagesPerTick = 50;
|
||||
|
||||
private readonly IServiceProvider? _serviceProvider;
|
||||
private readonly ISiteCallAuditRepository? _injectedRepository;
|
||||
private readonly SiteCallAuditOptions _options;
|
||||
@@ -81,12 +98,25 @@ public class SiteCallAuditActor : ReceiveActor
|
||||
/// singletons registered by <c>AddAuditLogCentralReconciliationClient</c>);
|
||||
/// in the test path they are injected directly. They are <c>null</c> when
|
||||
/// the actor was built via the repo-only test ctor — in that case the
|
||||
/// reconciliation tick is NOT started (see <see cref="StartReconciliationTimer"/>);
|
||||
/// the purge tick gates on the same collaborators (see <see cref="StartPurgeTimer"/>).
|
||||
/// reconciliation tick is NOT started (see <see cref="StartReconciliationTimer"/>).
|
||||
/// The purge tick, by contrast, does NOT depend on these collaborators
|
||||
/// (SiteCallAudit-007): it needs only the repository, so it is armed by
|
||||
/// <see cref="_backgroundTimersEnabled"/> alone (see <see cref="StartPurgeTimer"/>).
|
||||
/// </summary>
|
||||
private readonly IPullSiteCallsClient? _pullClient;
|
||||
private readonly ISiteEnumerator? _siteEnumerator;
|
||||
|
||||
/// <summary>
|
||||
/// Master switch for the two background schedulers (reconciliation + purge),
|
||||
/// set <c>true</c> by the production and reconciliation ctors and <c>false</c>
|
||||
/// by the repo-only MSSQL test ctor. SiteCallAudit-007: the purge timer is
|
||||
/// gated on THIS flag rather than on the reconciliation collaborators, so a
|
||||
/// host that omits the reconciliation client still purges (no unbounded
|
||||
/// central <c>SiteCalls</c> growth) while the MSSQL read/upsert tests stay
|
||||
/// free of any scheduled side effects.
|
||||
/// </summary>
|
||||
private readonly bool _backgroundTimersEnabled;
|
||||
|
||||
/// <summary>
|
||||
/// Per-site reconciliation watermark — the highest
|
||||
/// <see cref="SiteCall.UpdatedAtUtc"/> seen for that site on a previous
|
||||
@@ -123,9 +153,11 @@ public class SiteCallAuditActor : ReceiveActor
|
||||
/// An optional <paramref name="options"/> lets a test pin the stuck/KPI
|
||||
/// windows; when omitted the production defaults apply.
|
||||
/// <para>
|
||||
/// This ctor injects NO reconciliation client/enumerator, so the
|
||||
/// reconciliation tick is gated off (see <see cref="StartReconciliationTimer"/>)
|
||||
/// — the MSSQL-backed read/upsert tests must not fire phantom pulls.
|
||||
/// This ctor disables BOTH background timers (sets
|
||||
/// <see cref="_backgroundTimersEnabled"/> to <c>false</c>) and injects no
|
||||
/// reconciliation client/enumerator, so neither the reconciliation tick nor
|
||||
/// the purge tick fires — the MSSQL-backed read/upsert tests must see no
|
||||
/// scheduled side effects (no phantom pulls, no background purge).
|
||||
/// </para>
|
||||
/// </summary>
|
||||
/// <param name="repository">Concrete repository instance to use for all messages.</param>
|
||||
@@ -143,6 +175,10 @@ public class SiteCallAuditActor : ReceiveActor
|
||||
_logger = logger;
|
||||
_options = options ?? new SiteCallAuditOptions();
|
||||
|
||||
// Repo-only MSSQL test ctor: keep BOTH background timers off so the
|
||||
// read/upsert tests see no scheduled side effects (SiteCallAudit-007).
|
||||
_backgroundTimersEnabled = false;
|
||||
|
||||
RegisterHandlers();
|
||||
}
|
||||
|
||||
@@ -150,10 +186,10 @@ public class SiteCallAuditActor : ReceiveActor
|
||||
/// Test-mode constructor for the reconciliation tick (Piece A) — injects a
|
||||
/// concrete repository PLUS the two reconciliation collaborators directly,
|
||||
/// so the per-site self-heal pull is unit-testable in-memory without a DI
|
||||
/// container or a live gRPC channel. Because the client + enumerator are
|
||||
/// present, the reconciliation tick IS started; the purge tick is also
|
||||
/// started (both gate on the collaborators being available — see
|
||||
/// <see cref="StartReconciliationTimer"/> / <see cref="StartPurgeTimer"/>).
|
||||
/// container or a live gRPC channel. Background timers are enabled, so the
|
||||
/// purge tick starts (it needs only the repository) and the reconciliation
|
||||
/// tick starts too because the client + enumerator are present — see
|
||||
/// <see cref="StartReconciliationTimer"/> / <see cref="StartPurgeTimer"/>.
|
||||
/// </summary>
|
||||
/// <param name="repository">Concrete repository instance used for upserts and purges.</param>
|
||||
/// <param name="siteEnumerator">Enumerates the sites to reconcile each tick.</param>
|
||||
@@ -186,6 +222,11 @@ public class SiteCallAuditActor : ReceiveActor
|
||||
_logger = logger;
|
||||
_options = options ?? new SiteCallAuditOptions();
|
||||
|
||||
// Reconciliation test ctor: collaborators present, so both timers arm
|
||||
// (the reconciliation tick uses the collaborators; the purge tick needs
|
||||
// only the repository).
|
||||
_backgroundTimersEnabled = true;
|
||||
|
||||
RegisterHandlers();
|
||||
}
|
||||
|
||||
@@ -223,6 +264,12 @@ public class SiteCallAuditActor : ReceiveActor
|
||||
_pullClient = serviceProvider.GetService<IPullSiteCallsClient>();
|
||||
_siteEnumerator = serviceProvider.GetService<ISiteEnumerator>();
|
||||
|
||||
// Production path: background timers run. The purge tick is armed
|
||||
// unconditionally here (it needs only the repository); the reconciliation
|
||||
// tick additionally requires its collaborators and logs a Warning if
|
||||
// they were not registered (SiteCallAudit-007).
|
||||
_backgroundTimersEnabled = true;
|
||||
|
||||
RegisterHandlers();
|
||||
}
|
||||
|
||||
@@ -275,16 +322,30 @@ public class SiteCallAuditActor : ReceiveActor
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Starts the periodic reconciliation tick — but ONLY when both the pull
|
||||
/// client and the site enumerator are available. The repo-only test ctor
|
||||
/// injects neither, so the tick is gated off there (the MSSQL read/upsert
|
||||
/// tests must not fire phantom pulls); the reconciliation test ctor and the
|
||||
/// production ctor (which resolves both from the SP) start it.
|
||||
/// Starts the periodic reconciliation tick — but ONLY when background timers
|
||||
/// are enabled AND both the pull client and the site enumerator are
|
||||
/// available. The repo-only test ctor disables background timers, so the tick
|
||||
/// is gated off there (the MSSQL read/upsert tests must not fire phantom
|
||||
/// pulls); the reconciliation test ctor and the production ctor (which
|
||||
/// resolves both from the SP) start it. SiteCallAudit-007: when background
|
||||
/// timers are enabled but the collaborators were not registered, log a
|
||||
/// Warning so a misconfigured host surfaces the missing self-heal rather than
|
||||
/// silently skipping it.
|
||||
/// </summary>
|
||||
private void StartReconciliationTimer()
|
||||
{
|
||||
if (!_backgroundTimersEnabled)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
if (_pullClient is null || _siteEnumerator is null)
|
||||
{
|
||||
_logger.LogWarning(
|
||||
"SiteCallAudit reconciliation timer not started — the reconciliation "
|
||||
+ "collaborators (IPullSiteCallsClient / ISiteEnumerator) were not registered; "
|
||||
+ "lost cached-call telemetry will not self-heal until they are wired up. "
|
||||
+ "The daily terminal-row purge still runs.");
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -298,15 +359,20 @@ public class SiteCallAuditActor : ReceiveActor
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Starts the daily purge tick — gated on the same collaborator presence as
|
||||
/// the reconciliation tick. The purge itself only needs the repository, but
|
||||
/// gating both schedulers together keeps the repo-only test ctor (no
|
||||
/// client/enumerator) free of BOTH background timers, so the MSSQL read/
|
||||
/// upsert tests see no scheduled side effects.
|
||||
/// Starts the daily purge tick. SiteCallAudit-007: the purge needs ONLY the
|
||||
/// repository — never the reconciliation collaborators — so it is gated on
|
||||
/// <see cref="_backgroundTimersEnabled"/> alone, NOT on
|
||||
/// <see cref="_pullClient"/> / <see cref="_siteEnumerator"/>. This decouples
|
||||
/// the daily terminal-row purge from the reconciliation client: a host that
|
||||
/// registers Site Call Audit without the reconciliation client still purges,
|
||||
/// so the central <c>SiteCalls</c> table can never grow unbounded just
|
||||
/// because the self-heal puller is absent. Only the repo-only MSSQL test ctor
|
||||
/// (background timers off) skips it, keeping the read/upsert tests free of
|
||||
/// scheduled side effects.
|
||||
/// </summary>
|
||||
private void StartPurgeTimer()
|
||||
{
|
||||
if (_pullClient is null || _siteEnumerator is null)
|
||||
if (!_backgroundTimersEnabled)
|
||||
{
|
||||
return;
|
||||
}
|
||||
@@ -501,37 +567,103 @@ public class SiteCallAuditActor : ReceiveActor
|
||||
/// deduplicated by the idempotent monotonic upsert — the same inclusive-boundary
|
||||
/// contract as <c>SiteAuditReconciliationActor</c>'s cursor.
|
||||
/// </para>
|
||||
/// <para>
|
||||
/// <b>SiteCallAudit-009: consumes <see cref="PullSiteCallsResponse.MoreAvailable"/>
|
||||
/// to guarantee forward progress.</b> Whereas the prior implementation ignored
|
||||
/// the flag entirely and relied solely on the tick cadence, this method now
|
||||
/// continues pulling within the same tick while the site reports
|
||||
/// <c>MoreAvailable=true</c>, bounded by
|
||||
/// <see cref="MaxReconciliationPagesPerTick"/>. This closes the
|
||||
/// single-timestamp-saturation edge: if a backlog larger than
|
||||
/// <see cref="SiteCallAuditOptions.ReconciliationBatchSize"/> all shares one
|
||||
/// exact <see cref="SiteCall.UpdatedAtUtc"/>, the inclusive max-timestamp cursor
|
||||
/// cannot advance, so the previous code re-pulled the identical window forever
|
||||
/// across ticks and never drained the tail. Here, a saturated batch whose
|
||||
/// observed max timestamp did NOT advance past <c>since</c> is detected as a
|
||||
/// no-progress pin: the loop stops and logs a Warning (the same observability
|
||||
/// intent as the sibling's stalled signal, without its EventStream state
|
||||
/// machine), so the pathological site surfaces rather than spinning silently.
|
||||
/// This diverges from <c>SiteAuditReconciliationActor</c>, which reads
|
||||
/// <c>MoreAvailable</c> to drive a <c>SiteAuditTelemetryStalledChanged</c>
|
||||
/// stalled-detection state machine instead of a within-tick continuation drain.
|
||||
/// </para>
|
||||
/// </remarks>
|
||||
private async Task ReconcileSiteAsync(
|
||||
SiteEntry site, IPullSiteCallsClient client, ISiteCallAuditRepository repository)
|
||||
{
|
||||
var since = _reconciliationCursors.TryGetValue(site.SiteId, out var c) ? c : DateTime.MinValue;
|
||||
var response = await client
|
||||
.PullAsync(site.SiteId, since, _options.ReconciliationBatchSize, CancellationToken.None)
|
||||
.ConfigureAwait(false);
|
||||
var cursor = _reconciliationCursors.TryGetValue(site.SiteId, out var c) ? c : DateTime.MinValue;
|
||||
|
||||
var maxUpdated = since;
|
||||
var nowUtc = DateTime.UtcNow;
|
||||
foreach (var row in response.SiteCalls)
|
||||
// SiteCallAudit-009: drain within the tick while the site keeps reporting
|
||||
// MoreAvailable, bounded by MaxReconciliationPagesPerTick so a misbehaving
|
||||
// site can never spin the dispatcher. Each page advances the in-flight
|
||||
// cursor; a saturated page that fails to advance the cursor is the
|
||||
// single-timestamp no-progress pin — break and surface it.
|
||||
for (var page = 0; page < MaxReconciliationPagesPerTick; page++)
|
||||
{
|
||||
// IngestedAtUtc is the "central ingested (or last refreshed) this
|
||||
// row" stamp — owned by the central actor, exactly as OnUpsertAsync
|
||||
// does for the telemetry path. Monotonic UpsertAsync makes a row
|
||||
// already present (from a prior push) a silent no-op.
|
||||
var siteCall = row with { IngestedAtUtc = nowUtc };
|
||||
await repository.UpsertAsync(siteCall).ConfigureAwait(false);
|
||||
var since = cursor;
|
||||
var response = await client
|
||||
.PullAsync(site.SiteId, since, _options.ReconciliationBatchSize, CancellationToken.None)
|
||||
.ConfigureAwait(false);
|
||||
|
||||
if (row.UpdatedAtUtc > maxUpdated)
|
||||
var maxUpdated = since;
|
||||
var nowUtc = DateTime.UtcNow;
|
||||
foreach (var row in response.SiteCalls)
|
||||
{
|
||||
maxUpdated = row.UpdatedAtUtc;
|
||||
// IngestedAtUtc is the "central ingested (or last refreshed) this
|
||||
// row" stamp — owned by the central actor, exactly as OnUpsertAsync
|
||||
// does for the telemetry path. Monotonic UpsertAsync makes a row
|
||||
// already present (from a prior push) a silent no-op.
|
||||
var siteCall = row with { IngestedAtUtc = nowUtc };
|
||||
await repository.UpsertAsync(siteCall).ConfigureAwait(false);
|
||||
|
||||
if (row.UpdatedAtUtc > maxUpdated)
|
||||
{
|
||||
maxUpdated = row.UpdatedAtUtc;
|
||||
}
|
||||
}
|
||||
|
||||
// Persist the advanced cursor after every page so a fault on a later
|
||||
// page (caught per-site upstream) still keeps the rows already drained.
|
||||
cursor = maxUpdated;
|
||||
_reconciliationCursors[site.SiteId] = cursor;
|
||||
|
||||
if (!response.MoreAvailable)
|
||||
{
|
||||
// Backlog fully drained for this site this tick.
|
||||
return;
|
||||
}
|
||||
|
||||
if (maxUpdated <= since)
|
||||
{
|
||||
// No-progress pin: the site saturated the batch yet the max
|
||||
// observed UpdatedAtUtc did not advance past the inclusive cursor
|
||||
// (a burst of > batch-size rows sharing one exact timestamp).
|
||||
// Continuing would re-pull the identical window forever, so stop
|
||||
// and surface it — the inclusive max-timestamp cursor cannot make
|
||||
// progress on this input without a composite (timestamp,id)
|
||||
// keyset, which the pull contract does not yet support.
|
||||
_logger.LogWarning(
|
||||
"SiteCallAudit reconciliation for site {SiteId} cannot make progress: a saturated "
|
||||
+ "batch of more than {BatchSize} rows shares a single UpdatedAtUtc ({CursorUtc:o}), "
|
||||
+ "so the inclusive cursor is pinned. The backlog tail beyond the batch ceiling will "
|
||||
+ "not reconcile until those rows' timestamps differ.",
|
||||
site.SiteId,
|
||||
_options.ReconciliationBatchSize,
|
||||
since);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
// Advance the cursor to the newest row seen. A MoreAvailable response
|
||||
// means the site saturated the batch; the next tick continues draining
|
||||
// from the advanced cursor (no immediate re-pull loop — the natural
|
||||
// tick cadence drains the backlog, matching SiteAuditReconciliationActor).
|
||||
_reconciliationCursors[site.SiteId] = maxUpdated;
|
||||
// Hit the within-tick page ceiling while MoreAvailable was still true:
|
||||
// the cursor advanced each page (so the backlog IS draining), there is
|
||||
// simply more than MaxReconciliationPagesPerTick × batch-size of it. The
|
||||
// next tick resumes from the advanced cursor.
|
||||
_logger.LogInformation(
|
||||
"SiteCallAudit reconciliation for site {SiteId} hit the per-tick page ceiling "
|
||||
+ "({MaxPages} pages); the cursor advanced each page and the remaining backlog "
|
||||
+ "drains on the next tick.",
|
||||
site.SiteId,
|
||||
MaxReconciliationPagesPerTick);
|
||||
}
|
||||
|
||||
// ── Piece B: daily terminal-row purge scheduler ──
|
||||
|
||||
@@ -73,14 +73,23 @@ public class EventLogQueryService : IEventLogQueryService
|
||||
|
||||
if (request.From.HasValue)
|
||||
{
|
||||
// SiteEventLogging-024 (re-opens -016): timestamps are stored as ISO
|
||||
// 8601 "o" UTC strings (always +00:00). The store compares them
|
||||
// lexicographically (BINARY collation), so the bound MUST be
|
||||
// normalised to UTC before ToString("o") — otherwise a non-UTC
|
||||
// DateTimeOffset from a central client (e.g. +05:00) produces a
|
||||
// string that sorts wrongly against the +00:00 stored values and the
|
||||
// range filter silently includes/excludes the wrong rows.
|
||||
whereClauses.Add("timestamp >= $from");
|
||||
parameters.Add(new SqliteParameter("$from", request.From.Value.ToString("o")));
|
||||
parameters.Add(new SqliteParameter("$from", request.From.Value.ToUniversalTime().ToString("o")));
|
||||
}
|
||||
|
||||
if (request.To.HasValue)
|
||||
{
|
||||
// SiteEventLogging-024: normalise the upper bound to UTC for the same
|
||||
// reason as $from above.
|
||||
whereClauses.Add("timestamp <= $to");
|
||||
parameters.Add(new SqliteParameter("$to", request.To.Value.ToString("o")));
|
||||
parameters.Add(new SqliteParameter("$to", request.To.Value.ToUniversalTime().ToString("o")));
|
||||
}
|
||||
|
||||
if (!string.IsNullOrWhiteSpace(request.EventType))
|
||||
|
||||
@@ -528,7 +528,28 @@ public class DeploymentManagerActor : ReceiveActor, IWithTimers
|
||||
{
|
||||
var instanceName = command.InstanceUniqueName;
|
||||
|
||||
if (_instanceActors.TryGetValue(instanceName, out var actor))
|
||||
// SiteRuntime-029: a disable arriving mid-redeploy must cancel the buffered
|
||||
// redeploy. Otherwise HandleTerminated re-creates the Instance Actor and
|
||||
// re-stores its config with isEnabled: true when the predecessor terminates,
|
||||
// silently reverting the operator's disable back to enabled. Mirror the
|
||||
// last-write-wins handling in HandleDeploy/HandleDelete: drop the pending
|
||||
// command (so HandleTerminated returns early), clear the shadow, and tell the
|
||||
// displaced deployer it was superseded. The disable itself still persists
|
||||
// is_enabled = false below, which becomes the durable state.
|
||||
if (_terminatingActorsByName.TryGetValue(instanceName, out var terminatingRef))
|
||||
{
|
||||
if (_pendingRedeploys.Remove(terminatingRef, out var pending))
|
||||
{
|
||||
pending.OriginalSender.Tell(new DeploymentStatusResponse(
|
||||
pending.Command.DeploymentId,
|
||||
instanceName,
|
||||
DeploymentStatus.Failed,
|
||||
$"superseded by disable of {instanceName} before redeploy finished terminating",
|
||||
DateTimeOffset.UtcNow));
|
||||
}
|
||||
_terminatingActorsByName.Remove(instanceName);
|
||||
}
|
||||
else if (_instanceActors.TryGetValue(instanceName, out var actor))
|
||||
{
|
||||
Context.Stop(actor);
|
||||
_instanceActors.Remove(instanceName);
|
||||
@@ -628,12 +649,45 @@ public class DeploymentManagerActor : ReceiveActor, IWithTimers
|
||||
{
|
||||
var instanceName = command.InstanceUniqueName;
|
||||
|
||||
if (_instanceActors.TryGetValue(instanceName, out var actor))
|
||||
// SiteRuntime-029: a delete arriving while a redeploy is still terminating must
|
||||
// be authoritative over the mid-redeploy bookkeeping. HandleDeploy already
|
||||
// removed the instance from _instanceActors and buffered a PendingRedeploy
|
||||
// keyed by the terminating ref. If we fall straight through to the
|
||||
// _instanceActors miss + unconditional decrement, the buffered redeploy is
|
||||
// left intact — so when Terminated fires, HandleTerminated calls
|
||||
// ApplyDeployment(isRedeploy: true) and RESURRECTS the just-deleted instance,
|
||||
// with the counter now inconsistent. Cancel the pending redeploy first.
|
||||
var wasPresent = false;
|
||||
if (_terminatingActorsByName.TryGetValue(instanceName, out var terminatingRef))
|
||||
{
|
||||
// Drop the buffered command so HandleTerminated's _pendingRedeploys.Remove
|
||||
// misses and it returns early (no resurrection). Clear the shadow too.
|
||||
if (_pendingRedeploys.Remove(terminatingRef, out var pending))
|
||||
{
|
||||
pending.OriginalSender.Tell(new DeploymentStatusResponse(
|
||||
pending.Command.DeploymentId,
|
||||
instanceName,
|
||||
DeploymentStatus.Failed,
|
||||
$"superseded by delete of {instanceName} before redeploy finished terminating",
|
||||
DateTimeOffset.UtcNow));
|
||||
}
|
||||
_terminatingActorsByName.Remove(instanceName);
|
||||
// The terminating predecessor is already being stopped by HandleDeploy;
|
||||
// no Context.Stop needed here.
|
||||
wasPresent = true;
|
||||
}
|
||||
else if (_instanceActors.TryGetValue(instanceName, out var actor))
|
||||
{
|
||||
Context.Stop(actor);
|
||||
_instanceActors.Remove(instanceName);
|
||||
wasPresent = true;
|
||||
}
|
||||
_totalDeployedCount = Math.Max(0, _totalDeployedCount - 1);
|
||||
|
||||
// SiteRuntime-029: only decrement when the instance was actually present
|
||||
// (live in _instanceActors OR mid-redeploy in _terminatingActorsByName).
|
||||
// A delete for a wholly-unknown instance must not drive the count negative.
|
||||
if (wasPresent)
|
||||
_totalDeployedCount = Math.Max(0, _totalDeployedCount - 1);
|
||||
UpdateInstanceCounts();
|
||||
|
||||
var sender = Sender;
|
||||
@@ -1379,14 +1433,15 @@ public class DeploymentManagerActor : ReceiveActor, IWithTimers
|
||||
}
|
||||
}
|
||||
|
||||
// WP-33: Store notification lists
|
||||
if (command.NotificationLists != null)
|
||||
{
|
||||
foreach (var nl in command.NotificationLists)
|
||||
{
|
||||
await _storage.StoreNotificationListAsync(nl.Name, nl.RecipientEmails);
|
||||
}
|
||||
}
|
||||
// DeploymentManager-025 / SiteRuntime-031: notification lists and SMTP
|
||||
// configuration are central-only — sites store-and-forward notifications
|
||||
// to central and never deliver over SMTP. Central no longer ships these
|
||||
// (the DeployArtifactsCommand fields stay for additive compatibility but
|
||||
// are always null), so the site neither persists them nor reads them.
|
||||
// Purge any rows a prior (pre-fix) build may have written — including the
|
||||
// plaintext SMTP password — so existing exposure is cleared, not just
|
||||
// future writes. Purge is idempotent and runs on every artifact apply.
|
||||
await _storage.PurgeCentralOnlyNotificationConfigAsync();
|
||||
|
||||
// Store data connection definitions (OPC UA endpoints, etc.)
|
||||
if (command.DataConnections != null)
|
||||
@@ -1413,16 +1468,8 @@ public class DeploymentManagerActor : ReceiveActor, IWithTimers
|
||||
self.Tell(new ApplyArtifactDataConnectionsToDcl(command.DataConnections));
|
||||
}
|
||||
|
||||
// Store SMTP configurations
|
||||
if (command.SmtpConfigurations != null)
|
||||
{
|
||||
foreach (var smtp in command.SmtpConfigurations)
|
||||
{
|
||||
await _storage.StoreSmtpConfigurationAsync(
|
||||
smtp.Name, smtp.Server, smtp.Port, smtp.AuthMode,
|
||||
smtp.FromAddress, smtp.Username, smtp.Password, smtp.OAuthConfig);
|
||||
}
|
||||
}
|
||||
// DeploymentManager-025 / SiteRuntime-031: SMTP configuration is
|
||||
// central-only and is never stored on a site (see the purge above).
|
||||
|
||||
// Replicate artifacts to standby node
|
||||
_replicationActor?.Tell(new ReplicateArtifacts(command));
|
||||
|
||||
@@ -12,6 +12,7 @@ using ZB.MOM.WW.ScadaBridge.Commons.Types.Enums;
|
||||
using ZB.MOM.WW.ScadaBridge.Commons.Types.Flattening;
|
||||
using ZB.MOM.WW.ScadaBridge.HealthMonitoring;
|
||||
using ZB.MOM.WW.ScadaBridge.SiteEventLogging;
|
||||
using ZB.MOM.WW.ScadaBridge.SiteRuntime.Messages;
|
||||
using ZB.MOM.WW.ScadaBridge.SiteRuntime.Persistence;
|
||||
using ZB.MOM.WW.ScadaBridge.SiteRuntime.Scripts;
|
||||
using ZB.MOM.WW.ScadaBridge.SiteRuntime.Streaming;
|
||||
@@ -209,6 +210,12 @@ public class InstanceActor : ReceiveActor
|
||||
// WP-16: Handle alarm state changes from Alarm Actors (Tell pattern)
|
||||
Receive<AlarmStateChanged>(HandleAlarmStateChanged);
|
||||
|
||||
// SiteRuntime-027: a NativeAlarmActor tells us when one of its native
|
||||
// conditions has left the mirror for good (snapshot-swap removal, retention
|
||||
// drop, or cap eviction) so we can evict the stale _latestAlarmEvents key
|
||||
// and not leak per-instance memory / bloat DebugView snapshots.
|
||||
Receive<NativeAlarmDropped>(HandleNativeAlarmDropped);
|
||||
|
||||
// WP-25: Debug view subscribe/unsubscribe (Ask pattern for snapshot)
|
||||
Receive<SubscribeDebugViewRequest>(HandleSubscribeDebugView);
|
||||
Receive<UnsubscribeDebugViewRequest>(HandleUnsubscribeDebugView);
|
||||
@@ -1016,6 +1023,25 @@ public class InstanceActor : ReceiveActor
|
||||
_streamManager?.PublishAlarmStateChanged(changed);
|
||||
}
|
||||
|
||||
/// <summary>
/// SiteRuntime-027: reacts to a <see cref="NativeAlarmDropped"/> signal by removing the
/// dropped condition's <c>SourceReference</c> key from each of this actor's per-alarm
/// maps: <c>_alarmTimestamps</c>, <c>_alarmStates</c> and <c>_latestAlarmEvents</c>.
///
/// The signal is sent by the owning <see cref="NativeAlarmActor"/> once the condition
/// has left its mirror. Evicting the key here keeps the maps from retaining one stale
/// entry per distinct native condition the instance has ever seen, which would
/// otherwise leak per-instance memory and inflate every DebugView snapshot.
///
/// Only native-condition keys arrive on this path; computed-alarm keys are
/// configuration-bounded and are never removed here.
/// </summary>
/// <param name="dropped">The drop signal carrying the native condition's key.</param>
private void HandleNativeAlarmDropped(NativeAlarmDropped dropped)
{
    // Read the key once; the three removals below are independent of each other.
    var droppedKey = dropped.SourceReference;

    _alarmTimestamps.Remove(droppedKey);
    _alarmStates.Remove(droppedKey);
    _latestAlarmEvents.Remove(droppedKey);
}
|
||||
|
||||
/// <summary>
|
||||
/// WP-25: Debug view subscribe — returns snapshot and begins streaming.
|
||||
/// </summary>
|
||||
|
||||
@@ -8,6 +8,7 @@ using ZB.MOM.WW.ScadaBridge.Commons.Types.Alarms;
|
||||
using ZB.MOM.WW.ScadaBridge.Commons.Types.Enums;
|
||||
using ZB.MOM.WW.ScadaBridge.Commons.Types.Flattening;
|
||||
using ZB.MOM.WW.ScadaBridge.SiteEventLogging;
|
||||
using ZB.MOM.WW.ScadaBridge.SiteRuntime.Messages;
|
||||
using ZB.MOM.WW.ScadaBridge.SiteRuntime.Persistence;
|
||||
|
||||
namespace ZB.MOM.WW.ScadaBridge.SiteRuntime.Actors;
|
||||
@@ -204,6 +205,10 @@ public class NativeAlarmActor : ReceiveActor
|
||||
{
|
||||
Emit(prior, prior.Condition with { Active = false });
|
||||
PersistDelete(sourceRef);
|
||||
// SiteRuntime-027: this condition is gone for good — tell the parent
|
||||
// to evict its _latestAlarmEvents key so it does not retain a stale
|
||||
// (Normal) entry forever.
|
||||
NotifyParentDropped(sourceRef);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -244,6 +249,9 @@ public class NativeAlarmActor : ReceiveActor
|
||||
{
|
||||
_alarms.Remove(t.SourceReference);
|
||||
PersistDelete(t.SourceReference);
|
||||
// SiteRuntime-027: evict the parent's _latestAlarmEvents key for the
|
||||
// now-resolved condition so it does not leak.
|
||||
NotifyParentDropped(t.SourceReference);
|
||||
}
|
||||
|
||||
EnforceCap();
|
||||
@@ -289,19 +297,48 @@ public class NativeAlarmActor : ReceiveActor
|
||||
var overflow = _alarms.Values
|
||||
.OrderBy(a => a.TransitionTime)
|
||||
.Take(_alarms.Count - cap)
|
||||
.Select(a => a.SourceReference)
|
||||
.ToList();
|
||||
|
||||
foreach (var sourceRef in overflow)
|
||||
foreach (var evicted in overflow)
|
||||
{
|
||||
var sourceRef = evicted.SourceReference;
|
||||
|
||||
// SiteRuntime-028: the sibling drop paths (ApplySnapshotSwap, the
|
||||
// ApplyLiveTransition retention drop) always emit a return-to-normal
|
||||
// before the condition leaves the mirror. EnforceCap previously dropped
|
||||
// a condition whose last-emitted state could still be Active, with no
|
||||
// compensating emit — so the Instance Actor (and central's stream / the
|
||||
// operator Alarm Summary) kept showing it Active forever, a phantom
|
||||
// stuck alarm the mirror could never clear. Emit the return-to-normal
|
||||
// for any still-active evicted condition (mirroring ApplySnapshotSwap)
|
||||
// before removing it.
|
||||
if (evicted.Condition.Active)
|
||||
{
|
||||
Emit(evicted, evicted.Condition with { Active = false });
|
||||
}
|
||||
|
||||
_alarms.Remove(sourceRef);
|
||||
PersistDelete(sourceRef);
|
||||
// SiteRuntime-027: this condition is gone for good — evict the parent's
|
||||
// _latestAlarmEvents key so it does not retain a stale entry.
|
||||
NotifyParentDropped(sourceRef);
|
||||
_logger.LogWarning(
|
||||
"Native alarm cap {Cap} exceeded for {Source} on {Instance}; dropped oldest mirrored alarm {Ref}",
|
||||
cap, _source.CanonicalName, _instanceName, sourceRef);
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
/// SiteRuntime-027: tells the parent Instance Actor that the native condition
/// identified by <paramref name="sourceReference"/> has left this actor's mirror for
/// good, by sending it a <see cref="NativeAlarmDropped"/> message. The parent uses the
/// message to evict that key from its <c>_latestAlarmEvents</c>, <c>_alarmStates</c>
/// and <c>_alarmTimestamps</c> maps.
///
/// Callers invoke this AFTER any final return-to-normal
/// <see cref="AlarmStateChanged"/> emit for the condition, so the stream still sees
/// the clear before the key disappears.
/// </summary>
/// <param name="sourceReference">Source reference (map key) of the dropped condition.</param>
private void NotifyParentDropped(string sourceReference)
    => _instanceActor.Tell(new NativeAlarmDropped(sourceReference));
|
||||
|
||||
/// <summary>
|
||||
/// Builds and tells the parent an enriched <see cref="AlarmStateChanged"/> for a condition.
|
||||
/// </summary>
|
||||
|
||||
@@ -189,18 +189,16 @@ public class SiteReplicationActor : ReceiveActor
|
||||
foreach (var db in command.DatabaseConnections)
|
||||
await _storage.StoreDatabaseConnectionAsync(db.Name, db.ConnectionString, db.MaxRetries, db.RetryDelay);
|
||||
|
||||
if (command.NotificationLists != null)
|
||||
foreach (var nl in command.NotificationLists)
|
||||
await _storage.StoreNotificationListAsync(nl.Name, nl.RecipientEmails);
|
||||
// DeploymentManager-025 / SiteRuntime-031: notification lists and SMTP
|
||||
// configuration are central-only and are never persisted on a site.
|
||||
// Mirror the primary apply path: purge any pre-fix rows (including the
|
||||
// plaintext SMTP password) instead of writing the command's
|
||||
// (now-always-null) NotificationLists/SmtpConfigurations.
|
||||
await _storage.PurgeCentralOnlyNotificationConfigAsync();
|
||||
|
||||
if (command.DataConnections != null)
|
||||
foreach (var dc in command.DataConnections)
|
||||
await _storage.StoreDataConnectionDefinitionAsync(dc.Name, dc.Protocol, dc.PrimaryConfigurationJson, dc.BackupConfigurationJson, dc.FailoverRetryCount);
|
||||
|
||||
if (command.SmtpConfigurations != null)
|
||||
foreach (var smtp in command.SmtpConfigurations)
|
||||
await _storage.StoreSmtpConfigurationAsync(smtp.Name, smtp.Server, smtp.Port, smtp.AuthMode,
|
||||
smtp.FromAddress, smtp.Username, smtp.Password, smtp.OAuthConfig);
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
|
||||
@@ -0,0 +1,23 @@
|
||||
namespace ZB.MOM.WW.ScadaBridge.SiteRuntime.Messages;
|
||||
|
||||
/// <summary>
|
||||
/// SiteRuntime-027: terminal-drop signal sent from a <c>NativeAlarmActor</c> to its
|
||||
/// parent <c>InstanceActor</c> when a native condition leaves the mirror for good —
|
||||
/// the snapshot-swap removal, the live-transition retention drop
|
||||
/// (<c>inactive &amp;&amp; acknowledged</c>), and the cap eviction. The parent removes the
|
||||
/// condition's key (<see cref="SourceReference"/>) from its <c>_latestAlarmEvents</c>,
|
||||
/// <c>_alarmStates</c> and <c>_alarmTimestamps</c> maps so they and every DebugView snapshot do not accumulate one
|
||||
/// permanently-retained (Normal) entry per distinct native condition the instance has
|
||||
/// ever seen.
|
||||
///
|
||||
/// The actor still emits the condition's return-to-normal <c>AlarmStateChanged</c>
|
||||
/// (so central/UI see it clear) immediately BEFORE this drop signal; only the
|
||||
/// stale-key retention in the parent's alarm maps is what this evicts. Computed-alarm
|
||||
/// keys are configuration-bounded and are never dropped this way.
|
||||
/// </summary>
|
||||
/// <param name="SourceReference">
|
||||
/// The native condition's source reference — the same value used as the
|
||||
/// <c>AlarmStateChanged.AlarmName</c> key for native alarms, so the parent can remove
|
||||
/// the matching entries from its alarm maps.
|
||||
/// </param>
|
||||
public sealed record NativeAlarmDropped(string SourceReference);
|
||||
@@ -623,6 +623,30 @@ public class SiteStorageService
|
||||
|
||||
// ── WP-33: Notification List CRUD ──
|
||||
|
||||
/// <summary>
|
||||
/// DeploymentManager-025 / SiteRuntime-031: notification delivery is central-only.
|
||||
/// Sites store-and-forward notifications to the central cluster and never deliver
|
||||
/// over SMTP, so notification lists and SMTP configuration must never live on a
|
||||
/// site. This purges every row from the site-local <c>notification_lists</c> and
|
||||
/// <c>smtp_configurations</c> tables, clearing any rows a prior (now-corrected)
|
||||
/// build may have shipped — most importantly the plaintext SMTP password. It is
|
||||
/// idempotent and is invoked on every artifact apply / deploy so existing exposure
|
||||
/// is cleared, not just future writes. The tables themselves are retained (the
|
||||
/// schema is harmless once empty); only their contents are removed.
|
||||
/// </summary>
|
||||
/// <remarks>
/// Opens its own connection from <c>_connectionString</c> for the duration of the call.
/// The method takes no cancellation token and does not open an explicit transaction.
/// </remarks>
/// <returns>A task that completes when both tables have been emptied.</returns>
|
||||
public async Task PurgeCentralOnlyNotificationConfigAsync()
|
||||
{
|
||||
// A fresh connection per call; `await using` disposes it when the method exits.
await using var connection = new SqliteConnection(_connectionString);
|
||||
await connection.OpenAsync();
|
||||
|
||||
await using var command = connection.CreateCommand();
|
||||
// Both unconditional DELETEs are sent as a single command batch. Deleting from
// an already-empty table is a no-op, which is what makes repeated calls safe.
command.CommandText = @"
|
||||
DELETE FROM notification_lists;
|
||||
DELETE FROM smtp_configurations;";
|
||||
await command.ExecuteNonQueryAsync();
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Stores or updates a notification list.
|
||||
/// </summary>
|
||||
|
||||
@@ -163,6 +163,16 @@ public class StoreAndForwardService
|
||||
/// </summary>
|
||||
private int _queueDepthProviderRegistered;
|
||||
|
||||
/// <summary>
|
||||
/// StoreAndForward-025: the exact provider delegate this instance registered with
|
||||
/// the process-global <see cref="ScadaBridgeTelemetry"/> gauge slot, retained so
|
||||
/// <see cref="StopAsync"/> can deregister it by identity (compare-and-clear). Holding
|
||||
/// the same reference both registers and clears the slot; the identity check ensures a
|
||||
/// late stop of this instance cannot clear a newer instance's provider. Null until
|
||||
/// <see cref="StartAsync"/> registers.
|
||||
/// </summary>
|
||||
private Func<long>? _queueDepthProvider;
|
||||
|
||||
/// <summary>
|
||||
/// WP-10: Delivery handler delegate. The return value / exception is interpreted
|
||||
/// the same way on both the immediate-delivery path (<see cref="EnqueueAsync"/>)
|
||||
@@ -347,7 +357,12 @@ public class StoreAndForwardService
|
||||
if (Interlocked.CompareExchange(ref _queueDepthProviderRegistered, 1, 0) == 0)
|
||||
{
|
||||
Interlocked.Add(ref _bufferedCount, pending);
|
||||
ScadaBridgeTelemetry.SetQueueDepthProvider(() => Interlocked.Read(ref _bufferedCount));
|
||||
// StoreAndForward-025: retain the exact delegate so StopAsync can
|
||||
// deregister it by identity (compare-and-clear) without stomping a
|
||||
// newer instance that may have re-registered into the global slot.
|
||||
var provider = (Func<long>)(() => Interlocked.Read(ref _bufferedCount));
|
||||
_queueDepthProvider = provider;
|
||||
ScadaBridgeTelemetry.SetQueueDepthProvider(provider);
|
||||
}
|
||||
|
||||
_retryTimer = new Timer(
|
||||
@@ -380,6 +395,15 @@ public class StoreAndForwardService
|
||||
/// DI container ran its own shutdown. We now await the captured sweep task
|
||||
/// (with a bounded <see cref="SweepShutdownWaitTimeout"/> so a hung
|
||||
/// dependency cannot block host shutdown indefinitely) before returning.
|
||||
///
|
||||
/// StoreAndForward-025: after the timer is disposed and the in-flight sweep has
|
||||
/// drained, the queue-depth provider this instance registered with the process-global
|
||||
/// <see cref="ScadaBridgeTelemetry"/> gauge is deregistered by identity (compare-and-
|
||||
/// clear) — otherwise a stopped service would report a frozen depth forever and the
|
||||
/// provider closure would pin this dead instance for the process lifetime. The clear is
|
||||
/// identity-checked so a newer instance that already re-registered the global slot is
|
||||
/// not stomped, and <see cref="_queueDepthProviderRegistered"/> is reset so a later
|
||||
/// <see cref="StartAsync"/> on this instance re-registers cleanly.
|
||||
/// </summary>
|
||||
/// <returns>A task representing the asynchronous stop operation.</returns>
|
||||
public async Task StopAsync()
|
||||
@@ -393,35 +417,46 @@ public class StoreAndForwardService
|
||||
}
|
||||
|
||||
var inflight = Volatile.Read(ref _sweepTask);
|
||||
if (inflight is null || inflight.IsCompleted)
|
||||
if (inflight is not null && !inflight.IsCompleted)
|
||||
{
|
||||
return;
|
||||
try
|
||||
{
|
||||
// WaitAsync with a finite timeout: a hung delivery handler /
|
||||
// storage call cannot block host shutdown indefinitely. On timeout
|
||||
// the sweep keeps running but the host is free to proceed with
|
||||
// disposal — preferred to never returning.
|
||||
await inflight.WaitAsync(SweepShutdownWaitTimeout).ConfigureAwait(false);
|
||||
}
|
||||
catch (TimeoutException)
|
||||
{
|
||||
_logger.LogWarning(
|
||||
"Store-and-forward retry sweep did not finish within {Timeout}; " +
|
||||
"shutdown is proceeding while the sweep is still in-flight",
|
||||
SweepShutdownWaitTimeout);
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
// The sweep itself already logs at Error on failure (see
|
||||
// RetryPendingMessagesAsync's catch); we only log here so a
|
||||
// surprise fault during shutdown is still visible. Swallow so the
|
||||
// host's shutdown sequence can continue regardless.
|
||||
_logger.LogWarning(ex,
|
||||
"Store-and-forward retry sweep faulted during shutdown wait");
|
||||
}
|
||||
}
|
||||
|
||||
try
|
||||
// StoreAndForward-025: release the process-global queue-depth gauge provider so a
|
||||
// stopped service stops reporting a frozen depth and the closure no longer pins
|
||||
// this dead instance. Identity-checked (compare-and-clear) so a successor
|
||||
// instance's provider is left intact; reset the one-time guard so a later
|
||||
// StartAsync re-registers.
|
||||
var provider = _queueDepthProvider;
|
||||
if (provider is not null)
|
||||
{
|
||||
// WaitAsync with a finite timeout: a hung delivery handler /
|
||||
// storage call cannot block host shutdown indefinitely. On timeout
|
||||
// the sweep keeps running but the host is free to proceed with
|
||||
// disposal — preferred to never returning.
|
||||
await inflight.WaitAsync(SweepShutdownWaitTimeout).ConfigureAwait(false);
|
||||
}
|
||||
catch (TimeoutException)
|
||||
{
|
||||
_logger.LogWarning(
|
||||
"Store-and-forward retry sweep did not finish within {Timeout}; " +
|
||||
"shutdown is proceeding while the sweep is still in-flight",
|
||||
SweepShutdownWaitTimeout);
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
// The sweep itself already logs at Error on failure (see
|
||||
// RetryPendingMessagesAsync's catch); we only log here so a
|
||||
// surprise fault during shutdown is still visible. Swallow so the
|
||||
// host's shutdown sequence can continue regardless.
|
||||
_logger.LogWarning(ex,
|
||||
"Store-and-forward retry sweep faulted during shutdown wait");
|
||||
ScadaBridgeTelemetry.ClearQueueDepthProvider(provider);
|
||||
_queueDepthProvider = null;
|
||||
}
|
||||
Interlocked.Exchange(ref _queueDepthProviderRegistered, 0);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
|
||||
@@ -56,6 +56,10 @@ public class TemplateFolderService
|
||||
await _repository.AddFolderAsync(folder, cancellationToken);
|
||||
await _repository.SaveChangesAsync(cancellationToken);
|
||||
await _auditService.LogAsync(user, "Create", "TemplateFolder", folder.Id.ToString(), name, folder, cancellationToken);
|
||||
// The audit entry is staged on the change tracker by LogAsync and needs its
|
||||
// own SaveChangesAsync to persist (mirrors TemplateService) — otherwise the
|
||||
// row is discarded when the ManagementActor's DI scope is disposed.
|
||||
await _repository.SaveChangesAsync(cancellationToken);
|
||||
|
||||
return Result<TemplateFolder>.Success(folder);
|
||||
}
|
||||
@@ -89,6 +93,8 @@ public class TemplateFolderService
|
||||
await _repository.UpdateFolderAsync(folder, cancellationToken);
|
||||
await _repository.SaveChangesAsync(cancellationToken);
|
||||
await _auditService.LogAsync(user, "Update", "TemplateFolder", folder.Id.ToString(), newName, folder, cancellationToken);
|
||||
// Persist the staged audit entry (see CreateFolderAsync).
|
||||
await _repository.SaveChangesAsync(cancellationToken);
|
||||
|
||||
return Result<TemplateFolder>.Success(folder);
|
||||
}
|
||||
@@ -152,6 +158,8 @@ public class TemplateFolderService
|
||||
await _repository.UpdateFolderAsync(folder, cancellationToken);
|
||||
await _repository.SaveChangesAsync(cancellationToken);
|
||||
await _auditService.LogAsync(user, "Move", "TemplateFolder", folder.Id.ToString(), folder.Name, folder, cancellationToken);
|
||||
// Persist the staged audit entry (see CreateFolderAsync).
|
||||
await _repository.SaveChangesAsync(cancellationToken);
|
||||
|
||||
return Result<TemplateFolder>.Success(folder);
|
||||
}
|
||||
@@ -202,6 +210,8 @@ public class TemplateFolderService
|
||||
await _repository.UpdateFolderAsync(adjacent, cancellationToken);
|
||||
await _repository.SaveChangesAsync(cancellationToken);
|
||||
await _auditService.LogAsync(user, "Reorder", "TemplateFolder", folder.Id.ToString(), folder.Name, folder, cancellationToken);
|
||||
// Persist the staged audit entry (see CreateFolderAsync).
|
||||
await _repository.SaveChangesAsync(cancellationToken);
|
||||
|
||||
return Result<TemplateFolder>.Success(folder);
|
||||
}
|
||||
@@ -242,6 +252,8 @@ public class TemplateFolderService
|
||||
await _repository.DeleteFolderAsync(folderId, cancellationToken);
|
||||
await _repository.SaveChangesAsync(cancellationToken);
|
||||
await _auditService.LogAsync(user, "Delete", "TemplateFolder", folderId.ToString(), folder.Name, null, cancellationToken);
|
||||
// Persist the staged audit entry (see CreateFolderAsync).
|
||||
await _repository.SaveChangesAsync(cancellationToken);
|
||||
|
||||
return Result<bool>.Success(true);
|
||||
}
|
||||
|
||||
@@ -565,43 +565,6 @@ public class SemanticValidator
|
||||
return result;
|
||||
}
|
||||
|
||||
/// <summary>
/// Parses a parameter definitions JSON string (JSON Schema or legacy flat array) and returns
/// one string entry per declared parameter.
/// </summary>
/// <remarks>
/// <para>
/// JSON Schema form (<c>{ type:"object", properties:{ name:{...}, ... } }</c>): the entries are
/// the property names found under <c>properties</c>.
/// </para>
/// <para>
/// Legacy flat form (<c>[{ name, type, required? }]</c>): the entries are each element's
/// <c>type</c> string, or <c>"unknown"</c> when the element is not an object, has no
/// <c>type</c> property, or its <c>type</c> is not a JSON string.
/// </para>
/// <para>
/// Any other root value, a schema object without an object-valued <c>properties</c> member,
/// and malformed JSON all produce an empty list; this method does not throw for bad input.
/// </para>
/// </remarks>
/// <param name="parameterDefinitionsJson">JSON Schema or legacy flat-array string; null/empty/whitespace returns an empty list.</param>
/// <returns>The list of entries described above, in document order; never <c>null</c>.</returns>
internal static List<string> ParseParameterDefinitions(string? parameterDefinitionsJson)
{
    if (string.IsNullOrWhiteSpace(parameterDefinitionsJson))
    {
        return [];
    }

    try
    {
        using var doc = JsonDocument.Parse(parameterDefinitionsJson);
        var root = doc.RootElement;

        // JSON Schema: { type:"object", properties:{ name:{...}, ... }, required:[...] }
        if (root.ValueKind == JsonValueKind.Object)
        {
            if (root.TryGetProperty("properties", out var props)
                && props.ValueKind == JsonValueKind.Object)
            {
                // ToList() materializes the names before the document is disposed.
                return props.EnumerateObject().Select(p => p.Name).ToList();
            }
        }
        // Legacy flat form: [{ name, type, required? }]
        else if (root.ValueKind == JsonValueKind.Array)
        {
            // NOTE(review): this branch yields each element's "type", while the schema
            // branch yields parameter *names*. Behaviour is preserved here; confirm with
            // callers whether "name" was intended before changing it.
            return root.EnumerateArray().Select(ReadLegacyEntry).ToList();
        }
    }
    catch (JsonException)
    {
        // Malformed JSON is treated as "no parameters declared".
    }

    return [];

    // Reads the "type" string of one legacy entry. JsonElement.TryGetProperty and
    // JsonElement.GetString throw InvalidOperationException when the element is not an
    // object / the value is not a string, so both are guarded by ValueKind checks and
    // fall back to the same "unknown" marker used for a missing or null type.
    static string ReadLegacyEntry(JsonElement element) =>
        element.ValueKind == JsonValueKind.Object
        && element.TryGetProperty("type", out var type)
        && type.ValueKind == JsonValueKind.String
            ? type.GetString() ?? "unknown"
            : "unknown";
}
|
||||
|
||||
/// <summary>
|
||||
/// Extracts call targets from script code by simple pattern matching.
|
||||
/// Looks for CallScript("name", ...) and CallShared("name", ...) patterns.
|
||||
|
||||
Reference in New Issue
Block a user