diff --git a/docs/requirements/Component-SiteRuntime.md b/docs/requirements/Component-SiteRuntime.md index 9e6eae96..7b866bc5 100644 --- a/docs/requirements/Component-SiteRuntime.md +++ b/docs/requirements/Component-SiteRuntime.md @@ -269,7 +269,7 @@ When the Instance Actor is stopped (due to disable, delete, or redeployment), Ak - **Snapshot / SnapshotComplete (reconnect reconciliation)**: `Snapshot` updates buffer into a staging set; `SnapshotComplete` performs an **atomic swap** of the mirrored set with the staged set. Any condition that was previously mirrored but is **not present** in the new snapshot emits a return-to-normal `AlarmStateChanged` and drops out. This is how the mirror self-corrects after an outage. - **Live transitions** (`Raise` / `Ack` / `Clear` / `Retrigger` / `StateChange`): upsert the condition by `SourceReference`. Updates carrying a `TransitionTime` **older** than the currently held transition are ignored (out-of-order protection). Accepted transitions persist to SQLite and emit an enriched `AlarmStateChanged` upward to the Instance Actor. - **Retention**: a mirrored condition is dropped once it is both inactive **and** acknowledged (`!Active && Acknowledged`) — the alarm has fully run its course at the source and no longer needs mirroring. The drop emits a final state change and deletes the SQLite row. -- **Per-source cap**: at most `MirroredAlarmCapPerSource` conditions are retained per source. When the cap is exceeded the **oldest** condition is dropped and the eviction is **logged** — there is no silent truncation. +- **Per-source cap**: at most `MirroredAlarmCapPerSource` conditions are retained per source. When the cap is exceeded the **oldest** condition is dropped and the eviction is **logged** — there is no silent truncation. If the evicted condition is still **active**, a final return-to-normal `AlarmStateChanged` is emitted before it is dropped (mirroring the retention drop above), so a capped-out active condition is never left stuck-active downstream. ### Persistence - Mirrored condition state is persisted to the site SQLite `native_alarm_state` table on every accepted transition and removed on drop-out. @@ -291,7 +291,7 @@ The Instance Actor owns native-alarm setup alongside its computed Script and Ala - **Latest-event retention**: the Instance Actor retains the latest enriched `AlarmStateChanged` per alarm name in `_latestAlarmEvents`. The DebugView snapshot is built from this map so it carries the **unified condition view plus native metadata** for both computed and native alarms. Computed alarms that have not yet produced an event fall back to a **Normal projection** so the snapshot is complete. - **Idle native source binding placeholders**: `BuildAlarmStatesSnapshot()` additionally emits one placeholder `AlarmStateChanged` for each configured native alarm source binding that currently has **no live conditions** in `_latestAlarmEvents`. The placeholder has `IsConfiguredPlaceholder = true` and carries the binding's canonical name in `NativeSourceCanonicalName`. The Instance Actor maintains a `_nativeAlarmKinds` map (`sourceCanonicalName → AlarmKind`) populated at spawn time to stamp the correct `Kind` on each placeholder. This ensures the Debug View Alarms tab shows every configured binding as a tree node even when quiet. - **`NativeSourceCanonicalName` on live events**: every `AlarmStateChanged` emitted by a `NativeAlarmActor` stamps its source binding's canonical name in `NativeSourceCanonicalName`. The Debug View uses this field to place each live condition under the correct native-source binding node in the tree. -- **Reset semantics**: `_latestAlarmEvents` and the mirrored native state are cleared on redeploy/undeploy (same trigger as static-override reset) but rehydrate from SQLite on failover. +- **Reset semantics**: `_latestAlarmEvents` and the mirrored native state are cleared on redeploy/undeploy (same trigger as static-override reset) but rehydrate from SQLite on failover. In addition, a native condition's `_latestAlarmEvents` key is **evicted per-condition** whenever that condition leaves the mirror — snapshot-swap drop, retention drop, or cap eviction — driven by a `NativeAlarmDropped` signal from the `NativeAlarmActor`; this prevents unbounded `_latestAlarmEvents` growth for sources that mint a fresh `SourceReference` per occurrence. --- diff --git a/src/ZB.MOM.WW.ScadaBridge.CentralUI/ScriptAnalysis/ScriptAnalysisService.cs b/src/ZB.MOM.WW.ScadaBridge.CentralUI/ScriptAnalysis/ScriptAnalysisService.cs index a111a93d..fa1cc00e 100644 --- a/src/ZB.MOM.WW.ScadaBridge.CentralUI/ScriptAnalysis/ScriptAnalysisService.cs +++ b/src/ZB.MOM.WW.ScadaBridge.CentralUI/ScriptAnalysis/ScriptAnalysisService.cs @@ -161,7 +161,9 @@ public class ScriptAnalysisService /// . CallScript still throws /// because a shared script has no template siblings in this context. /// For the SandboxInboundScriptHost surface, every Route call throws - /// because cross-site routing needs a deployed site. + /// because cross-site routing needs a deployed site, and every Database + /// call (QuerySingleAsync/QueryAsync/ExecuteAsync) throws + /// because a Test Run has no configured central database connection. /// Console.Out / Console.Error are captured per-call via an AsyncLocal /// scope (see ) so writes from the script /// land in the result without mutating process-global Console state — two diff --git a/src/ZB.MOM.WW.ScadaBridge.DataConnectionLayer/Actors/DataConnectionActor.cs b/src/ZB.MOM.WW.ScadaBridge.DataConnectionLayer/Actors/DataConnectionActor.cs index 4d501750..2ab5a1f1 100644 --- a/src/ZB.MOM.WW.ScadaBridge.DataConnectionLayer/Actors/DataConnectionActor.cs +++ b/src/ZB.MOM.WW.ScadaBridge.DataConnectionLayer/Actors/DataConnectionActor.cs @@ -1822,8 +1822,29 @@ public class DataConnectionActor : UntypedActor, IWithStash, IWithTimers if (msg.Success && msg.SubscriptionId != null) { - _alarmSubscriptionIds[msg.SourceReference] = msg.SubscriptionId; - _log.Info("[{0}] Alarm feed subscribed for source {1}", _connectionName, msg.SourceReference); + // DataConnectionLayer-029: a concurrent unsubscribe clears the in-flight + // marker (DCL-023), so a fresh subscribe for the same source can issue a + // SECOND adapter feed before this completion fires — yielding two completions + // for one source. Mirror the tag-path re-check (see HandleTagSubscribeCompleted, + // the `_subscriptionIds.ContainsKey` guard): if a feed is already stored, THIS + // completion is the redundant one — release its feed rather than overwriting + // the stored id and leaking the already-tracked subscription. + if (_alarmSubscriptionIds.ContainsKey(msg.SourceReference)) + { + if (_adapter is IAlarmSubscribableConnection alarmable) + { + _log.Warning( + "[{0}] Duplicate alarm feed for source {1}; releasing the redundant " + + "subscription instead of overwriting the stored one.", + _connectionName, msg.SourceReference); + _ = alarmable.UnsubscribeAlarmsAsync(msg.SubscriptionId); + } + } + else + { + _alarmSubscriptionIds[msg.SourceReference] = msg.SubscriptionId; + _log.Info("[{0}] Alarm feed subscribed for source {1}", _connectionName, msg.SourceReference); + } } else if (!msg.Success) { diff --git a/src/ZB.MOM.WW.ScadaBridge.InboundAPI/RouteHelper.cs b/src/ZB.MOM.WW.ScadaBridge.InboundAPI/RouteHelper.cs index 0354e3ae..5ae0fa15 100644 --- a/src/ZB.MOM.WW.ScadaBridge.InboundAPI/RouteHelper.cs +++ b/src/ZB.MOM.WW.ScadaBridge.InboundAPI/RouteHelper.cs @@ -104,13 +104,6 @@ public class RouteHelper /// public class RouteTarget { - // InboundAPI-029: a small grace past the wait timeout. The SITE enforces the wait - // timeout and returns Matched=false when it elapses; the local backstop fires only - // if the site fails to respond, so it must sit slightly LATER than the wait timeout - // (it must not pre-empt the site's own timed-out response and turn a clean `false` - // into a cancellation). - private static readonly TimeSpan WaitResponseGrace = TimeSpan.FromSeconds(5); - private readonly string _instanceCode; private readonly IInstanceLocator _instanceLocator; private readonly IInstanceRouter _instanceRouter; @@ -261,11 +254,18 @@ public class RouteTarget TimeSpan timeout, CancellationToken cancellationToken = default) { - // InboundAPI-029: bound the wait by the WAIT timeout (+ grace backstop), the - // client-disconnect token, and an explicit caller token — NOT the method deadline. - using var waitCts = new CancellationTokenSource(timeout + WaitResponseGrace); + // InboundAPI-031: do NOT impose a local wait-timeout backstop here. The site + // enforces the wait `timeout` and returns Matched=false when it elapses, and the + // cluster Ask in CommunicationService.RouteToWaitForAttributeAsync already bounds + // the round trip by `timeout + IntegrationTimeout` — the authoritative backstop + // for a missing site response. A local CTS of `timeout + small grace` (the prior + // InboundAPI-029 approach) was TIGHTER than that round-trip budget, so a + // slow-but-valid timed-out response could be cancelled into an exception instead + // of the spec-mandated `false`. Link ONLY the client-disconnect token and an + // explicit caller token — NOT the method deadline — so a client abort still + // cancels the wait while the wait timeout itself governs the duration. using var linked = CancellationTokenSource.CreateLinkedTokenSource( - waitCts.Token, _requestAbortedToken, cancellationToken); + _requestAbortedToken, cancellationToken); var token = linked.Token; var siteId = await ResolveSiteAsync(token); diff --git a/src/ZB.MOM.WW.ScadaBridge.ScriptAnalysis/ScriptTrustValidator.cs b/src/ZB.MOM.WW.ScadaBridge.ScriptAnalysis/ScriptTrustValidator.cs index 3ceb3d8b..5ee845fc 100644 --- a/src/ZB.MOM.WW.ScadaBridge.ScriptAnalysis/ScriptTrustValidator.cs +++ b/src/ZB.MOM.WW.ScadaBridge.ScriptAnalysis/ScriptTrustValidator.cs @@ -104,11 +104,13 @@ public static class ScriptTrustValidator var violations = new SortedSet(StringComparer.Ordinal); // ---- Pass 1: semantic symbol analysis (ported from SiteRuntime) ---- - // Use the full trusted-platform reference set (not the minimal - // runtime-fidelity DefaultReferences) so EVERY type a script names + // Resolve against the supplied baseReferences so EVERY type a script names // resolves and is judged by its true namespace — closing the // forbidden-type-in-allowed-namespace blind spot (e.g. a bare - // System.Diagnostics.Process via `using System.Diagnostics;`). + // System.Diagnostics.Process via `using System.Diagnostics;`). The public + // entry point passes the full trusted-platform reference set; a caller on the + // degraded/test path may instead pass the minimal anchor-enriched fallback + // (BuildMinimalFallbackReferences()). var references = baseReferences.ToList(); if (extraReferences != null) references.AddRange(extraReferences); diff --git a/src/ZB.MOM.WW.ScadaBridge.SiteRuntime/Actors/DeploymentManagerActor.cs b/src/ZB.MOM.WW.ScadaBridge.SiteRuntime/Actors/DeploymentManagerActor.cs index f44f1f33..4717e951 100644 --- a/src/ZB.MOM.WW.ScadaBridge.SiteRuntime/Actors/DeploymentManagerActor.cs +++ b/src/ZB.MOM.WW.ScadaBridge.SiteRuntime/Actors/DeploymentManagerActor.cs @@ -73,7 +73,14 @@ public class DeploymentManagerActor : ReceiveActor, IWithTimers /// Cleared in alongside . /// private readonly Dictionary _terminatingActorsByName = new(); - private int _totalDeployedCount; + /// + /// SiteRuntime-032: authoritative set of deployed instance config names (enabled + /// AND disabled). The deployed/disabled health counts are derived from this set's + /// size, so add-on-deploy / remove-on-delete keeps the count correct for every + /// path — including deleting a DISABLED instance, which has a config row but is + /// absent from both and . + /// + private readonly HashSet _deployedInstanceNames = new(); /// Akka timer scheduler injected by the framework via . public ITimerScheduler Timers { get; set; } = null!; @@ -268,7 +275,9 @@ public class DeploymentManagerActor : ReceiveActor, IWithTimers } var enabledConfigs = msg.Configs.Where(c => c.IsEnabled).ToList(); - _totalDeployedCount = msg.Configs.Count; + _deployedInstanceNames.Clear(); + foreach (var c in msg.Configs) + _deployedInstanceNames.Add(c.InstanceUniqueName); _logger.LogInformation( "Loaded {Total} deployed configs ({Enabled} enabled) from SQLite", msg.Configs.Count, enabledConfigs.Count); @@ -436,7 +445,7 @@ public class DeploymentManagerActor : ReceiveActor, IWithTimers // Create the Instance Actor immediately CreateInstanceActor(instanceName, command.FlattenedConfigurationJson); if (!isRedeploy) - _totalDeployedCount++; + _deployedInstanceNames.Add(instanceName); UpdateInstanceCounts(); // Persist to SQLite and clear static overrides asynchronously @@ -510,7 +519,7 @@ public class DeploymentManagerActor : ReceiveActor, IWithTimers if (_instanceActors.Remove(result.InstanceName, out var orphan)) Context.Stop(orphan); if (!result.IsRedeploy) - _totalDeployedCount = Math.Max(0, _totalDeployedCount - 1); + _deployedInstanceNames.Remove(result.InstanceName); UpdateInstanceCounts(); result.OriginalSender.Tell(new DeploymentStatusResponse( @@ -657,7 +666,6 @@ public class DeploymentManagerActor : ReceiveActor, IWithTimers // left intact — so when Terminated fires, HandleTerminated calls // ApplyDeployment(isRedeploy: true) and RESURRECTS the just-deleted instance, // with the counter now inconsistent. Cancel the pending redeploy first. - var wasPresent = false; if (_terminatingActorsByName.TryGetValue(instanceName, out var terminatingRef)) { // Drop the buffered command so HandleTerminated's _pendingRedeploys.Remove @@ -674,20 +682,20 @@ public class DeploymentManagerActor : ReceiveActor, IWithTimers _terminatingActorsByName.Remove(instanceName); // The terminating predecessor is already being stopped by HandleDeploy; // no Context.Stop needed here. - wasPresent = true; } else if (_instanceActors.TryGetValue(instanceName, out var actor)) { Context.Stop(actor); _instanceActors.Remove(instanceName); - wasPresent = true; } - // SiteRuntime-029: only decrement when the instance was actually present - // (live in _instanceActors OR mid-redeploy in _terminatingActorsByName). - // A delete for a wholly-unknown instance must not drive the count negative. - if (wasPresent) - _totalDeployedCount = Math.Max(0, _totalDeployedCount - 1); + // SiteRuntime-032: the deployed count is derived from the authoritative set of + // deployed config names, so removing the name here decrements it. Correct for a + // live, mid-redeploy, OR DISABLED instance (a disabled instance has a config row + // but is absent from both in-memory maps); a delete for a never-deployed instance + // removes nothing and leaves the count unchanged. Supersedes SiteRuntime-029's + // map-presence gate, which leaked the count on disabled-instance deletes. + _deployedInstanceNames.Remove(instanceName); UpdateInstanceCounts(); var sender = Sender; @@ -1547,14 +1555,14 @@ public class DeploymentManagerActor : ReceiveActor, IWithTimers /// /// Updates the health collector with current instance counts. - /// Total deployed = _totalDeployedCount, enabled = running actors, disabled = difference. + /// Total deployed = _deployedInstanceNames.Count, enabled = running actors, disabled = difference. /// private void UpdateInstanceCounts() { _healthCollector?.SetInstanceCounts( - deployed: _totalDeployedCount, + deployed: _deployedInstanceNames.Count, enabled: _instanceActors.Count, - disabled: _totalDeployedCount - _instanceActors.Count); + disabled: _deployedInstanceNames.Count - _instanceActors.Count); } // ── Internal messages ── diff --git a/src/ZB.MOM.WW.ScadaBridge.StoreAndForward/StoreAndForwardService.cs b/src/ZB.MOM.WW.ScadaBridge.StoreAndForward/StoreAndForwardService.cs index 6908cb1d..3cf19641 100644 --- a/src/ZB.MOM.WW.ScadaBridge.StoreAndForward/StoreAndForwardService.cs +++ b/src/ZB.MOM.WW.ScadaBridge.StoreAndForward/StoreAndForwardService.cs @@ -456,6 +456,13 @@ public class StoreAndForwardService ScadaBridgeTelemetry.ClearQueueDepthProvider(provider); _queueDepthProvider = null; } + // StoreAndForward-028: reset the cached depth alongside the registration guard. + // StartAsync re-seeds _bufferedCount from the durable Pending count under this + // same guard; without resetting here, a same-instance Stop->Start would ADD the + // re-read count on top of the leftover in-memory value, double-counting the gauge + // (~2N). Reset to zero so the restart seeds from a clean base. Buffered rows + // remain durable in SQLite, so the re-seed restores the true count. + Interlocked.Exchange(ref _bufferedCount, 0); Interlocked.Exchange(ref _queueDepthProviderRegistered, 0); } diff --git a/tests/ZB.MOM.WW.ScadaBridge.AuditLog.Tests/Central/AuditLogIngestActorTests.cs b/tests/ZB.MOM.WW.ScadaBridge.AuditLog.Tests/Central/AuditLogIngestActorTests.cs index 5bf9ee19..5c3b922d 100644 --- a/tests/ZB.MOM.WW.ScadaBridge.AuditLog.Tests/Central/AuditLogIngestActorTests.cs +++ b/tests/ZB.MOM.WW.ScadaBridge.AuditLog.Tests/Central/AuditLogIngestActorTests.cs @@ -1,6 +1,7 @@ using Akka.Actor; using Akka.TestKit.Xunit2; using Microsoft.EntityFrameworkCore; +using Microsoft.Extensions.DependencyInjection; using Microsoft.Extensions.Logging.Abstractions; using ZB.MOM.WW.ScadaBridge.AuditLog.Central; using ZB.MOM.WW.Audit; @@ -184,6 +185,51 @@ public class AuditLogIngestActorTests : TestKit, IClassFixture r.EventId == poisonId); } + [Fact] + public async Task Receive_WhenRepositoryResolutionThrows_ActorSurvives_RepliesEmpty_CountsFailure() + { + // AuditLog-017 (covers the AuditLog-014 guard): the production ctor resolves the + // scoped repository per message. If scope creation / repository resolution throws + // (transient DI or DbContext-factory fault, pooled-context init, a resolution race + // during host churn), the outer guard must keep the singleton ALIVE, increment the + // failure counter, and still reply with whatever was accepted (empty here) so the + // site keeps its rows Pending and retries — rather than letting the throw restart + // the singleton and drop the captured reply (the site's Ask would then time out). + var counter = new CountingFailureCounter(); + + // A provider with NO IAuditLogRepository registered → GetRequiredService throws + // inside the per-message scope; the failure counter IS registered so the guard's + // catch can surface the fault. + var services = new ServiceCollection(); + services.AddSingleton(counter); + await using var provider = services.BuildServiceProvider(); + + var actor = Sys.ActorOf(Props.Create(() => new AuditLogIngestActor( + (IServiceProvider)provider, NullLogger.Instance))); + + // First batch: resolution throws → empty reply, one counted failure, no restart. + actor.Tell(new IngestAuditEventsCommand( + Enumerable.Range(0, 3).Select(_ => NewEvent(NewSiteId())).ToList()), TestActor); + var reply = ExpectMsg(TimeSpan.FromSeconds(10)); + Assert.Empty(reply.AcceptedEventIds); + Assert.Equal(1, counter.Count); + + // Second batch proves the actor was not restarted/wedged: it still processes + // messages and the guard fires again. + actor.Tell(new IngestAuditEventsCommand( + new List { NewEvent(NewSiteId()) }), TestActor); + var reply2 = ExpectMsg(TimeSpan.FromSeconds(10)); + Assert.Empty(reply2.AcceptedEventIds); + Assert.Equal(2, counter.Count); + } + + /// Counts how many times the guard's catch surfaced a write failure. + private sealed class CountingFailureCounter : ICentralAuditWriteFailureCounter + { + public int Count { get; private set; } + public void Increment() => Count++; + } + /// /// Tiny test double that delegates to a real repository but throws on a /// specified EventId. Used to verify per-row failure isolation: one bad diff --git a/tests/ZB.MOM.WW.ScadaBridge.DataConnectionLayer.Tests/DataConnectionActorAlarmTests.cs b/tests/ZB.MOM.WW.ScadaBridge.DataConnectionLayer.Tests/DataConnectionActorAlarmTests.cs index bce8f57e..dd3bf5b5 100644 --- a/tests/ZB.MOM.WW.ScadaBridge.DataConnectionLayer.Tests/DataConnectionActorAlarmTests.cs +++ b/tests/ZB.MOM.WW.ScadaBridge.DataConnectionLayer.Tests/DataConnectionActorAlarmTests.cs @@ -306,4 +306,84 @@ public class DataConnectionActorAlarmTests : TestKit await alarmable.Received(2).SubscribeAlarmsAsync( "Tank01", Arg.Any(), Arg.Any(), Arg.Any()); } + + // ── DataConnectionLayer-029: a re-subscribe during an orphaned in-flight subscribe + // must not leak a duplicate adapter feed ── + + [Fact] + public async Task DCL029_ResubscribeDuringOrphanedInFlightSubscribe_ReleasesDuplicateFeed_NoLeak() + { + // Regression test for DataConnectionLayer-029. The DCL-023 fix clears the in-flight + // marker on unsubscribe, which reopens a double-subscribe window: unsubscribe (last + // subscriber, subId not stored yet) → a fresh subscribe for the SAME source sees + // neither a stored id nor an in-flight marker, so it issues a SECOND adapter feed → + // both completions fire. The DCL-023 orphan guard does NOT trigger on either + // completion (the re-subscribe re-added the subscriber), so the alarm completion + // handler used to OVERWRITE _alarmSubscriptionIds with the second id — leaking the + // first feed (never unsubscribed, kept streaming). After DCL-029 the handler mirrors + // the tag-path re-check: when a feed is already stored, the redundant completion + // releases its just-created feed instead of overwriting + leaking. + var sub1Started = new TaskCompletionSource(TaskCreationOptions.RunContinuationsAsynchronously); + var sub1Release = new TaskCompletionSource(TaskCreationOptions.RunContinuationsAsynchronously); + var sub2Started = new TaskCompletionSource(TaskCreationOptions.RunContinuationsAsynchronously); + var sub2Release = new TaskCompletionSource(TaskCreationOptions.RunContinuationsAsynchronously); + + var adapter = Substitute.For(); + adapter.ConnectAsync(Arg.Any>(), Arg.Any()) + .Returns(Task.CompletedTask); + var alarmable = (IAlarmSubscribableConnection)adapter; + + var calls = 0; + alarmable.SubscribeAlarmsAsync(Arg.Any(), Arg.Any(), + Arg.Any(), Arg.Any()) + .Returns(_ => + { + if (Interlocked.Increment(ref calls) == 1) + { + sub1Started.TrySetResult(); + return sub1Release.Task; + } + sub2Started.TrySetResult(); + return sub2Release.Task; + }); + alarmable.UnsubscribeAlarmsAsync(Arg.Any(), Arg.Any()) + .Returns(Task.CompletedTask); + + var actor = Sys.ActorOf(Props.Create(() => new DataConnectionActor( + "conn", adapter, _options, _health, _factory, "OpcUa"))); + + // (1) Subscribe A — adapter subscribe #1 parks, in-flight={Tank01}. + actor.Tell(new SubscribeAlarmsRequest("c1", "instA", "conn", "Tank01", null, DateTimeOffset.UtcNow)); + await sub1Started.Task.WaitAsync(TimeSpan.FromSeconds(5)); + + // (2) Last subscriber unsubscribes while subscribe #1 is in flight — clears the + // in-flight marker (DCL-023); subId#1 is not stored yet so no teardown happens. + actor.Tell(new UnsubscribeAlarmsRequest("unsub-c1", "instA", "conn", "Tank01", DateTimeOffset.UtcNow)); + await Task.Delay(150); + + // (3) Fresh subscribe for the SAME source before #1 completes — neither a stored id + // nor an in-flight marker exists, so the actor issues a SECOND adapter subscribe. + actor.Tell(new SubscribeAlarmsRequest("c2", "instB", "conn", "Tank01", null, DateTimeOffset.UtcNow)); + await sub2Started.Task.WaitAsync(TimeSpan.FromSeconds(5)); + + // (4) Complete subscribe #1 → a subscriber exists again (B re-added), so the orphan + // guard does NOT fire and subId#1 is stored as the live feed. + sub1Release.SetResult("alarm-sub-1"); + await Task.Delay(150); + + // (5) Complete subscribe #2 → a feed is already stored, so this redundant completion + // releases its just-created feed (#2) instead of overwriting + leaking subId#1. + sub2Release.SetResult("alarm-sub-2"); + await Task.Delay(300); + + // The duplicate feed (#2) is released exactly once; the first feed (#1) is retained. + await alarmable.Received(1).UnsubscribeAlarmsAsync("alarm-sub-2", Arg.Any()); + await alarmable.DidNotReceive().UnsubscribeAlarmsAsync("alarm-sub-1", Arg.Any()); + + // The retained feed (#1) is what a later unsubscribe tears down — proving subId#1, + // not the duplicate, is the id _alarmSubscriptionIds actually tracks (no leak). + actor.Tell(new UnsubscribeAlarmsRequest("unsub-c2", "instB", "conn", "Tank01", DateTimeOffset.UtcNow)); + await Task.Delay(200); + await alarmable.Received(1).UnsubscribeAlarmsAsync("alarm-sub-1", Arg.Any()); + } } diff --git a/tests/ZB.MOM.WW.ScadaBridge.InboundAPI.Tests/RouteHelperTests.cs b/tests/ZB.MOM.WW.ScadaBridge.InboundAPI.Tests/RouteHelperTests.cs index 6fa7827b..f8f0b36c 100644 --- a/tests/ZB.MOM.WW.ScadaBridge.InboundAPI.Tests/RouteHelperTests.cs +++ b/tests/ZB.MOM.WW.ScadaBridge.InboundAPI.Tests/RouteHelperTests.cs @@ -279,6 +279,41 @@ public class RouteHelperTests Assert.True(seen.IsCancellationRequested); // the client-abort token cancels the wait } + [Fact] + public async Task WaitForAttribute_SlowTimedOutResponse_NotPreemptedByLocalBackstop() + { + // InboundAPI-031: WaitForAttribute must NOT impose a local wait-timeout backstop. + // The site enforces the wait timeout and returns Matched=false; the round trip is + // bounded by CommunicationService's Ask (timeout + IntegrationTimeout). A local CTS + // of `timeout + small grace` (the prior InboundAPI-029 approach) was TIGHTER than + // that round-trip budget, so a slow-but-valid timed-out response would be cancelled + // into an exception (a 500) instead of the spec §6 `false`. With the backstop + // removed, a response arriving well after the (tiny) wait timeout still returns a + // clean `false`, and the token the router observed was never cancelled by RouteHelper + // — if a tight local backstop were reintroduced, the honoured token below would throw. + SiteResolves("inst-1", "SiteA"); + CancellationToken seen = default; + _router.RouteToWaitForAttributeAsync("SiteA", Arg.Any(), Arg.Do(t => seen = t)) + .Returns(async ci => + { + var token = (CancellationToken)ci[2]; + // Simulate the site enforcing the wait + a round trip far longer than the + // tiny wait timeout; honour the token so a (re)introduced local backstop + // would surface here as an OperationCanceledException. + await Task.Delay(TimeSpan.FromMilliseconds(400), token); + return new RouteToWaitForAttributeResponse( + ((RouteToWaitForAttributeRequest)ci[1]).CorrelationId, + Matched: false, Value: null, Quality: null, TimedOut: true, + Success: true, ErrorMessage: null, DateTimeOffset.UtcNow); + }); + + var matched = await CreateHelper().To("inst-1") + .WaitForAttribute("Flag", true, TimeSpan.FromMilliseconds(20)); + + Assert.False(matched); // clean spec §6 false, not a cancellation + Assert.False(seen.IsCancellationRequested); // no local wait-timeout backstop fired + } + [Fact] public async Task WaitForAttribute_WithParentExecutionId_CarriesItOnRequest() { diff --git a/tests/ZB.MOM.WW.ScadaBridge.SiteRuntime.Tests/Actors/DeploymentManagerRedeployTests.cs b/tests/ZB.MOM.WW.ScadaBridge.SiteRuntime.Tests/Actors/DeploymentManagerRedeployTests.cs index 0f4ca1ac..e6980a99 100644 --- a/tests/ZB.MOM.WW.ScadaBridge.SiteRuntime.Tests/Actors/DeploymentManagerRedeployTests.cs +++ b/tests/ZB.MOM.WW.ScadaBridge.SiteRuntime.Tests/Actors/DeploymentManagerRedeployTests.cs @@ -240,6 +240,47 @@ public class DeploymentManagerRedeployTests : TestKit, IDisposable Assert.Equal(0, health.LastDeployedCount); } + [Fact] + public async Task SR032_DeleteDisabledInstance_DecrementsDeployedCount() + { + // Regression test for SiteRuntime-032. SiteRuntime-029 gated the deployed-count + // decrement on the instance being present in _instanceActors OR mid-redeploy in + // _terminatingActorsByName. A DISABLED instance is in NEITHER map (disable removes + // it from _instanceActors and never adds it to the terminating shadow) yet still has + // a deployed-config row counted as deployed — so deleting a disabled instance + // skipped the decrement and leaked the deployed/disabled tally on the health + // dashboard. After the fix the count is derived from the authoritative set of + // deployed config names, so a delete decrements for a disabled instance too. + var health = new CountCapturingHealthCollector(); + var actor = CreateDeploymentManager(health); + await Task.Delay(500); + + // Deploy → deployed count 1. + actor.Tell(new DeployInstanceCommand( + "dep-1", "DisablePump", "h1", MakeConfigJson("DisablePump"), "admin", DateTimeOffset.UtcNow)); + var deploy = ExpectMsg(TimeSpan.FromSeconds(5)); + Assert.Equal(DeploymentStatus.Success, deploy.Status); + await Task.Delay(300); + Assert.Equal(1, health.LastDeployedCount); + + // Disable → the instance is still deployed (count stays 1), just not enabled. + actor.Tell(new DisableInstanceCommand("cmd-1", "DisablePump", DateTimeOffset.UtcNow)); + var disable = ExpectMsg(TimeSpan.FromSeconds(5)); + Assert.True(disable.Success); + Assert.Equal(1, health.LastDeployedCount); + + // Delete the DISABLED instance → the deployed count must return to 0. + // (The SiteRuntime-029 regression left it stuck at 1.) + actor.Tell(new DeleteInstanceCommand("del-1", "DisablePump", DateTimeOffset.UtcNow)); + var delete = ExpectMsg(TimeSpan.FromSeconds(5)); + Assert.True(delete.Success); + Assert.Equal(0, health.LastDeployedCount); + + // No deployed-config row remains. + var configs = await _storage.GetAllDeployedConfigsAsync(); + Assert.DoesNotContain(configs, c => c.InstanceUniqueName == "DisablePump"); + } + [Fact] public async Task Redeploy_ExistingInstance_DoesNotOverCountDeployedInstances() { diff --git a/tests/ZB.MOM.WW.ScadaBridge.StoreAndForward.Tests/QueueDepthGaugeTests.cs b/tests/ZB.MOM.WW.ScadaBridge.StoreAndForward.Tests/QueueDepthGaugeTests.cs index f89d5840..57dcea6c 100644 --- a/tests/ZB.MOM.WW.ScadaBridge.StoreAndForward.Tests/QueueDepthGaugeTests.cs +++ b/tests/ZB.MOM.WW.ScadaBridge.StoreAndForward.Tests/QueueDepthGaugeTests.cs @@ -209,6 +209,50 @@ public class QueueDepthGaugeTests : IAsyncLifetime, IDisposable Assert.Equal(0, ReadQueueDepthGauge()); } + /// + /// StoreAndForward-028: a same-instance Stop→Start must re-seed the cached depth from a + /// clean base. resets the one-time + /// registration guard so a later + /// re-registers and re-seeds _bufferedCount from the durable Pending count; if + /// StopAsync did not also reset _bufferedCount, the restart would ADD the re-read + /// count on top of the leftover in-memory value, double-counting the gauge to ~2N. + /// + [Fact] + public async Task StartAsync_AfterStop_ReseedsFromCleanBase_NoDoubleCount() + { + // One durable Pending row that survives the stop/restart in SQLite. + await _storage.EnqueueAsync(new StoreAndForwardMessage + { + Id = Guid.NewGuid().ToString("N"), + Category = StoreAndForwardCategory.ExternalSystem, + Target = "api", + PayloadJson = "{}", + Status = StoreAndForwardMessageStatus.Pending, + CreatedAt = DateTimeOffset.UtcNow, + MaxRetries = 3 + }); + + var svc = new StoreAndForwardService( + _storage, + new StoreAndForwardOptions { RetryTimerInterval = TimeSpan.FromMinutes(10) }, + NullLogger.Instance); + + // First start seeds the cached count from the durable store → 1. + await svc.StartAsync(); + Assert.Equal(1, ReadQueueDepthGauge()); + + // Graceful stop resets the registration guard AND the cached count (the fix). + await svc.StopAsync(); + + // Restart the SAME instance: the guard was reset so StartAsync re-seeds from the + // store (still 1 Pending). With the _bufferedCount reset the gauge reports 1, not 2; + // without it the seed would ADD onto the leftover 1 → 2 (the double-count bug). + await svc.StartAsync(); + Assert.Equal(1, ReadQueueDepthGauge()); + + await svc.StopAsync(); + } + [Fact] public async Task Gauge_SeedsFromExistingPendingRows_OnStart() {