Compare commits
6 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 016bdf9c3c | |||
| 9f634e37c3 | |||
| 2502e4d10a | |||
| 8c67ffad2a | |||
| c9b236e507 | |||
| 0c82ffcbe6 |
@@ -8,7 +8,7 @@
|
||||
| Last reviewed | 2026-05-16 |
|
||||
| Reviewer | claude-agent |
|
||||
| Commit reviewed | `9c60592` |
|
||||
| Open findings | 10 |
|
||||
| Open findings | 6 |
|
||||
|
||||
## Summary
|
||||
|
||||
@@ -113,7 +113,7 @@ template-aggregate contract the callers depend on.
|
||||
|--|--|
|
||||
| Severity | Medium |
|
||||
| Category | Security |
|
||||
| Status | Open |
|
||||
| Status | Resolved |
|
||||
| Location | `src/ScadaLink.ConfigurationDatabase/DesignTimeDbContextFactory.cs:21-22` |
|
||||
|
||||
**Description**
|
||||
@@ -136,7 +136,19 @@ and never use `sa`.
|
||||
|
||||
**Resolution**
|
||||
|
||||
_Unresolved._
|
||||
Resolved 2026-05-16 (commit pending). Root cause confirmed against source: the factory
|
||||
fell back to a literal `User Id=sa;Password=YourPassword;...` connection string when no
|
||||
configured value was found. Removed the hardcoded fallback entirely. The factory now
|
||||
resolves the connection string from the Host's appsettings files or, when those are not
|
||||
present, from the `SCADALINK_DESIGNTIME_CONNECTIONSTRING` environment variable, and
|
||||
throws a clear `InvalidOperationException` (naming both the config key and the env var)
|
||||
when neither yields a value. Also hardened `SetBasePath` to be applied only when the
|
||||
`ScadaLink.Host` directory exists, so the factory degrades cleanly instead of throwing
|
||||
`DirectoryNotFoundException` when run from a context without a sibling Host folder.
|
||||
Regression tests added in `DesignTimeDbContextFactoryTests.cs`:
|
||||
`CreateDbContext_NoConnectionStringConfigured_ThrowsClearException`,
|
||||
`CreateDbContext_ConnectionStringFromEnvironmentVariable_IsUsed`, and
|
||||
`DesignTimeDbContextFactory_SourceContainsNoHardcodedSaCredential`.
|
||||
|
||||
### ConfigurationDatabase-003 — No-arg `AddConfigurationDatabase()` silently registers nothing
|
||||
|
||||
@@ -144,7 +156,7 @@ _Unresolved._
|
||||
|--|--|
|
||||
| Severity | Medium |
|
||||
| Category | Error handling & resilience |
|
||||
| Status | Open |
|
||||
| Status | Resolved |
|
||||
| Location | `src/ScadaLink.ConfigurationDatabase/ServiceCollectionExtensions.cs:44-49` |
|
||||
|
||||
**Description**
|
||||
@@ -166,7 +178,21 @@ name (e.g. `AddConfigurationDatabaseNoOp()`), and remove the stale "Phase 0" wor
|
||||
|
||||
**Resolution**
|
||||
|
||||
_Unresolved._
|
||||
Resolved 2026-05-16 (commit pending). Root cause confirmed against source: the
|
||||
parameterless `AddConfigurationDatabase()` overload returned `services` unchanged,
|
||||
registering no `DbContext`, repositories, `IAuditService`, or `IInstanceLocator`.
|
||||
Applied the recommendation's first option: the overload is now marked
|
||||
`[Obsolete(..., error: true)]` so any source reference is a compile-time failure, and
|
||||
its body throws `InvalidOperationException` with an actionable message as
|
||||
defence-in-depth (covering reflection-based invocation or suppressed warnings). The
|
||||
stale "Phase 0 stubs / backward compatibility" XML comment was replaced with one
|
||||
explaining the obsoletion. The pre-existing
|
||||
`ServiceRegistrationTests.AddConfigurationDatabase_NoArgs_DoesNotThrow` test in
|
||||
`UnitTest1.cs`, which encoded the old buggy no-op contract, was updated to
|
||||
`AddConfigurationDatabase_NoArgs_FailsFast` to assert the corrected behaviour.
|
||||
New regression tests added in `ServiceCollectionExtensionsTests.cs`:
|
||||
`AddConfigurationDatabase_NoArgOverload_FailsFastWithClearMessage` and
|
||||
`AddConfigurationDatabase_NoArgOverload_IsMarkedObsoleteAsError`.
|
||||
|
||||
### ConfigurationDatabase-004 — Secret-bearing columns stored in plaintext with no protection
|
||||
|
||||
@@ -174,7 +200,7 @@ _Unresolved._
|
||||
|--|--|
|
||||
| Severity | Medium |
|
||||
| Category | Security |
|
||||
| Status | Open |
|
||||
| Status | Resolved |
|
||||
| Location | `src/ScadaLink.ConfigurationDatabase/Configurations/NotificationConfiguration.cs:56-57`, `src/ScadaLink.ConfigurationDatabase/Configurations/ExternalSystemConfiguration.cs:25-26,75-77` |
|
||||
|
||||
**Description**
|
||||
@@ -199,7 +225,35 @@ doc to state the chosen at-rest protection.
|
||||
|
||||
**Resolution**
|
||||
|
||||
_Unresolved._
|
||||
Resolved 2026-05-16 (commit pending). Root cause confirmed against source:
|
||||
`SmtpConfiguration.Credentials`, `ExternalSystemDefinition.AuthConfiguration`, and
|
||||
`DatabaseConnectionDefinition.ConnectionString` were mapped as ordinary `nvarchar(4000)`
|
||||
columns and persisted verbatim.
|
||||
|
||||
Implemented the recommendation's first option — an in-module EF Core value converter
|
||||
backed by ASP.NET Data Protection, which the module already uses
|
||||
(`IDataProtectionKeyContext`, `AddDataProtection().PersistKeysToDbContext`). Added
|
||||
`EncryptedStringConverter` (purpose-scoped `IDataProtector`; `Protect` on write,
|
||||
`Unprotect` on read; null-safe; surfaces a clear message on a `CryptographicException`).
|
||||
`ScadaLinkDbContext` gained an `(options, IDataProtectionProvider)` constructor and
|
||||
applies the converter to the three secret columns in `OnModelCreating`; the DI
|
||||
registration in `ServiceCollectionExtensions` now constructs the context with the
|
||||
registered provider. The secret columns were widened to `HasMaxLength(8000)` (EF maps
|
||||
this to `nvarchar(max)` on SQL Server) so ciphertext expansion cannot truncate the
|
||||
value; migration `20260517010521_EncryptSecretColumns` carries the column-type change.
|
||||
Regression tests added in `SecretEncryptionTests.cs` verify the raw column value is
|
||||
never the plaintext secret and that EF transparently decrypts on read, for all three
|
||||
columns plus a null round-trip.
|
||||
|
||||
The encryption scheme itself is fully in-module; the only remaining cross-cutting item
|
||||
is a documentation gap — the design doc does not yet state encryption-at-rest for these
|
||||
fields. That doc update is outside this module's editable scope (constraint: edit only
|
||||
`src/ScadaLink.ConfigurationDatabase`, the tests, and this file) and is surfaced here
|
||||
for a follow-up to `docs/requirements/Component-ConfigurationDatabase.md`. The audit
|
||||
secret-leak concern is mitigated separately by CD-007's serializer hardening; whether
|
||||
callers should additionally redact secret-bearing entities before passing them to
|
||||
`IAuditService` is a caller-side concern in other modules and is also surfaced for
|
||||
follow-up. The code fix in this module is complete.
|
||||
|
||||
### ConfigurationDatabase-005 — Audit `Id` type disagrees with the design doc
|
||||
|
||||
@@ -264,7 +318,7 @@ _Unresolved._
|
||||
|--|--|
|
||||
| Severity | Medium |
|
||||
| Category | Error handling & resilience |
|
||||
| Status | Open |
|
||||
| Status | Resolved |
|
||||
| Location | `src/ScadaLink.ConfigurationDatabase/Services/AuditService.cs:28-30` |
|
||||
|
||||
**Description**
|
||||
@@ -289,7 +343,23 @@ and document that decision against the design doc's transactional-guarantee sect
|
||||
|
||||
**Resolution**
|
||||
|
||||
_Unresolved._
|
||||
Resolved 2026-05-16 (commit pending). Root cause confirmed against source: `LogAsync`
|
||||
called `JsonSerializer.Serialize(afterState)` with default options, so any `afterState`
|
||||
graph containing a reference cycle threw `JsonException` — and because the audit entry
|
||||
commits in the same transaction as the change it records, that exception rolled back
|
||||
the entire business operation.
|
||||
|
||||
Fix applied per the recommendation: `AuditService` now serializes via a static
|
||||
`JsonSerializerOptions` configured with `ReferenceHandler.IgnoreCycles` and
|
||||
`MaxDepth = 32`. The serialization is additionally wrapped in a `SerializeAfterState`
|
||||
helper that catches a residual `JsonException`/`NotSupportedException` and substitutes a
|
||||
small diagnostic placeholder JSON (`AuditSerializationError` + `StateType`) — an explicit
|
||||
decision that an audit-serialization failure must **degrade gracefully** and never roll
|
||||
back the audited operation. The audit entry is always recorded; the design doc's
|
||||
transactional-guarantee section ("if the change succeeds, the audit entry is always
|
||||
recorded") is thereby honoured even for pathological state objects. Regression test
|
||||
added in `AuditServiceTests.cs`:
|
||||
`LogAsync_AfterStateWithReferenceCycle_DoesNotThrow_AndDoesNotRollBackOperation`.
|
||||
|
||||
### ConfigurationDatabase-008 — `GetApprovedKeysForMethodAsync` CSV parsing silently drops malformed ids
|
||||
|
||||
|
||||
@@ -8,7 +8,7 @@
|
||||
| Last reviewed | 2026-05-16 |
|
||||
| Reviewer | claude-agent |
|
||||
| Commit reviewed | `9c60592` |
|
||||
| Open findings | 8 |
|
||||
| Open findings | 2 |
|
||||
|
||||
## Summary
|
||||
|
||||
@@ -287,7 +287,7 @@ unbounded code and passes after. Fixed by the commit whose message references
|
||||
|--|--|
|
||||
| Severity | Medium |
|
||||
| Category | Correctness & logic bugs |
|
||||
| Status | Open |
|
||||
| Status | Resolved |
|
||||
| Location | `src/ScadaLink.DataConnectionLayer/Actors/DataConnectionActor.cs:645-673,721-756` |
|
||||
|
||||
**Description**
|
||||
@@ -303,6 +303,14 @@ decrement, and the totals can drift above `_totalSubscribed`. Over repeated
|
||||
disconnect/reconnect cycles the health report's good/bad/uncertain counts become
|
||||
unreliable.
|
||||
|
||||
**Verification note**: Confirmed against source. The root cause is broader than the
|
||||
reconnect path the finding describes: `HandleUnsubscribe` also never removes a tag
|
||||
from `_lastTagQuality` nor decrements its quality bucket, so an unsubscribed tag
|
||||
lingers and `PushBadQualityForAllTags` (which sets `_tagsBadQuality =
|
||||
_lastTagQuality.Count`) over-counts it — driving the bad-quality count above
|
||||
`_totalSubscribed` even without a re-subscribe. Both the unsubscribe leak and the
|
||||
re-subscribe drift are real.
|
||||
|
||||
**Recommendation**
|
||||
|
||||
On `BecomeConnected` after a re-subscribe (or in `ReSubscribeAll`), clear
|
||||
@@ -312,7 +320,17 @@ fresh `TagValueReceived` messages. Alternatively recompute the buckets from
|
||||
|
||||
**Resolution**
|
||||
|
||||
_Unresolved._
|
||||
Resolved 2026-05-16 (commit pending). `HandleUnsubscribe` now removes each
|
||||
unsubscribed tag from `_lastTagQuality` and decrements the corresponding quality
|
||||
bucket, then reports the corrected counters via `UpdateTagQuality`/`UpdateTagResolution`;
|
||||
`ReSubscribeAll` clears `_lastTagQuality` and zeroes the three quality counters so
|
||||
post-reconnect tags are repopulated from fresh `TagValueReceived` messages instead of
|
||||
only incrementing. Regression test
|
||||
`DCL006_DisconnectAfterUnsubscribe_BadQualityCountMatchesRemainingTags` subscribes two
|
||||
tags, pushes Good values, unsubscribes one, then disconnects and asserts
|
||||
`PushBadQualityForAllTags` reports exactly 1 bad tag (the reconnect is gated open so
|
||||
`ReSubscribeAll` does not run before the assertion); it reports 2 against the pre-fix
|
||||
code and 1 after.
|
||||
|
||||
### DataConnectionLayer-007 — `ReadBatchAsync` aborts the whole batch on the first failing tag
|
||||
|
||||
@@ -320,7 +338,7 @@ _Unresolved._
|
||||
|--|--|
|
||||
| Severity | Medium |
|
||||
| Category | Correctness & logic bugs |
|
||||
| Status | Open |
|
||||
| Status | Resolved |
|
||||
| Location | `src/ScadaLink.DataConnectionLayer/Adapters/OpcUaDataConnection.cs:187-195` |
|
||||
|
||||
**Description**
|
||||
@@ -333,6 +351,9 @@ has a `Success`/`ErrorMessage` shape designed to carry per-tag failures. The bat
|
||||
also fully serial (one round-trip per tag), defeating the point of a batch API; the
|
||||
design doc lists `ReadBatch`/`WriteBatch` as first-class operations.
|
||||
|
||||
**Verification note**: Confirmed against source — `ReadAsync` re-throws on any
|
||||
non-`OperationCanceledException`, aborting the whole batch.
|
||||
|
||||
**Recommendation**
|
||||
|
||||
Catch per-tag exceptions inside the loop and store a failed `ReadResult` for that tag
|
||||
@@ -342,7 +363,17 @@ for all node IDs (`RealOpcUaClient.ReadValueAsync` already builds a
|
||||
|
||||
**Resolution**
|
||||
|
||||
_Unresolved._
|
||||
Resolved 2026-05-16 (commit pending). `ReadBatchAsync` now wraps each per-tag
|
||||
`ReadAsync` in a try/catch: a per-tag exception is recorded as a failed `ReadResult`
|
||||
(`Success: false`, message = the exception message) so the batch returns a complete
|
||||
result map for every requested tag; `OperationCanceledException` is still propagated
|
||||
so a cancelled batch aborts as a whole. The per-tag-serial loop and single-service-call
|
||||
optimisation were deliberately left for a follow-up — they are a performance concern,
|
||||
not the correctness bug this finding raised. Regression test
|
||||
`DCL007_ReadBatch_ReturnsPerTagResults_WhenOneTagFails` reads three tags where the
|
||||
middle one throws and asserts all three appear in the result map with the failing one
|
||||
marked unsuccessful; it threw (no map returned) against the pre-fix code and passes
|
||||
after.
|
||||
|
||||
### DataConnectionLayer-008 — `HandleUnsubscribe` is O(n^2) over instances and rechecks `_unresolvedTags` redundantly
|
||||
|
||||
@@ -379,9 +410,9 @@ _Unresolved._
|
||||
|
||||
| | |
|
||||
|--|--|
|
||||
| Severity | Medium |
|
||||
| Severity | Medium — partially design-doc work outside this module's editable scope |
|
||||
| Category | Design-document adherence |
|
||||
| Status | Open |
|
||||
| Status | Resolved |
|
||||
| Location | `src/ScadaLink.DataConnectionLayer/Actors/DataConnectionActor.cs:189,242-297,379-449`, `docs/requirements/Component-DataConnectionLayer.md:73-85` |
|
||||
|
||||
**Description**
|
||||
@@ -398,6 +429,10 @@ all. A reviewer or operator reading `Component-DataConnectionLayer.md` would not
|
||||
predict this behaviour, and the 60 s threshold is a magic constant not exposed via
|
||||
`DataConnectionOptions`.
|
||||
|
||||
**Verification note**: Confirmed against source. The hard-coded
|
||||
`StableConnectionThreshold = TimeSpan.FromSeconds(60)` `static readonly` field and the
|
||||
`_consecutiveUnstableDisconnects` failover path both exist as described.
|
||||
|
||||
**Recommendation**
|
||||
|
||||
Update `Component-DataConnectionLayer.md` to document the unstable-disconnect failover
|
||||
@@ -406,7 +441,19 @@ path and the stability threshold, and move the 60 s threshold into
|
||||
|
||||
**Resolution**
|
||||
|
||||
_Unresolved._
|
||||
Resolved 2026-05-16 (commit pending). The configurability half of the recommendation
|
||||
is done: the hard-coded `StableConnectionThreshold` constant was removed from
|
||||
`DataConnectionActor` and replaced with a new `DataConnectionOptions.StableConnectionThreshold`
|
||||
property (60 s default), bindable from the `DataConnectionLayer` `appsettings.json`
|
||||
section like `ReconnectInterval`/`TagResolutionRetryInterval`/`WriteTimeout`. Regression
|
||||
test `DCL009_StableConnectionThreshold_IsConfigurable_WithSixtySecondDefault` guards
|
||||
the default and the setter. **The documentation half is out of this module's editable
|
||||
scope** — `docs/requirements/Component-DataConnectionLayer.md` (lines 73-85) still
|
||||
describes only the connect-failure failover path and does not mention the
|
||||
unstable-disconnect trigger. **Action required (surfaced):** the DCL design doc should
|
||||
be updated to document the unstable-disconnect failover path and the configurable
|
||||
stability threshold; that edit was deliberately not made here because this task is
|
||||
scoped to `src/ScadaLink.DataConnectionLayer`, tests, and this findings file only.
|
||||
|
||||
### DataConnectionLayer-010 — Tag-resolution retry can issue duplicate concurrent subscribe attempts
|
||||
|
||||
@@ -414,7 +461,7 @@ _Unresolved._
|
||||
|--|--|
|
||||
| Severity | Medium |
|
||||
| Category | Correctness & logic bugs |
|
||||
| Status | Open |
|
||||
| Status | Resolved |
|
||||
| Location | `src/ScadaLink.DataConnectionLayer/Actors/DataConnectionActor.cs:594-619,689-703` |
|
||||
|
||||
**Description**
|
||||
@@ -429,6 +476,10 @@ monitored items / leaked subscription IDs (the second success overwrites
|
||||
with no `UnsubscribeAsync` call). The timer-cancel condition in
|
||||
`HandleTagResolutionSucceeded` is also non-deterministic for the same reason.
|
||||
|
||||
**Verification note**: Confirmed against source — `HandleRetryTagResolution` dispatched
|
||||
`SubscribeAsync` for every tag in `_unresolvedTags` on every tick with no in-flight
|
||||
guard.
|
||||
|
||||
**Recommendation**
|
||||
|
||||
Remove tags from `_unresolvedTags` (into an "in-flight" set) when a retry is
|
||||
@@ -437,7 +488,18 @@ subscribe attempts and makes the timer-cancel condition deterministic.
|
||||
|
||||
**Resolution**
|
||||
|
||||
_Unresolved._
|
||||
Resolved 2026-05-16 (commit pending). A new `_resolutionInFlight` `HashSet<string>`
|
||||
tracks tags whose retry `SubscribeAsync` is currently outstanding.
|
||||
`HandleRetryTagResolution` now dispatches only for unresolved tags **not** already in
|
||||
flight (and skips entirely if all are in flight), adding each dispatched tag to the
|
||||
set; `HandleTagResolutionSucceeded` and `HandleTagResolutionFailed` remove the tag
|
||||
from the set when its attempt completes, and `HandleUnsubscribe`/`ReSubscribeAll`
|
||||
clear stale entries. This prevents overlapping duplicate subscribe attempts and the
|
||||
resulting orphaned monitored items. Regression test
|
||||
`DCL010_TagResolutionRetry_DoesNotIssueDuplicateConcurrentSubscribes` gives a tag a
|
||||
genuine initial failure then a retry `SubscribeAsync` that never completes, lets six
|
||||
100 ms retry ticks elapse, and asserts exactly one retry was dispatched (2 total
|
||||
subscribe calls); the pre-fix code dispatched on every tick (6 total).
|
||||
|
||||
### DataConnectionLayer-011 — Stale subscription callbacks from disposed adapters can still reach the actor
|
||||
|
||||
@@ -445,7 +507,7 @@ _Unresolved._
|
||||
|--|--|
|
||||
| Severity | Medium |
|
||||
| Category | Error handling & resilience |
|
||||
| Status | Open |
|
||||
| Status | Resolved |
|
||||
| Location | `src/ScadaLink.DataConnectionLayer/Actors/DataConnectionActor.cs:486-489,278-285,416-425`, `src/ScadaLink.DataConnectionLayer/Adapters/OpcUaDataConnection.cs:252-262` |
|
||||
|
||||
**Description**
|
||||
@@ -460,6 +522,10 @@ data with the new endpoint's data and briefly reporting a value the active endpo
|
||||
never produced. There is no per-adapter generation/epoch tag on `TagValueReceived` to
|
||||
distinguish current from stale callbacks.
|
||||
|
||||
**Verification note**: Confirmed against source — `TagValueReceived` carried no
|
||||
adapter identity, and `HandleTagValueReceived` (reachable in `Connected`) processed
|
||||
any such message regardless of which adapter produced it.
|
||||
|
||||
**Recommendation**
|
||||
|
||||
Add an adapter-generation counter incremented on every adapter swap; stamp it onto
|
||||
@@ -468,15 +534,28 @@ generation does not match the current adapter in `HandleTagValueReceived`.
|
||||
|
||||
**Resolution**
|
||||
|
||||
_Unresolved._
|
||||
Resolved 2026-05-16 (commit pending). Implemented exactly as recommended: a new
|
||||
`_adapterGeneration` `int` field is incremented at both adapter-swap sites (the
|
||||
unstable-disconnect failover in `BecomeReconnecting` and the connect-failure failover
|
||||
in `HandleReconnectResult`). The `TagValueReceived` record gained an
|
||||
`AdapterGeneration` field; every subscription callback closure (`HandleSubscribe`, the
|
||||
initial-read seed, `HandleRetryTagResolution`, `ReSubscribeAll`) captures the
|
||||
generation in effect at subscribe time and stamps it onto each `TagValueReceived`.
|
||||
`HandleTagValueReceived` drops any message whose generation no longer matches the
|
||||
current adapter, so a callback fired by a disposed adapter after failover cannot reach
|
||||
an Instance Actor. Regression test
|
||||
`DCL011_StaleTagValueFromOldAdapter_IsNotForwardedAfterFailover` subscribes on the
|
||||
primary, fails over to the backup, then invokes the captured primary callback with a
|
||||
stale value and asserts the subscriber receives nothing; the stale value reached the
|
||||
subscriber against the pre-fix code and is dropped after.
|
||||
|
||||
### DataConnectionLayer-012 — `AutoAcceptUntrustedCerts` defaults to `true`, accepting any server certificate
|
||||
|
||||
| | |
|
||||
|--|--|
|
||||
| Severity | Medium |
|
||||
| Severity | Medium — full secure default also requires a Commons + design-doc change outside this module |
|
||||
| Category | Security |
|
||||
| Status | Open |
|
||||
| Status | Resolved |
|
||||
| Location | `src/ScadaLink.DataConnectionLayer/Adapters/IOpcUaClient.cs:17`, `src/ScadaLink.DataConnectionLayer/Adapters/RealOpcUaClient.cs:49,60-61`, `docs/requirements/Component-DataConnectionLayer.md:116` |
|
||||
|
||||
**Description**
|
||||
@@ -490,6 +569,13 @@ UA link. The design doc explicitly lists `true` as the default. For an industria
|
||||
control link this is a meaningful exposure; a secure-by-default posture would reject
|
||||
untrusted certs unless an operator opts in per connection.
|
||||
|
||||
**Verification note**: Confirmed against source. Note the *authoritative* runtime
|
||||
default does not actually live on `OpcUaConnectionOptions` — for a real connection
|
||||
`OpcUaDataConnection.ConnectAsync` builds `OpcUaConnectionOptions` from
|
||||
`OpcUaEndpointConfig` (in `ScadaLink.Commons`), whose `AutoAcceptUntrustedCerts`
|
||||
property also defaults to `true`. `OpcUaConnectionOptions`' own default is only the
|
||||
fallback used when an `OpcUaConnectionOptions` is constructed directly.
|
||||
|
||||
**Recommendation**
|
||||
|
||||
Default `AutoAcceptUntrustedCerts` to `false` and require explicit per-connection
|
||||
@@ -498,7 +584,21 @@ installed. Update the design doc to reflect the secure default.
|
||||
|
||||
**Resolution**
|
||||
|
||||
_Unresolved._
|
||||
Resolved 2026-05-16 (commit pending). The two in-scope parts of the recommendation
|
||||
are done: (1) `OpcUaConnectionOptions.AutoAcceptUntrustedCerts` now defaults to
|
||||
`false`; (2) `RealOpcUaClient.ConnectAsync` logs a prominent `ILogger` warning
|
||||
whenever the auto-accept certificate validator is installed (an `ILogger<RealOpcUaClient>`
|
||||
was added as an optional constructor parameter, defaulting to `NullLogger`, so
|
||||
existing callers are unaffected). Regression test
|
||||
`DCL012_OpcUaConnectionOptions_AutoAcceptUntrustedCerts_DefaultsToFalse` guards the
|
||||
new secure default. **Two parts remain outside this module's editable scope and are
|
||||
surfaced as action required:** (a) `ScadaLink.Commons.Types.DataConnections.OpcUaEndpointConfig.AutoAcceptUntrustedCerts`
|
||||
still defaults to `true` — since that is the value actually used for a real connection
|
||||
(see verification note above), the Commons default must also be flipped to `false`
|
||||
for the system to be secure-by-default; (b) `docs/requirements/Component-DataConnectionLayer.md`
|
||||
line 116 still documents `true` as the default and must be updated. Both edits were
|
||||
deliberately not made here because this task is scoped to
|
||||
`src/ScadaLink.DataConnectionLayer`, tests, and this findings file only.
|
||||
|
||||
### DataConnectionLayer-013 — Misleading XML comment: `RaiseDisconnected` claims thread safety it does not provide
|
||||
|
||||
|
||||
@@ -8,7 +8,7 @@
|
||||
| Last reviewed | 2026-05-16 |
|
||||
| Reviewer | claude-agent |
|
||||
| Commit reviewed | `9c60592` |
|
||||
| Open findings | 11 |
|
||||
| Open findings | 5 |
|
||||
|
||||
## Summary
|
||||
|
||||
@@ -134,7 +134,7 @@ error) if persistence still fails. Regression test:
|
||||
|--|--|
|
||||
| Severity | Medium |
|
||||
| Category | Error handling & resilience |
|
||||
| Status | Open |
|
||||
| Status | Resolved |
|
||||
| Location | `src/ScadaLink.DeploymentManager/DeploymentService.cs:155-170` |
|
||||
|
||||
**Description**
|
||||
@@ -150,6 +150,11 @@ deployment. Central and site are now divergent: the site is running the new
|
||||
config but central still shows the old state and a non-`Success` deployment
|
||||
record.
|
||||
|
||||
**Verification:** Confirmed against source. The DeploymentManager-001 fix made
|
||||
this strictly worse, not better — after that fix a snapshot-store failure is
|
||||
caught and the record is flipped from `Success` back to `Failed`, so central
|
||||
reports a *failed* deployment while the site is running the new config.
|
||||
|
||||
**Recommendation**
|
||||
|
||||
Wrap the post-success persistence so that, at minimum, the deployment record's
|
||||
@@ -160,7 +165,15 @@ apply.
|
||||
|
||||
**Resolution**
|
||||
|
||||
_Unresolved._
|
||||
Resolved 2026-05-16 (commit pending): `DeployInstanceAsync` now commits the
|
||||
deployment record's terminal status (`UpdateDeploymentRecordAsync` +
|
||||
`SaveChangesAsync`) immediately after the site confirms the apply, *before*
|
||||
touching instance state or the deployed-config snapshot. The post-success
|
||||
instance-state update and `StoreDeployedSnapshotAsync` are wrapped in a
|
||||
best-effort `try`/`catch` that logs loudly for operator reconciliation but no
|
||||
longer flips the already-committed `Success` record back to `Failed`.
|
||||
Regression test:
|
||||
`DeployInstanceAsync_SiteSucceeds_SnapshotWriteFails_RecordStillCommittedSuccess`.
|
||||
|
||||
### DeploymentManager-004 — Site-success but central-delete-failure leaves orphaned site config
|
||||
|
||||
@@ -168,7 +181,7 @@ _Unresolved._
|
||||
|--|--|
|
||||
| Severity | Medium |
|
||||
| Category | Error handling & resilience |
|
||||
| Status | Open |
|
||||
| Status | Resolved |
|
||||
| Location | `src/ScadaLink.DeploymentManager/DeploymentService.cs:312-319` |
|
||||
|
||||
**Description**
|
||||
@@ -182,6 +195,10 @@ normal path (the site no longer has it, so a re-issued delete may fail) and is
|
||||
permanently orphaned. The design states central must not mark the instance
|
||||
deleted until the site confirms — but it does not address the inverse failure.
|
||||
|
||||
**Verification:** Confirmed against source. `DeleteInstanceAsync` has no
|
||||
`try`/`catch` around the post-success block, so any exception from
|
||||
`DeleteInstanceAsync`/`SaveChangesAsync` escapes uncaught to the caller.
|
||||
|
||||
**Recommendation**
|
||||
|
||||
Catch persistence failures in the post-success block and surface a distinct
|
||||
@@ -191,7 +208,13 @@ idempotent and retryable independently of the site command.
|
||||
|
||||
**Resolution**
|
||||
|
||||
_Unresolved._
|
||||
Resolved 2026-05-16 (commit pending): the post-success removal in
|
||||
`DeleteInstanceAsync` (`DeleteInstanceAsync` + `SaveChangesAsync`) is now
|
||||
wrapped in a `try`/`catch`. A persistence failure no longer escapes uncaught —
|
||||
it is logged, recorded with a `DeleteOrphaned` audit entry, and surfaced as a
|
||||
distinct `Result` failure stating the site deleted the instance but the central
|
||||
record is orphaned and must be reconciled. Regression test:
|
||||
`DeleteInstanceAsync_SiteSucceeds_CentralDeleteFails_ReturnsDistinctFailure`.
|
||||
|
||||
### DeploymentManager-005 — `OperationLockManager` leaks a `SemaphoreSlim` per instance name
|
||||
|
||||
@@ -199,7 +222,7 @@ _Unresolved._
|
||||
|--|--|
|
||||
| Severity | Medium |
|
||||
| Category | Performance & resource management |
|
||||
| Status | Open |
|
||||
| Status | Resolved |
|
||||
| Location | `src/ScadaLink.DeploymentManager/OperationLockManager.cs:15-33` |
|
||||
|
||||
**Description**
|
||||
@@ -213,6 +236,9 @@ with the bulk "deploy all out-of-date instances" workflow and instances that
|
||||
are created and deleted over time — this is an unbounded leak of both managed
|
||||
memory and OS handles. Deleted instances' semaphores are never reclaimed.
|
||||
|
||||
**Verification:** Confirmed against source. `_locks` is a `ConcurrentDictionary`
|
||||
with no removal path anywhere in the type.
|
||||
|
||||
**Recommendation**
|
||||
|
||||
Either accept the leak explicitly and document the expected bounded cardinality
|
||||
@@ -223,7 +249,17 @@ At minimum, remove the semaphore entry when an instance is deleted
|
||||
|
||||
**Resolution**
|
||||
|
||||
_Unresolved._
|
||||
Resolved 2026-05-16 (commit pending): `OperationLockManager` now ref-counts each
|
||||
lock entry. A reference is reserved (creating the entry if needed) before the
|
||||
`SemaphoreSlim.WaitAsync`, so concurrent waiters for the same instance share one
|
||||
semaphore and the entry survives until every waiter/holder has released. When
|
||||
the reference count reaches zero — on release, timeout, or cancellation — the
|
||||
entry is removed from the dictionary and the semaphore is `Dispose()`d, so the
|
||||
process no longer accumulates one kernel wait handle per distinct instance name.
|
||||
A `TrackedLockCount` diagnostic property was added to make reclamation testable.
|
||||
Regression tests: `AcquireAsync_ReleasedLock_RemovesSemaphoreEntry`,
|
||||
`AcquireAsync_ManyDistinctInstances_DoesNotAccumulateSemaphores`,
|
||||
`AcquireAsync_ContendedLock_KeepsSemaphoreUntilLastReleaseThenReclaims`.
|
||||
|
||||
### DeploymentManager-006 — Query-the-site-before-redeploy idempotency requirement not implemented
|
||||
|
||||
@@ -294,7 +330,7 @@ stale-rejection) when the query fails. Regression tests:
|
||||
|--|--|
|
||||
| Severity | Medium |
|
||||
| Category | Design-document adherence |
|
||||
| Status | Open |
|
||||
| Status | Resolved |
|
||||
| Location | `src/ScadaLink.DeploymentManager/DeploymentService.cs:334-358,401-406` |
|
||||
|
||||
**Description**
|
||||
@@ -308,6 +344,12 @@ added/removed/changed detail is produced, and the Template Engine's diff
|
||||
capability is not invoked. The UI cannot render a meaningful diff from this
|
||||
result.
|
||||
|
||||
**Verification:** Confirmed against source. The Template Engine already provides
|
||||
`DiffService` + `ConfigurationDiff` (structured Added/Removed/Changed entries
|
||||
for attributes, alarms, and scripts, including data connection binding fields),
|
||||
and `DiffService` is DI-registered — it was simply never wired into the
|
||||
Deployment Manager's comparison path.
|
||||
|
||||
**Recommendation**
|
||||
|
||||
Either implement a real diff (deserialize the stored
|
||||
@@ -318,7 +360,15 @@ down to staleness detection only.
|
||||
|
||||
**Resolution**
|
||||
|
||||
_Unresolved._
|
||||
Resolved 2026-05-16 (commit pending): `GetDeploymentComparisonAsync` now
|
||||
deserializes the stored `DeployedConfigSnapshot.ConfigurationJson` and runs the
|
||||
Template Engine `DiffService` against the freshly flattened current
|
||||
configuration, attaching the resulting `ConfigurationDiff` (added/removed/changed
|
||||
attributes, alarms, scripts) to a new optional `Diff` property on
|
||||
`DeploymentComparisonResult`. `DiffService` is injected into `DeploymentService`.
|
||||
A snapshot that cannot be deserialized (corrupt / older schema) still yields the
|
||||
hash-based staleness result with a null diff, logged at warning level.
|
||||
Regression test: `GetDeploymentComparisonAsync_ProducesStructuredDiff`.
|
||||
|
||||
### DeploymentManager-008 — `DeploymentManagerOptions` is never bound to configuration
|
||||
|
||||
@@ -326,7 +376,7 @@ _Unresolved._
|
||||
|--|--|
|
||||
| Severity | Medium |
|
||||
| Category | Code organization & conventions |
|
||||
| Status | Open |
|
||||
| Status | Resolved |
|
||||
| Location | `src/ScadaLink.DeploymentManager/ServiceCollectionExtensions.cs:7-14` |
|
||||
|
||||
**Description**
|
||||
@@ -341,6 +391,9 @@ to options classes (Options pattern)." `Host/Program.cs` binds
|
||||
`SecurityOptions` and `InboundApiOptions` from configuration sections but has
|
||||
no equivalent for `DeploymentManagerOptions`.
|
||||
|
||||
**Verification:** Confirmed against source. Neither `AddDeploymentManager` nor
|
||||
`Host/Program.cs` binds `DeploymentManagerOptions`.
|
||||
|
||||
**Recommendation**
|
||||
|
||||
Add an `IConfiguration` parameter (or a configure callback) to
|
||||
@@ -349,7 +402,18 @@ Add an `IConfiguration` parameter (or a configure callback) to
|
||||
|
||||
**Resolution**
|
||||
|
||||
_Unresolved._
|
||||
Resolved 2026-05-16 (commit pending): added an
|
||||
`AddDeploymentManager(IServiceCollection, IConfiguration)` overload that binds
|
||||
`DeploymentManagerOptions` to the `ScadaLink:DeploymentManager` configuration
|
||||
section (exposed as `ServiceCollectionExtensions.OptionsSection`). The
|
||||
`Microsoft.Extensions.Options.ConfigurationExtensions` package was added to the
|
||||
project. The original parameterless overload is retained for callers/tests that
|
||||
do not bind configuration. Regression tests:
|
||||
`AddDeploymentManager_WithConfiguration_BindsDeploymentManagerOptions`,
|
||||
`AddDeploymentManager_WithConfiguration_MissingSection_UsesDefaults`. Note: a
|
||||
one-line follow-up in `Host/Program.cs` (call the new overload with
|
||||
`builder.Configuration`) is required to take effect at runtime — that file is
|
||||
outside this module's edit scope and is surfaced for the Host owner.
|
||||
|
||||
### DeploymentManager-009 — Misleading timeout comment on `DeleteInstanceAsync`
|
||||
|
||||
@@ -415,7 +479,7 @@ _Unresolved._
|
||||
|--|--|
|
||||
| Severity | Medium |
|
||||
| Category | Testing coverage |
|
||||
| Status | Open |
|
||||
| Status | Resolved |
|
||||
| Location | `tests/ScadaLink.DeploymentManager.Tests/DeploymentServiceTests.cs:100-151,155-199` |
|
||||
|
||||
**Description**
|
||||
@@ -432,6 +496,13 @@ critical post-response branch (`DeploymentService.cs:154-184`) and the entire
|
||||
delete/disable/enable success path are untested. The `AuditLogs` test
|
||||
(lines 277-289) asserts nothing.
|
||||
|
||||
**Verification:** Partially confirmed. By the time this finding was being
|
||||
resolved, the DeploymentManager-006 fix had already introduced a TestKit-actor
|
||||
seam (`CreateServiceWithCommActor` + `ReconcileProbeActor`) and successful-deploy
|
||||
tests. The genuinely-still-missing coverage was: successful Disable/Enable/Delete
|
||||
paths, per-instance lock serialization during deploy, and the assertionless
|
||||
`AuditLogs` test — those gaps were addressed.
|
||||
|
||||
**Recommendation**
|
||||
|
||||
Introduce a seam to inject a fake/substitute communication path (e.g. an
|
||||
@@ -442,7 +513,20 @@ test assert on `IAuditService.LogAsync`.
|
||||
|
||||
**Resolution**
|
||||
|
||||
_Unresolved._
|
||||
Resolved 2026-05-16 (commit pending): extended the TestKit-actor seam
|
||||
(`ReconcileProbeActor` now also answers lifecycle commands) and added the
|
||||
missing coverage — successful Disable/Enable/Delete (state transition + audit
|
||||
assertions), a successful-deploy audit assertion, and per-instance lock
|
||||
serialization via a new deferred-reply `SerializationProbeActor` that asserts a
|
||||
single instance's concurrent deploys never overlap. The assertionless `AuditLogs`
|
||||
test was replaced with `DeployInstanceAsync_FlatteningFails_DoesNotReachAudit`,
|
||||
which asserts on `IAuditService.LogAsync`. Regression tests:
|
||||
`DisableInstanceAsync_SiteSucceeds_SetsDisabledStateAndAudits`,
|
||||
`EnableInstanceAsync_SiteSucceeds_SetsEnabledStateAndAudits`,
|
||||
`DeleteInstanceAsync_SiteSucceeds_RemovesRecordAndAudits`,
|
||||
`DeployInstanceAsync_SiteSucceeds_WritesDeployAuditEntry`,
|
||||
`DeployInstanceAsync_FlatteningFails_DoesNotReachAudit`,
|
||||
`DeployInstanceAsync_SameInstance_OperationLockSerializesConcurrentDeploys`.
|
||||
|
||||
### DeploymentManager-012 — `LifecycleCommandTimeout` option is dead code
|
||||
|
||||
|
||||
@@ -8,7 +8,7 @@
|
||||
| Last reviewed | 2026-05-16 |
|
||||
| Reviewer | claude-agent |
|
||||
| Commit reviewed | `9c60592` |
|
||||
| Open findings | 11 |
|
||||
| Open findings | 4 |
|
||||
|
||||
## Summary
|
||||
|
||||
@@ -215,7 +215,7 @@ is flipped back to `true`.
|
||||
|--|--|
|
||||
| Severity | Medium |
|
||||
| Category | Design-document adherence |
|
||||
| Status | Open |
|
||||
| Status | Resolved |
|
||||
| Location | `src/ScadaLink.ExternalSystemGateway/ExternalSystemClient.cs:114-115`, `src/ScadaLink.ExternalSystemGateway/DatabaseGateway.cs:86-87` |
|
||||
|
||||
**Description**
|
||||
@@ -242,7 +242,21 @@ should be tracked against the SiteRuntime module.
|
||||
|
||||
**Resolution**
|
||||
|
||||
_Unresolved._
|
||||
Resolved 2026-05-16 (commit pending). `CachedCallAsync` and `CachedWriteAsync` now pass
|
||||
the definition's `MaxRetries` to `EnqueueAsync` verbatim — the `> 0` guard is dropped, so
|
||||
a legitimately-configured `MaxRetries` of 0 ("never retry") is honoured instead of being
|
||||
collapsed to the S&F default. The `RetryDelay > TimeSpan.Zero` guard is deliberately
|
||||
**kept**: `TimeSpan.Zero` is the entity default for an unconfigured field and a literal
|
||||
zero-delay retry loop is not a valid configuration, so falling back to the S&F default
|
||||
interval for an unset delay is correct (only `MaxRetries == 0` is a meaningful operator
|
||||
choice). Regression test `CachedCall_TransientFailure_ZeroMaxRetriesIsHonouredNotTreatedAsUnset`
|
||||
buffers a transient failure and asserts the buffered message carries `MaxRetries == 0`
|
||||
rather than the S&F default; `CachedCall_TransientFailure_BuffersWithSystemRetrySettings`
|
||||
additionally covers a non-default settings pass-through. The companion fix in
|
||||
`SiteExternalSystemRepository.MapExternalSystem` to actually read the `MaxRetries` /
|
||||
`RetryDelay` columns from SQLite remains a tracked follow-up against the SiteRuntime
|
||||
module (outside this module's edit scope) — until then, sites still supply
|
||||
`MaxRetries == 0`, which this fix now correctly honours as "never retry".
|
||||
|
||||
### ExternalSystemGateway-005 — `HttpRequestMessage` and `HttpResponseMessage` are not disposed
|
||||
|
||||
@@ -250,7 +264,7 @@ _Unresolved._
|
||||
|--|--|
|
||||
| Severity | Medium |
|
||||
| Category | Performance & resource management |
|
||||
| Status | Open |
|
||||
| Status | Resolved |
|
||||
| Location | `src/ScadaLink.ExternalSystemGateway/ExternalSystemClient.cs:133-167` |
|
||||
|
||||
**Description**
|
||||
@@ -272,15 +286,22 @@ occurs on the exception paths.
|
||||
|
||||
**Resolution**
|
||||
|
||||
_Unresolved._
|
||||
Resolved 2026-05-16 (commit pending). `InvokeHttpAsync` now declares the request as
|
||||
`using var request` and wraps all response handling in a `using (response)` block, so
|
||||
both `IDisposable` instances (and the request's `StringContent` / the response content
|
||||
stream) are released on the success path **and** on the permanent/transient
|
||||
exception paths. Regression tests `Call_SuccessfulHttp_DisposesRequestAndResponse` and
|
||||
`Call_PermanentFailure_StillDisposesRequestAndResponse` use a disposal-tracking
|
||||
`HttpMessageHandler`/`HttpContent` and assert both the request and the response content
|
||||
are disposed; both were verified to fail before the `using` wrappers were added.
|
||||
|
||||
### ExternalSystemGateway-006 — `BuildUrl` ignores path templates and appends a trailing slash for empty paths
|
||||
|
||||
| | |
|
||||
|--|--|
|
||||
| Severity | Medium |
|
||||
| Severity | Medium — partially re-triaged: trailing-slash bug fixed; path-templating sub-issue is a design decision (see Resolution) |
|
||||
| Category | Correctness & logic bugs |
|
||||
| Status | Open |
|
||||
| Status | Resolved |
|
||||
| Location | `src/ScadaLink.ExternalSystemGateway/ExternalSystemClient.cs:180-196` |
|
||||
|
||||
**Description**
|
||||
@@ -305,7 +326,25 @@ example and state that paths are literal. Also avoid appending a trailing `/` wh
|
||||
|
||||
**Resolution**
|
||||
|
||||
_Unresolved._
|
||||
Resolved 2026-05-16 (commit pending). The **trailing-slash bug** is fixed: `BuildUrl`
|
||||
now appends a `/`-joined path segment only when the method's path is non-empty after
|
||||
trimming, so a method targeting the base URL itself produces `https://host/api` rather
|
||||
than `https://host/api/`. Regression tests `Call_MethodWithEmptyPath_DoesNotAppendTrailingSlash`
|
||||
and `Call_MethodWithPath_BuildsExpectedUrl` (asserting on the captured request URI)
|
||||
cover the empty-path and normal-path cases; the empty-path test was verified to fail
|
||||
before the fix.
|
||||
|
||||
Re-triage of the **path-templating sub-issue** (`{id}` placeholder substitution): this
|
||||
is a genuine design decision, not a code bug, and it requires editing the component
|
||||
design doc — both outside this module's edit scope (`src/`, `tests/`, this file only).
|
||||
The current code treats method paths as literal strings and routes parameters to the
|
||||
query string (GET/DELETE) or JSON body (POST/PUT); a method authored as `/recipes/{id}`
|
||||
sends the `{id}` token verbatim. **Tracked follow-up / surfaced design question:** the
|
||||
design owner must decide whether path templating is in scope — if yes, implement
|
||||
`{name}` substitution in `BuildUrl` and exclude substituted params from the
|
||||
query/body; if no, the `Component-ExternalSystemGateway.md` `/recipes/{id}` example must
|
||||
be changed to a literal path. The trailing-slash defect (the concrete correctness bug
|
||||
in this finding) is fully resolved.
|
||||
|
||||
### ExternalSystemGateway-007 — External error response bodies are echoed verbatim into script-visible error messages
|
||||
|
||||
@@ -313,7 +352,7 @@ _Unresolved._
|
||||
|--|--|
|
||||
| Severity | Medium |
|
||||
| Category | Security |
|
||||
| Status | Open |
|
||||
| Status | Resolved |
|
||||
| Location | `src/ScadaLink.ExternalSystemGateway/ExternalSystemClient.cs:167-177` |
|
||||
|
||||
**Description**
|
||||
@@ -335,15 +374,25 @@ the script. Optionally only include the body when the content type is textual.
|
||||
|
||||
**Resolution**
|
||||
|
||||
_Unresolved._
|
||||
Resolved 2026-05-16 (commit pending). `InvokeHttpAsync` now truncates the external error
|
||||
response body to `MaxErrorBodyChars` (2048) via a `Truncate` helper before embedding it
|
||||
into the transient/permanent exception message — so a misbehaving or hostile endpoint
|
||||
can no longer inflate every script-visible `ErrorMessage` and Site Event Logging entry
|
||||
with a multi-megabyte body. When truncation occurs the message is suffixed with
|
||||
`… [truncated, N chars total]` so the original size is still visible. Regression test
|
||||
`Call_PermanentFailureWithHugeErrorBody_TruncatesErrorMessage` drives a 400 with a
|
||||
500 KB body and asserts the resulting `ErrorMessage` is bounded (< 4096 chars); it was
|
||||
verified to fail (500 040-char message) before the cap was added. Content-type
|
||||
filtering was considered optional in the recommendation and was not implemented — the
|
||||
size cap alone closes the inflation/disclosure vector.
|
||||
|
||||
### ExternalSystemGateway-008 — Cancellation is conflated with transient timeout failure
|
||||
|
||||
| | |
|
||||
|--|--|
|
||||
| Severity | Medium |
|
||||
| Severity | Medium — re-triaged: root cause already fixed in current source (see Resolution) |
|
||||
| Category | Error handling & resilience |
|
||||
| Status | Open |
|
||||
| Status | Resolved |
|
||||
| Location | `src/ScadaLink.ExternalSystemGateway/ErrorClassifier.cs:24-30`, `src/ScadaLink.ExternalSystemGateway/ExternalSystemClient.cs:157-159` |
|
||||
|
||||
**Description**
|
||||
@@ -367,15 +416,37 @@ classification. Only treat a cancellation as a timeout when the supplied token i
|
||||
|
||||
**Resolution**
|
||||
|
||||
_Unresolved._
|
||||
Resolved 2026-05-16 (commit pending). **Re-triage:** the root cause described — a
|
||||
caller-initiated cancellation being misclassified as a transient failure — is **no
|
||||
longer present in the current source** and is not reproducible. `InvokeHttpAsync`
|
||||
already wraps both `SendAsync` and the response-body `ReadAsStringAsync` in ordered
|
||||
`catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested)`
|
||||
filters that rethrow the caller's cancellation *before* the
|
||||
`catch (Exception ex) when (ErrorClassifier.IsTransient(ex))` branch is ever reached
|
||||
(this was added alongside the `ExternalSystemGateway-002` timeout fix). A caller-cancel
|
||||
therefore propagates as `OperationCanceledException` and is never buffered; only the
|
||||
gateway's own timeout token reclassifies as transient.
|
||||
|
||||
`ErrorClassifier.IsTransient(Exception)` does still return `true` for
|
||||
`TaskCanceledException`/`OperationCanceledException`, but that is **correct and
|
||||
intentional**: a `TaskCanceledException` raised by an HTTP timeout *is* a genuine
|
||||
transient failure, and the only caller (`InvokeHttpAsync:238`) is unreachable for a
|
||||
caller-cancellation because the two preceding `when`-filtered catches intercept it
|
||||
first. The transient-vs-cancel decision is contextual (which token fired) and cannot
|
||||
be made from the exception type alone, which is exactly why the call site does it.
|
||||
No source change was required. A regression guard,
|
||||
`CachedCall_CallerCancellation_IsNotBufferedAsTransient`, was added: it cancels the
|
||||
caller token mid-`CachedCall` and asserts an `OperationCanceledException` is thrown and
|
||||
the S&F buffer remains empty (the cancelled work is not retried). The existing
|
||||
`Call_CallerCancellation_IsNotMisreportedAsTimeout` covers the synchronous `Call` path.
|
||||
|
||||
### ExternalSystemGateway-009 — `StoreAndForwardResult` from `EnqueueAsync` is discarded; permanent failures during buffering are swallowed
|
||||
|
||||
| | |
|
||||
|--|--|
|
||||
| Severity | Medium |
|
||||
| Severity | Medium — re-triaged: root cause subsumed by the ExternalSystemGateway-003 dispatch redesign (see Resolution) |
|
||||
| Category | Correctness & logic bugs |
|
||||
| Status | Open |
|
||||
| Status | Resolved |
|
||||
| Location | `src/ScadaLink.ExternalSystemGateway/ExternalSystemClient.cs:109-117` |
|
||||
|
||||
**Description**
|
||||
@@ -398,7 +469,27 @@ partly subsumed by the dispatch redesign in finding 003.)
|
||||
|
||||
**Resolution**
|
||||
|
||||
_Unresolved._
|
||||
Resolved 2026-05-16 (commit pending). **Re-triage:** the stated root cause — "a
|
||||
permanent failure surfaced by `EnqueueAsync`'s immediate-delivery attempt is silently
|
||||
lost" — **can no longer occur** in the current source, and the dead `sfResult` variable
|
||||
the finding cites has already been removed. The `ExternalSystemGateway-003` fix changed
|
||||
`CachedCallAsync` to call `EnqueueAsync` with `attemptImmediateDelivery: false`. With
|
||||
that flag, `EnqueueAsync` never invokes the registered delivery handler: it skips the
|
||||
immediate-delivery block entirely (so the `StoreAndForwardResult(false, …, …)`
|
||||
permanent-failure return at `StoreAndForwardService.cs:147` is unreachable from this
|
||||
caller) and unconditionally buffers, returning `Accepted: true, WasBuffered: true`
|
||||
(`StoreAndForwardService.cs:180`). The `ExternalCallResult(true, null, null,
|
||||
WasBuffered: true)` that `CachedCallAsync` returns is therefore now factually correct
|
||||
in every reachable case — the message *is* buffered and there is no swallowed permanent
|
||||
failure. Permanent HTTP 4xx failures are still surfaced synchronously, because
|
||||
`CachedCallAsync` makes its own first HTTP attempt and catches
|
||||
`PermanentExternalSystemException` *before* it ever reaches `EnqueueAsync`. No source
|
||||
change was required beyond the `ExternalSystemGateway-003` redesign that already landed.
|
||||
Coverage: `CachedCall_TransientFailure_BuffersWithSystemRetrySettings` asserts both
|
||||
`result.WasBuffered == true` and that the message is genuinely present in the S&F buffer
|
||||
(depth == 1), confirming the `WasBuffered: true` claim is not a lie; the existing
|
||||
`CachedCall` permanent-failure path is exercised by `Call_Permanent400_ReturnsPermanentError`
|
||||
semantics shared via `InvokeHttpAsync`.
|
||||
|
||||
### ExternalSystemGateway-010 — `GetConnectionAsync` leaks the `SqlConnection` when `OpenAsync` fails
|
||||
|
||||
@@ -406,7 +497,7 @@ _Unresolved._
|
||||
|--|--|
|
||||
| Severity | Medium |
|
||||
| Category | Performance & resource management |
|
||||
| Status | Open |
|
||||
| Status | Resolved |
|
||||
| Location | `src/ScadaLink.ExternalSystemGateway/DatabaseGateway.cs:48-50` |
|
||||
|
||||
**Description**
|
||||
@@ -427,7 +518,14 @@ Wrap the open in a try/catch that disposes the connection before rethrowing:
|
||||
|
||||
**Resolution**
|
||||
|
||||
_Unresolved._
|
||||
Resolved 2026-05-16 (commit pending). `GetConnectionAsync` now wraps `OpenAsync` in a
|
||||
`try/catch` that calls `await connection.DisposeAsync()` before rethrowing, so a failed
|
||||
open (unreachable server, bad credentials, cancellation) no longer leaks the
|
||||
`SqlConnection`. Connection creation was extracted into an `internal virtual
|
||||
CreateConnection(string)` factory so the failure path is unit-testable. Regression test
|
||||
`GetConnection_OpenFails_DisposesConnectionBeforeRethrowing` substitutes a `DbConnection`
|
||||
whose `OpenAsync` always throws and asserts the connection is disposed when the
|
||||
exception propagates; it was verified to fail before the `try/catch` was added.
|
||||
|
||||
### ExternalSystemGateway-011 — Every call performs a full repository scan of all systems and methods
|
||||
|
||||
|
||||
@@ -8,7 +8,7 @@
|
||||
| Last reviewed | 2026-05-16 |
|
||||
| Reviewer | claude-agent |
|
||||
| Commit reviewed | `9c60592` |
|
||||
| Open findings | 10 |
|
||||
| Open findings | 5 |
|
||||
|
||||
## Summary
|
||||
|
||||
@@ -143,10 +143,10 @@ and no lost updates.
|
||||
|
||||
| | |
|
||||
|--|--|
|
||||
| Severity | Medium |
|
||||
| Severity | Medium — re-triaged: already resolved as a side-effect of HealthMonitoring-002. |
|
||||
| Category | Concurrency & thread safety |
|
||||
| Status | Open |
|
||||
| Location | `src/ScadaLink.HealthMonitoring/CentralHealthAggregator.cs:55-78` |
|
||||
| Status | Resolved |
|
||||
| Location | `src/ScadaLink.HealthMonitoring/CentralHealthAggregator.cs:45-103` |
|
||||
|
||||
**Description**
|
||||
|
||||
@@ -169,7 +169,17 @@ reference swap with no observable intermediate state.
|
||||
|
||||
**Resolution**
|
||||
|
||||
_Unresolved._
|
||||
Resolved 2026-05-16 (commit `pending`). Re-triaged: verified against the current
|
||||
source — the root cause was already eliminated by the HealthMonitoring-002 fix.
|
||||
`ProcessReport` no longer uses `AddOrUpdate` at all; it is now an explicit
|
||||
compare-and-swap retry loop (`TryGetValue` → guard → `TryAdd`/`TryUpdate`) that
|
||||
produces a brand-new immutable `SiteHealthState` record per transition and never
|
||||
mutates a stored value in place. The sequence-number guard and the field writes are
|
||||
evaluated against the value actually installed by the CAS, so the "only replace if
|
||||
sequence is higher" invariant holds. The concurrency stress test
|
||||
`CentralHealthAggregatorTests.ProcessReport_ConcurrentUpdates_NeverLoseSequenceOrTearState`
|
||||
(added under HealthMonitoring-002) already exercises this path and asserts no lost
|
||||
updates and no torn snapshots. No further code change was required for this finding.
|
||||
|
||||
### HealthMonitoring-004 — Inconsistent heartbeat interval described across XML docs
|
||||
|
||||
@@ -205,7 +215,7 @@ _Unresolved._
|
||||
|--|--|
|
||||
| Severity | Medium |
|
||||
| Category | Correctness & logic bugs |
|
||||
| Status | Open |
|
||||
| Status | Resolved |
|
||||
| Location | `src/ScadaLink.HealthMonitoring/CentralHealthReportLoop.cs:48-81`, `src/ScadaLink.HealthMonitoring/CentralHealthAggregator.cs:149` |
|
||||
|
||||
**Description**
|
||||
@@ -230,7 +240,18 @@ leader starts the report loop promptly on failover.
|
||||
|
||||
**Resolution**
|
||||
|
||||
_Unresolved._
|
||||
Resolved 2026-05-16 (commit `pending`). Applied the distinct-timeout option. A new
|
||||
`HealthMonitoringOptions.CentralOfflineTimeout` (default 3x the report interval =
|
||||
3 minutes) is applied by `CentralHealthAggregator.CheckForOfflineSites` to the
|
||||
`central` keyspace entry only — real sites keep the existing `OfflineTimeout`. This
|
||||
gives the synthetic `central` site (which has no heartbeat source and is fed solely
|
||||
by the 30s leader-only `CentralHealthReportLoop`) enough grace to survive a single
|
||||
skipped or late self-report — the equivalent of the "one missed report" grace the
|
||||
design doc grants real sites — while still going offline on genuine total loss.
|
||||
Regression tests `CentralHealthAggregatorTests.OfflineDetection_CentralSite_HasLongerGraceThanRealSites`
|
||||
(central survives 75s of silence while a real site goes offline) and
|
||||
`OfflineDetection_CentralSite_StillGoesOfflineOnGenuineLoss` (central still detected
|
||||
offline after 10 minutes) verify the behaviour.
|
||||
|
||||
### HealthMonitoring-006 — Sequence seeding contradicts the doc's "starting at 1" wording and is untestable
|
||||
|
||||
@@ -270,7 +291,7 @@ _Unresolved._
|
||||
|--|--|
|
||||
| Severity | Medium |
|
||||
| Category | Correctness & logic bugs |
|
||||
| Status | Open |
|
||||
| Status | Resolved |
|
||||
| Location | `src/ScadaLink.HealthMonitoring/CentralHealthAggregator.cs:86-99` |
|
||||
|
||||
**Description**
|
||||
@@ -293,16 +314,29 @@ after a central restart.
|
||||
|
||||
**Resolution**
|
||||
|
||||
_Unresolved._
|
||||
Resolved 2026-05-16 (commit `pending`). `CentralHealthAggregator.MarkHeartbeat` no
|
||||
longer returns early for an unknown site. When a heartbeat arrives for a site with no
|
||||
aggregator state, it now atomically registers (`TryAdd`, with CAS-loss retry) a
|
||||
minimal `SiteHealthState` that is `IsOnline = true`, `LatestReport = null`,
|
||||
`LastSequenceNumber = 0` and `LastHeartbeatAt = receivedAt` — an "online, awaiting
|
||||
first report" state. This relies on the HealthMonitoring-002 change that made
|
||||
`LatestReport` properly nullable, so UI consumers already handle the null case.
|
||||
Reachable sites therefore show online immediately after a central restart/failover
|
||||
instead of being absent ("unknown") for up to a full report interval. The
|
||||
`ICentralHealthAggregator.MarkHeartbeat` XML doc was corrected to describe the new
|
||||
behaviour. Regression test
|
||||
`CentralHealthAggregatorTests.MarkHeartbeat_RegistersUnknownSite_AsOnlineAwaitingReport`
|
||||
verifies the registration; `MarkHeartbeat_KeepsSiteOnline_BetweenReports` and
|
||||
`MarkHeartbeat_BringsOfflineSiteBackOnline` cover the already-registered paths.
|
||||
|
||||
### HealthMonitoring-008 — `GetAllSiteStates` / `GetSiteState` leak live mutable state objects to callers
|
||||
|
||||
| | |
|
||||
|--|--|
|
||||
| Severity | Medium |
|
||||
| Severity | Medium — re-triaged: already resolved as a side-effect of HealthMonitoring-002. |
|
||||
| Category | Concurrency & thread safety |
|
||||
| Status | Open |
|
||||
| Location | `src/ScadaLink.HealthMonitoring/CentralHealthAggregator.cs:104-116` |
|
||||
| Status | Resolved |
|
||||
| Location | `src/ScadaLink.HealthMonitoring/CentralHealthAggregator.cs:146-158` |
|
||||
|
||||
**Description**
|
||||
|
||||
@@ -323,7 +357,15 @@ state into an immutable DTO before returning.
|
||||
|
||||
**Resolution**
|
||||
|
||||
_Unresolved._
|
||||
Resolved 2026-05-16 (commit `pending`). Re-triaged: verified against the current
|
||||
source — the root cause was already eliminated by the HealthMonitoring-002 fix.
|
||||
`SiteHealthState` is now a `sealed record` with `init`-only properties (fully
|
||||
immutable). Every aggregator transition installs a brand-new record instance via an
|
||||
atomic compare-and-swap, so the references `GetAllSiteStates` and `GetSiteState` hand
|
||||
out are immutable snapshots — a UI consumer reading one on its own thread can never
|
||||
observe a torn or half-applied state, and cannot mutate aggregator state through the
|
||||
returned reference. The recommended fix (make `SiteHealthState` a record) is exactly
|
||||
what the HealthMonitoring-002 change did, so no further code change was required.
|
||||
|
||||
### HealthMonitoring-009 — Missing test coverage for central report loop, heartbeat path, replication, and collector setters
|
||||
|
||||
@@ -331,7 +373,7 @@ _Unresolved._
|
||||
|--|--|
|
||||
| Severity | Medium |
|
||||
| Category | Testing coverage |
|
||||
| Status | Open |
|
||||
| Status | Resolved |
|
||||
| Location | `tests/ScadaLink.HealthMonitoring.Tests/` |
|
||||
|
||||
**Description**
|
||||
@@ -358,7 +400,29 @@ setters' presence in `CollectReport` output.
|
||||
|
||||
**Resolution**
|
||||
|
||||
_Unresolved._
|
||||
Resolved 2026-05-16 (commit `pending`). Added the missing coverage:
|
||||
|
||||
- **`CentralHealthReportLoopTests`** (new file) — `GeneratesCentralReports_WhenSelfIsPrimary`,
|
||||
`GeneratesNoReports_WhenNotPrimary` (leader-only `SelfIsPrimary` gating with a fake
|
||||
`IClusterNodeProvider`), `AssignsMonotonicSequenceNumbers`, and
|
||||
`SetsActiveNodeFlag_EvenWhenNotPrimary`.
|
||||
- **`CentralHealthAggregatorTests`** — `MarkHeartbeat_RegistersUnknownSite_AsOnlineAwaitingReport`,
|
||||
`MarkHeartbeat_KeepsSiteOnline_BetweenReports` (heartbeat keeps a site online past
|
||||
the offline timeout — the path the design depends on), and
|
||||
`MarkHeartbeat_BringsOfflineSiteBackOnline`.
|
||||
- **`SiteHealthCollectorTests`** — reflected-in-report tests for `SetClusterNodes`,
|
||||
`SetInstanceCounts`, `SetParkedMessageCount`, `SetNodeHostname`,
|
||||
`SetActiveNode`/`NodeRole`, `UpdateTagQuality`, `UpdateConnectionEndpoint`, and
|
||||
`SetStoreAndForwardDepths`.
|
||||
|
||||
The `SiteHealthReportReplica` idempotency item is **out of scope** for this module:
|
||||
`SiteHealthReportReplica` is declared in `ScadaLink.Commons` and published/consumed by
|
||||
`CentralCommunicationActor` in the `ScadaLink.Communication` module — the
|
||||
HealthMonitoring module itself has no replication code. Replica double-delivery
|
||||
idempotency is already covered by `ProcessReport`'s sequence-number guard
|
||||
(`ProcessReport_RejectsEqualSequence`, `ProcessReport_RejectsStaleReport_WhenSequenceNotGreater`);
|
||||
testing the actor-side double-publish belongs in the Communication module's review.
|
||||
The HealthMonitoring test suite now stands at 47 passing tests (was 30).
|
||||
|
||||
### HealthMonitoring-010 — `HealthReportSender` silently swallows inner failures with bare `catch {}`
|
||||
|
||||
|
||||
+8
-36
@@ -41,9 +41,9 @@ module file and counted in **Total**.
|
||||
|----------|---------------|
|
||||
| Critical | 0 |
|
||||
| High | 0 |
|
||||
| Medium | 73 |
|
||||
| Medium | 45 |
|
||||
| Low | 90 |
|
||||
| **Total** | **163** |
|
||||
| **Total** | **135** |
|
||||
|
||||
## Module Status
|
||||
|
||||
@@ -54,11 +54,11 @@ module file and counted in **Total**.
|
||||
| [ClusterInfrastructure](ClusterInfrastructure/findings.md) | 2026-05-16 | `9c60592` | 0/0/0/3 | 3 | 8 |
|
||||
| [Commons](Commons/findings.md) | 2026-05-16 | `9c60592` | 0/0/0/8 | 8 | 12 |
|
||||
| [Communication](Communication/findings.md) | 2026-05-16 | `9c60592` | 0/0/0/3 | 3 | 11 |
|
||||
| [ConfigurationDatabase](ConfigurationDatabase/findings.md) | 2026-05-16 | `9c60592` | 0/0/4/6 | 10 | 11 |
|
||||
| [DataConnectionLayer](DataConnectionLayer/findings.md) | 2026-05-16 | `9c60592` | 0/0/6/2 | 8 | 13 |
|
||||
| [DeploymentManager](DeploymentManager/findings.md) | 2026-05-16 | `9c60592` | 0/0/6/5 | 11 | 14 |
|
||||
| [ExternalSystemGateway](ExternalSystemGateway/findings.md) | 2026-05-16 | `9c60592` | 0/0/7/4 | 11 | 14 |
|
||||
| [HealthMonitoring](HealthMonitoring/findings.md) | 2026-05-16 | `9c60592` | 0/0/5/5 | 10 | 12 |
|
||||
| [ConfigurationDatabase](ConfigurationDatabase/findings.md) | 2026-05-16 | `9c60592` | 0/0/0/6 | 6 | 11 |
|
||||
| [DataConnectionLayer](DataConnectionLayer/findings.md) | 2026-05-16 | `9c60592` | 0/0/0/2 | 2 | 13 |
|
||||
| [DeploymentManager](DeploymentManager/findings.md) | 2026-05-16 | `9c60592` | 0/0/0/5 | 5 | 14 |
|
||||
| [ExternalSystemGateway](ExternalSystemGateway/findings.md) | 2026-05-16 | `9c60592` | 0/0/0/4 | 4 | 14 |
|
||||
| [HealthMonitoring](HealthMonitoring/findings.md) | 2026-05-16 | `9c60592` | 0/0/0/5 | 5 | 12 |
|
||||
| [Host](Host/findings.md) | 2026-05-16 | `9c60592` | 0/0/3/7 | 10 | 11 |
|
||||
| [InboundAPI](InboundAPI/findings.md) | 2026-05-16 | `9c60592` | 0/0/5/5 | 10 | 13 |
|
||||
| [ManagementService](ManagementService/findings.md) | 2026-05-16 | `9c60592` | 0/0/5/5 | 10 | 13 |
|
||||
@@ -84,40 +84,12 @@ _None open._
|
||||
|
||||
_None open._
|
||||
|
||||
### Medium (73)
|
||||
### Medium (45)
|
||||
|
||||
| ID | Module | Title |
|
||||
|----|--------|-------|
|
||||
| CentralUI-005 | [CentralUI](CentralUI/findings.md) | Session expiry implementation diverges from the documented policy |
|
||||
| CentralUI-006 | [CentralUI](CentralUI/findings.md) | Deployment status page polls every 10s despite the documented SignalR-push design |
|
||||
| ConfigurationDatabase-002 | [ConfigurationDatabase](ConfigurationDatabase/findings.md) | Hardcoded `sa` connection string with embedded password literal |
|
||||
| ConfigurationDatabase-003 | [ConfigurationDatabase](ConfigurationDatabase/findings.md) | No-arg `AddConfigurationDatabase()` silently registers nothing |
|
||||
| ConfigurationDatabase-004 | [ConfigurationDatabase](ConfigurationDatabase/findings.md) | Secret-bearing columns stored in plaintext with no protection |
|
||||
| ConfigurationDatabase-007 | [ConfigurationDatabase](ConfigurationDatabase/findings.md) | `AuditService` does not handle JSON-serialization failure of arbitrary `afterState` |
|
||||
| DataConnectionLayer-006 | [DataConnectionLayer](DataConnectionLayer/findings.md) | Health quality counters not reset/recomputed after failover or re-subscribe |
|
||||
| DataConnectionLayer-007 | [DataConnectionLayer](DataConnectionLayer/findings.md) | `ReadBatchAsync` aborts the whole batch on the first failing tag |
|
||||
| DataConnectionLayer-009 | [DataConnectionLayer](DataConnectionLayer/findings.md) | Implemented failover heuristic diverges from the documented state machine |
|
||||
| DataConnectionLayer-010 | [DataConnectionLayer](DataConnectionLayer/findings.md) | Tag-resolution retry can issue duplicate concurrent subscribe attempts |
|
||||
| DataConnectionLayer-011 | [DataConnectionLayer](DataConnectionLayer/findings.md) | Stale subscription callbacks from disposed adapters can still reach the actor |
|
||||
| DataConnectionLayer-012 | [DataConnectionLayer](DataConnectionLayer/findings.md) | `AutoAcceptUntrustedCerts` defaults to `true`, accepting any server certificate |
|
||||
| DeploymentManager-003 | [DeploymentManager](DeploymentManager/findings.md) | Successful-deployment cleanup is not atomic with the status write |
|
||||
| DeploymentManager-004 | [DeploymentManager](DeploymentManager/findings.md) | Site-success but central-delete-failure leaves orphaned site config |
|
||||
| DeploymentManager-005 | [DeploymentManager](DeploymentManager/findings.md) | `OperationLockManager` leaks a `SemaphoreSlim` per instance name |
|
||||
| DeploymentManager-007 | [DeploymentManager](DeploymentManager/findings.md) | "Diff View" reduced to a hash comparison with no diff detail |
|
||||
| DeploymentManager-008 | [DeploymentManager](DeploymentManager/findings.md) | `DeploymentManagerOptions` is never bound to configuration |
|
||||
| DeploymentManager-011 | [DeploymentManager](DeploymentManager/findings.md) | Tests never exercise a successful deployment or lifecycle success path |
|
||||
| ExternalSystemGateway-004 | [ExternalSystemGateway](ExternalSystemGateway/findings.md) | System retry settings are not honoured for cached calls/writes |
|
||||
| ExternalSystemGateway-005 | [ExternalSystemGateway](ExternalSystemGateway/findings.md) | `HttpRequestMessage` and `HttpResponseMessage` are not disposed |
|
||||
| ExternalSystemGateway-006 | [ExternalSystemGateway](ExternalSystemGateway/findings.md) | `BuildUrl` ignores path templates and appends a trailing slash for empty paths |
|
||||
| ExternalSystemGateway-007 | [ExternalSystemGateway](ExternalSystemGateway/findings.md) | External error response bodies are echoed verbatim into script-visible error messages |
|
||||
| ExternalSystemGateway-008 | [ExternalSystemGateway](ExternalSystemGateway/findings.md) | Cancellation is conflated with transient timeout failure |
|
||||
| ExternalSystemGateway-009 | [ExternalSystemGateway](ExternalSystemGateway/findings.md) | `StoreAndForwardResult` from `EnqueueAsync` is discarded; permanent failures during buffering are swallowed |
|
||||
| ExternalSystemGateway-010 | [ExternalSystemGateway](ExternalSystemGateway/findings.md) | `GetConnectionAsync` leaks the `SqlConnection` when `OpenAsync` fails |
|
||||
| HealthMonitoring-003 | [HealthMonitoring](HealthMonitoring/findings.md) | Shared state mutated inside `ConcurrentDictionary.AddOrUpdate` update delegate |
|
||||
| HealthMonitoring-005 | [HealthMonitoring](HealthMonitoring/findings.md) | Central self-report site can flap offline; no heartbeat grace like real sites |
|
||||
| HealthMonitoring-007 | [HealthMonitoring](HealthMonitoring/findings.md) | Heartbeats for not-yet-registered sites are silently dropped |
|
||||
| HealthMonitoring-008 | [HealthMonitoring](HealthMonitoring/findings.md) | `GetAllSiteStates` / `GetSiteState` leak live mutable state objects to callers |
|
||||
| HealthMonitoring-009 | [HealthMonitoring](HealthMonitoring/findings.md) | Missing test coverage for central report loop, heartbeat path, replication, and collector setters |
|
||||
| Host-002 | [Host](Host/findings.md) | Akka.Persistence required by REQ-HOST-6 is not configured and not used |
|
||||
| Host-003 | [Host](Host/findings.md) | Secrets committed in plaintext in `appsettings.Central.json` |
|
||||
| Host-004 | [Host](Host/findings.md) | Site seed-node list points at the gRPC port, not a remoting port |
|
||||
|
||||
@@ -22,8 +22,10 @@ public class ExternalSystemDefinitionConfiguration : IEntityTypeConfiguration<Ex
|
||||
.IsRequired()
|
||||
.HasMaxLength(50);
|
||||
|
||||
// Stored encrypted at rest (EncryptedStringConverter). Ciphertext is larger than
|
||||
// the plaintext, so the column is sized generously to avoid truncation.
|
||||
builder.Property(e => e.AuthConfiguration)
|
||||
.HasMaxLength(4000);
|
||||
.HasMaxLength(8000);
|
||||
|
||||
builder.HasMany<ExternalSystemMethod>()
|
||||
.WithOne()
|
||||
@@ -72,9 +74,11 @@ public class DatabaseConnectionDefinitionConfiguration : IEntityTypeConfiguratio
|
||||
.IsRequired()
|
||||
.HasMaxLength(200);
|
||||
|
||||
// Stored encrypted at rest (EncryptedStringConverter). Ciphertext is larger than
|
||||
// the plaintext, so the column is sized generously to avoid truncation.
|
||||
builder.Property(d => d.ConnectionString)
|
||||
.IsRequired()
|
||||
.HasMaxLength(4000);
|
||||
.HasMaxLength(8000);
|
||||
|
||||
builder.HasIndex(d => d.Name).IsUnique();
|
||||
}
|
||||
|
||||
@@ -53,8 +53,10 @@ public class SmtpConfigurationConfiguration : IEntityTypeConfiguration<SmtpConfi
|
||||
.IsRequired()
|
||||
.HasMaxLength(50);
|
||||
|
||||
// Stored encrypted at rest (EncryptedStringConverter). Ciphertext is larger than
|
||||
// the plaintext, so the column is sized generously to avoid truncation.
|
||||
builder.Property(s => s.Credentials)
|
||||
.HasMaxLength(4000);
|
||||
.HasMaxLength(8000);
|
||||
|
||||
builder.Property(s => s.TlsMode)
|
||||
.HasMaxLength(50);
|
||||
|
||||
@@ -6,20 +6,50 @@ namespace ScadaLink.ConfigurationDatabase;
|
||||
|
||||
/// <summary>
|
||||
/// Factory for creating DbContext instances at design time (used by dotnet ef tooling).
|
||||
/// Reads connection string from Host's appsettings.Central.json.
|
||||
/// Resolves the connection string from the Host's appsettings files, or — for environments
|
||||
/// where those files are not present — from the
|
||||
/// <c>SCADALINK_DESIGNTIME_CONNECTIONSTRING</c> environment variable.
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// There is deliberately no hardcoded fallback connection string. A credential literal in
|
||||
/// source is committed to version control, encourages copy-paste of <c>sa</c> /
|
||||
/// <c>TrustServerCertificate=True</c> into real environments, and can silently point
|
||||
/// <c>dotnet ef</c> tooling at an unintended database. If no connection string can be
|
||||
/// resolved, this factory fails loudly with an actionable message.
|
||||
/// </remarks>
|
||||
public class DesignTimeDbContextFactory : IDesignTimeDbContextFactory<ScadaLinkDbContext>
|
||||
{
|
||||
private const string EnvironmentVariableName = "SCADALINK_DESIGNTIME_CONNECTIONSTRING";
|
||||
private const string ConfigurationKey = "ScadaLink:Database:ConfigurationDb";
|
||||
|
||||
public ScadaLinkDbContext CreateDbContext(string[] args)
|
||||
{
|
||||
var configuration = new ConfigurationBuilder()
|
||||
.SetBasePath(Path.Combine(Directory.GetCurrentDirectory(), "..", "ScadaLink.Host"))
|
||||
.AddJsonFile("appsettings.json", optional: true)
|
||||
.AddJsonFile("appsettings.Central.json", optional: true)
|
||||
.Build();
|
||||
var configurationBuilder = new ConfigurationBuilder();
|
||||
|
||||
var connectionString = configuration["ScadaLink:Database:ConfigurationDb"]
|
||||
?? "Server=localhost,1433;Database=ScadaLink_Config;User Id=sa;Password=YourPassword;TrustServerCertificate=True";
|
||||
// The Host's appsettings files are an optional source — only wire them up when the
|
||||
// Host directory actually exists, otherwise SetBasePath throws DirectoryNotFoundException
|
||||
// (e.g. when this factory is exercised from a test runner with no sibling Host folder).
|
||||
var hostDirectory = Path.Combine(Directory.GetCurrentDirectory(), "..", "ScadaLink.Host");
|
||||
if (Directory.Exists(hostDirectory))
|
||||
{
|
||||
configurationBuilder
|
||||
.SetBasePath(hostDirectory)
|
||||
.AddJsonFile("appsettings.json", optional: true)
|
||||
.AddJsonFile("appsettings.Central.json", optional: true);
|
||||
}
|
||||
|
||||
var configuration = configurationBuilder.Build();
|
||||
|
||||
var connectionString = configuration[ConfigurationKey]
|
||||
?? Environment.GetEnvironmentVariable(EnvironmentVariableName);
|
||||
|
||||
if (string.IsNullOrWhiteSpace(connectionString))
|
||||
{
|
||||
throw new InvalidOperationException(
|
||||
"No design-time database connection string was found. Set the configuration " +
|
||||
$"key '{ConfigurationKey}' in the Host's appsettings file, or set the " +
|
||||
$"'{EnvironmentVariableName}' environment variable, before running dotnet ef tooling.");
|
||||
}
|
||||
|
||||
var optionsBuilder = new DbContextOptionsBuilder<ScadaLinkDbContext>();
|
||||
optionsBuilder.UseSqlServer(connectionString);
|
||||
|
||||
@@ -0,0 +1,49 @@
|
||||
using Microsoft.AspNetCore.DataProtection;
|
||||
using Microsoft.EntityFrameworkCore.Storage.ValueConversion;
|
||||
|
||||
namespace ScadaLink.ConfigurationDatabase;
|
||||
|
||||
/// <summary>
|
||||
/// EF Core value converter that encrypts a string column at rest using ASP.NET
|
||||
/// Data Protection. Plaintext is protected when written to the database and
|
||||
/// transparently unprotected when read back, so secret-bearing columns
|
||||
/// (SMTP credentials, external-system auth config, database connection strings)
|
||||
/// are never persisted verbatim.
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// The protector is purpose-scoped so ciphertext from one column cannot be
|
||||
/// unprotected as another. Data Protection keys are persisted to the
|
||||
/// configuration database itself (see <see cref="ScadaLinkDbContext"/> implementing
|
||||
/// <c>IDataProtectionKeyContext</c>), so all central nodes share the same key ring
|
||||
/// and can decrypt each other's writes.
|
||||
/// </remarks>
|
||||
public sealed class EncryptedStringConverter : ValueConverter<string?, string?>
|
||||
{
|
||||
/// <summary>The Data Protection purpose string shared by all encrypted configuration columns.</summary>
|
||||
public const string ProtectorPurpose = "ScadaLink.ConfigurationDatabase.EncryptedColumn";
|
||||
|
||||
public EncryptedStringConverter(IDataProtector protector)
|
||||
: base(
|
||||
plaintext => plaintext == null ? null : protector.Protect(plaintext),
|
||||
ciphertext => ciphertext == null ? null : Unprotect(protector, ciphertext))
|
||||
{
|
||||
}
|
||||
|
||||
private static string Unprotect(IDataProtector protector, string ciphertext)
|
||||
{
|
||||
// A row that predates encryption (or test fixtures inserting raw text) is not valid
|
||||
// protected payload. Unprotect throws CryptographicException in that case; surface a
|
||||
// clearer message rather than a bare crypto failure.
|
||||
try
|
||||
{
|
||||
return protector.Unprotect(ciphertext);
|
||||
}
|
||||
catch (System.Security.Cryptography.CryptographicException ex)
|
||||
{
|
||||
throw new InvalidOperationException(
|
||||
"Failed to decrypt an encrypted configuration column. The Data Protection key " +
|
||||
"ring may be unavailable, or the stored value was not written by this system.",
|
||||
ex);
|
||||
}
|
||||
}
|
||||
}
|
||||
Generated
+1348
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,82 @@
|
||||
using Microsoft.EntityFrameworkCore.Migrations;
|
||||
|
||||
#nullable disable
|
||||
|
||||
namespace ScadaLink.ConfigurationDatabase.Migrations
|
||||
{
|
||||
/// <inheritdoc />
|
||||
public partial class EncryptSecretColumns : Migration
|
||||
{
|
||||
/// <inheritdoc />
|
||||
protected override void Up(MigrationBuilder migrationBuilder)
|
||||
{
|
||||
migrationBuilder.AlterColumn<string>(
|
||||
name: "Credentials",
|
||||
table: "SmtpConfigurations",
|
||||
type: "nvarchar(max)",
|
||||
maxLength: 8000,
|
||||
nullable: true,
|
||||
oldClrType: typeof(string),
|
||||
oldType: "nvarchar(4000)",
|
||||
oldMaxLength: 4000,
|
||||
oldNullable: true);
|
||||
|
||||
migrationBuilder.AlterColumn<string>(
|
||||
name: "AuthConfiguration",
|
||||
table: "ExternalSystemDefinitions",
|
||||
type: "nvarchar(max)",
|
||||
maxLength: 8000,
|
||||
nullable: true,
|
||||
oldClrType: typeof(string),
|
||||
oldType: "nvarchar(4000)",
|
||||
oldMaxLength: 4000,
|
||||
oldNullable: true);
|
||||
|
||||
migrationBuilder.AlterColumn<string>(
|
||||
name: "ConnectionString",
|
||||
table: "DatabaseConnectionDefinitions",
|
||||
type: "nvarchar(max)",
|
||||
maxLength: 8000,
|
||||
nullable: false,
|
||||
oldClrType: typeof(string),
|
||||
oldType: "nvarchar(4000)",
|
||||
oldMaxLength: 4000);
|
||||
}
|
||||
|
||||
/// <inheritdoc />
|
||||
protected override void Down(MigrationBuilder migrationBuilder)
|
||||
{
|
||||
migrationBuilder.AlterColumn<string>(
|
||||
name: "Credentials",
|
||||
table: "SmtpConfigurations",
|
||||
type: "nvarchar(4000)",
|
||||
maxLength: 4000,
|
||||
nullable: true,
|
||||
oldClrType: typeof(string),
|
||||
oldType: "nvarchar(max)",
|
||||
oldMaxLength: 8000,
|
||||
oldNullable: true);
|
||||
|
||||
migrationBuilder.AlterColumn<string>(
|
||||
name: "AuthConfiguration",
|
||||
table: "ExternalSystemDefinitions",
|
||||
type: "nvarchar(4000)",
|
||||
maxLength: 4000,
|
||||
nullable: true,
|
||||
oldClrType: typeof(string),
|
||||
oldType: "nvarchar(max)",
|
||||
oldMaxLength: 8000,
|
||||
oldNullable: true);
|
||||
|
||||
migrationBuilder.AlterColumn<string>(
|
||||
name: "ConnectionString",
|
||||
table: "DatabaseConnectionDefinitions",
|
||||
type: "nvarchar(4000)",
|
||||
maxLength: 4000,
|
||||
nullable: false,
|
||||
oldClrType: typeof(string),
|
||||
oldType: "nvarchar(max)",
|
||||
oldMaxLength: 8000);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -232,8 +232,8 @@ namespace ScadaLink.ConfigurationDatabase.Migrations
|
||||
|
||||
b.Property<string>("ConnectionString")
|
||||
.IsRequired()
|
||||
.HasMaxLength(4000)
|
||||
.HasColumnType("nvarchar(4000)");
|
||||
.HasMaxLength(8000)
|
||||
.HasColumnType("nvarchar(max)");
|
||||
|
||||
b.Property<int>("MaxRetries")
|
||||
.HasColumnType("int");
|
||||
@@ -263,8 +263,8 @@ namespace ScadaLink.ConfigurationDatabase.Migrations
|
||||
SqlServerPropertyBuilderExtensions.UseIdentityColumn(b.Property<int>("Id"));
|
||||
|
||||
b.Property<string>("AuthConfiguration")
|
||||
.HasMaxLength(4000)
|
||||
.HasColumnType("nvarchar(4000)");
|
||||
.HasMaxLength(8000)
|
||||
.HasColumnType("nvarchar(max)");
|
||||
|
||||
b.Property<string>("AuthType")
|
||||
.IsRequired()
|
||||
@@ -632,8 +632,8 @@ namespace ScadaLink.ConfigurationDatabase.Migrations
|
||||
.HasColumnType("int");
|
||||
|
||||
b.Property<string>("Credentials")
|
||||
.HasMaxLength(4000)
|
||||
.HasColumnType("nvarchar(4000)");
|
||||
.HasMaxLength(8000)
|
||||
.HasColumnType("nvarchar(max)");
|
||||
|
||||
b.Property<string>("FromAddress")
|
||||
.IsRequired()
|
||||
|
||||
@@ -17,6 +17,7 @@
|
||||
<PackageReference Include="Microsoft.Extensions.Configuration.Json" />
|
||||
<PackageReference Include="Microsoft.Extensions.DependencyInjection.Abstractions" />
|
||||
<PackageReference Include="Microsoft.Extensions.Options" />
|
||||
<PackageReference Include="Microsoft.AspNetCore.DataProtection" />
|
||||
<PackageReference Include="Microsoft.AspNetCore.DataProtection.EntityFrameworkCore" />
|
||||
</ItemGroup>
|
||||
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
using Microsoft.AspNetCore.DataProtection;
|
||||
using Microsoft.AspNetCore.DataProtection.EntityFrameworkCore;
|
||||
using Microsoft.EntityFrameworkCore;
|
||||
using ScadaLink.Commons.Entities.Audit;
|
||||
@@ -15,10 +16,24 @@ namespace ScadaLink.ConfigurationDatabase;
|
||||
|
||||
public class ScadaLinkDbContext : DbContext, IDataProtectionKeyContext
|
||||
{
|
||||
private readonly IDataProtectionProvider? _dataProtectionProvider;
|
||||
|
||||
public ScadaLinkDbContext(DbContextOptions<ScadaLinkDbContext> options) : base(options)
|
||||
{
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Creates a context with an explicit Data Protection provider used to encrypt
|
||||
/// secret-bearing configuration columns at rest. The runtime resolves this overload
|
||||
/// via DI; design-time tooling uses the single-argument overload.
|
||||
/// </summary>
|
||||
public ScadaLinkDbContext(DbContextOptions<ScadaLinkDbContext> options, IDataProtectionProvider dataProtectionProvider)
|
||||
: base(options)
|
||||
{
|
||||
_dataProtectionProvider = dataProtectionProvider
|
||||
?? throw new ArgumentNullException(nameof(dataProtectionProvider));
|
||||
}
|
||||
|
||||
// Templates
|
||||
public DbSet<Template> Templates => Set<Template>();
|
||||
public DbSet<TemplateAttribute> TemplateAttributes => Set<TemplateAttribute>();
|
||||
@@ -73,5 +88,38 @@ public class ScadaLinkDbContext : DbContext, IDataProtectionKeyContext
|
||||
protected override void OnModelCreating(ModelBuilder modelBuilder)
|
||||
{
|
||||
modelBuilder.ApplyConfigurationsFromAssembly(typeof(ScadaLinkDbContext).Assembly);
|
||||
|
||||
ApplySecretColumnEncryption(modelBuilder);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Applies encryption-at-rest to columns that hold authentication secrets
|
||||
/// (SMTP credentials, external-system auth config, database connection strings)
|
||||
/// so they are never persisted as plaintext.
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// When no Data Protection provider is supplied (design-time <c>dotnet ef</c> tooling,
|
||||
/// which only emits schema and never reads or writes secret data), an ephemeral provider
|
||||
/// is used. The encrypted-column type is <c>nvarchar</c> either way, so the generated
|
||||
/// schema is identical regardless of which provider is in effect. The runtime path always
|
||||
/// receives the DI-registered provider whose keys are persisted to this database.
|
||||
/// </remarks>
|
||||
private void ApplySecretColumnEncryption(ModelBuilder modelBuilder)
|
||||
{
|
||||
IDataProtectionProvider provider = _dataProtectionProvider ?? new EphemeralDataProtectionProvider();
|
||||
var converter = new EncryptedStringConverter(
|
||||
provider.CreateProtector(EncryptedStringConverter.ProtectorPurpose));
|
||||
|
||||
modelBuilder.Entity<SmtpConfiguration>()
|
||||
.Property(s => s.Credentials)
|
||||
.HasConversion(converter);
|
||||
|
||||
modelBuilder.Entity<ExternalSystemDefinition>()
|
||||
.Property(e => e.AuthConfiguration)
|
||||
.HasConversion(converter);
|
||||
|
||||
modelBuilder.Entity<DatabaseConnectionDefinition>()
|
||||
.Property(d => d.ConnectionString)
|
||||
.HasConversion((Microsoft.EntityFrameworkCore.Storage.ValueConversion.ValueConverter)converter);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -15,10 +15,28 @@ public static class ServiceCollectionExtensions
|
||||
/// </summary>
|
||||
public static IServiceCollection AddConfigurationDatabase(this IServiceCollection services, string connectionString)
|
||||
{
|
||||
services.AddDbContext<ScadaLinkDbContext>(options =>
|
||||
// The DbContext is constructed via the (options, IDataProtectionProvider) overload so
|
||||
// secret-bearing configuration columns are encrypted at rest. AddDataProtection below
|
||||
// registers IDataProtectionProvider as a singleton; resolving it here does not recurse
|
||||
// because key-ring loading is lazy (first Protect/Unprotect), not triggered by
|
||||
// CreateProtector during model building.
|
||||
services.AddDbContext<ScadaLinkDbContext>((serviceProvider, options) =>
|
||||
{
|
||||
options.UseSqlServer(connectionString)
|
||||
.ConfigureWarnings(w => w.Ignore(
|
||||
Microsoft.EntityFrameworkCore.Diagnostics.RelationalEventId.PendingModelChangesWarning)));
|
||||
Microsoft.EntityFrameworkCore.Diagnostics.RelationalEventId.PendingModelChangesWarning));
|
||||
});
|
||||
|
||||
// AddDbContext registers ScadaLinkDbContext via EF's activator, which only injects
|
||||
// DbContextOptions. Override that registration (last registration wins for resolution)
|
||||
// with a factory that also supplies the IDataProtectionProvider, so the encrypting
|
||||
// value converter for secret columns is always wired up at runtime.
|
||||
services.AddScoped(serviceProvider =>
|
||||
{
|
||||
var options = serviceProvider.GetRequiredService<DbContextOptions<ScadaLinkDbContext>>();
|
||||
var protectionProvider = serviceProvider.GetRequiredService<IDataProtectionProvider>();
|
||||
return new ScadaLinkDbContext(options, protectionProvider);
|
||||
});
|
||||
|
||||
services.AddScoped<ISecurityRepository, SecurityRepository>();
|
||||
services.AddScoped<ICentralUiRepository, CentralUiRepository>();
|
||||
@@ -38,13 +56,27 @@ public static class ServiceCollectionExtensions
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Registers the ScadaLinkDbContext with no connection string (for backward compatibility / Phase 0 stubs).
|
||||
/// This overload is a no-op placeholder; callers should migrate to the overload that accepts a connection string.
|
||||
/// Obsolete parameterless overload. This previously registered nothing, which meant a
|
||||
/// central node wired up with it failed late and opaquely — the first repository
|
||||
/// resolution threw a DI exception far from the actual misconfiguration. Use
|
||||
/// <see cref="AddConfigurationDatabase(IServiceCollection, string)"/> and pass the
|
||||
/// configured connection string.
|
||||
/// </summary>
|
||||
/// <exception cref="InvalidOperationException">
|
||||
/// Always thrown. The connection string is required; there is no valid no-op registration.
|
||||
/// </exception>
|
||||
[Obsolete(
|
||||
"AddConfigurationDatabase() with no connection string registers nothing and is not a " +
|
||||
"valid configuration. Call AddConfigurationDatabase(connectionString) instead.",
|
||||
error: true)]
|
||||
public static IServiceCollection AddConfigurationDatabase(this IServiceCollection services)
|
||||
{
|
||||
// Retained for backward compatibility during migration.
|
||||
// Site nodes do not use the configuration database, so this is intentionally a no-op.
|
||||
return services;
|
||||
// Defence-in-depth: even if a caller suppresses the compile-time obsolete error,
|
||||
// fail fast at wire-up time rather than silently registering nothing and surfacing
|
||||
// an opaque DI resolution failure much later.
|
||||
throw new InvalidOperationException(
|
||||
"AddConfigurationDatabase() requires a connection string. Call " +
|
||||
"AddConfigurationDatabase(connectionString) with the configured " +
|
||||
"'ScadaLink:Database:ConfigurationDb' value.");
|
||||
}
|
||||
}
|
||||
|
||||
@@ -8,6 +8,19 @@ public class AuditService : IAuditService
|
||||
{
|
||||
private readonly ScadaLinkDbContext _context;
|
||||
|
||||
/// <summary>
|
||||
/// Serializer options for audit <c>afterState</c> payloads. Audit writes commit in the
|
||||
/// same transaction as the change they record, so a serialization exception here would
|
||||
/// roll back the entire business operation. Reference cycles (common when an EF entity
|
||||
/// with loaded navigations is passed in) are ignored rather than thrown, and depth is
|
||||
/// bounded so a pathological graph cannot produce an unbounded payload.
|
||||
/// </summary>
|
||||
private static readonly JsonSerializerOptions AuditSerializerOptions = new()
|
||||
{
|
||||
ReferenceHandler = System.Text.Json.Serialization.ReferenceHandler.IgnoreCycles,
|
||||
MaxDepth = 32
|
||||
};
|
||||
|
||||
public AuditService(ScadaLinkDbContext context)
|
||||
{
|
||||
_context = context ?? throw new ArgumentNullException(nameof(context));
|
||||
@@ -26,7 +39,7 @@ public class AuditService : IAuditService
|
||||
{
|
||||
Timestamp = DateTimeOffset.UtcNow,
|
||||
AfterStateJson = afterState != null
|
||||
? JsonSerializer.Serialize(afterState)
|
||||
? SerializeAfterState(afterState)
|
||||
: null
|
||||
};
|
||||
|
||||
@@ -34,4 +47,27 @@ public class AuditService : IAuditService
|
||||
// to ensure atomicity with the entity change.
|
||||
await _context.AuditLogEntries.AddAsync(entry, cancellationToken);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Serializes the caller-supplied after-state, tolerating arbitrary object shapes.
|
||||
/// Reference cycles are ignored via <see cref="AuditSerializerOptions"/>. If serialization
|
||||
/// still fails (e.g. <c>MaxDepth</c> exceeded), the audit entry is preserved with a
|
||||
/// diagnostic placeholder rather than throwing — a serialization failure must never
|
||||
/// roll back the business operation the audit entry is recording.
|
||||
/// </summary>
|
||||
private static string SerializeAfterState(object afterState)
|
||||
{
|
||||
try
|
||||
{
|
||||
return JsonSerializer.Serialize(afterState, AuditSerializerOptions);
|
||||
}
|
||||
catch (Exception ex) when (ex is JsonException or NotSupportedException)
|
||||
{
|
||||
return JsonSerializer.Serialize(new
|
||||
{
|
||||
AuditSerializationError = ex.Message,
|
||||
StateType = afterState.GetType().FullName
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -55,6 +55,13 @@ public class DataConnectionActor : UntypedActor, IWithStash, IWithTimers
|
||||
/// </summary>
|
||||
private readonly HashSet<string> _unresolvedTags = new();
|
||||
|
||||
/// <summary>
|
||||
/// DataConnectionLayer-010: tags whose retry SubscribeAsync is currently in flight.
|
||||
/// They are excluded from the next retry tick so a slow attempt is not duplicated
|
||||
/// (which would leak monitored items / subscription ids).
|
||||
/// </summary>
|
||||
private readonly HashSet<string> _resolutionInFlight = new();
|
||||
|
||||
/// <summary>
|
||||
/// Subscribers: instanceUniqueName → IActorRef (the Instance Actor).
|
||||
/// </summary>
|
||||
@@ -80,6 +87,15 @@ public class DataConnectionActor : UntypedActor, IWithStash, IWithTimers
|
||||
private int _consecutiveUnstableDisconnects;
|
||||
private DateTimeOffset _lastConnectedAt;
|
||||
|
||||
/// <summary>
|
||||
/// DataConnectionLayer-011: monotonically increasing tag that identifies the
|
||||
/// current adapter instance. Subscription callbacks capture the generation in
|
||||
/// effect when they were created; a <see cref="TagValueReceived"/> whose
|
||||
/// generation no longer matches comes from a disposed adapter and is dropped so
|
||||
/// stale pre-failover device data is never forwarded to Instance Actors.
|
||||
/// </summary>
|
||||
private int _adapterGeneration;
|
||||
|
||||
/// <summary>
|
||||
/// Captured Self reference for use from non-actor threads (event handlers, callbacks).
|
||||
/// Akka.NET's Self property is only valid inside the actor's message loop.
|
||||
@@ -187,12 +203,6 @@ public class DataConnectionActor : UntypedActor, IWithStash, IWithTimers
|
||||
|
||||
// ── Connected State ──
|
||||
|
||||
/// <summary>
|
||||
/// Minimum time connected before we consider the connection stable.
|
||||
/// If we disconnect before this, it counts as an unstable connection toward failover.
|
||||
/// </summary>
|
||||
private static readonly TimeSpan StableConnectionThreshold = TimeSpan.FromSeconds(60);
|
||||
|
||||
private void BecomeConnected()
|
||||
{
|
||||
_log.Info("[{0}] Entering Connected state", _connectionName);
|
||||
@@ -263,7 +273,7 @@ public class DataConnectionActor : UntypedActor, IWithStash, IWithTimers
|
||||
// If we were connected for less than the stability threshold, this counts
|
||||
// as an unstable cycle (e.g., connect succeeded but heartbeat went stale).
|
||||
var connectionDuration = DateTimeOffset.UtcNow - _lastConnectedAt;
|
||||
if (_lastConnectedAt != default && connectionDuration < StableConnectionThreshold)
|
||||
if (_lastConnectedAt != default && connectionDuration < _options.StableConnectionThreshold)
|
||||
{
|
||||
_consecutiveUnstableDisconnects++;
|
||||
_log.Warning("[{0}] Unstable connection (lasted {1:F0}s) — consecutive unstable disconnects: {2}/{3}",
|
||||
@@ -298,6 +308,10 @@ public class DataConnectionActor : UntypedActor, IWithStash, IWithTimers
|
||||
_connectionDetails = newConfig;
|
||||
_adapter.Disconnected += OnAdapterDisconnected;
|
||||
|
||||
// DataConnectionLayer-011: new adapter — bump the generation so callbacks
|
||||
// from the disposed adapter are recognised as stale and dropped.
|
||||
_adapterGeneration++;
|
||||
|
||||
_log.Warning("[{0}] Failing over from {1} to {2} (unstable connection pattern)",
|
||||
_connectionName, previousEndpoint, _activeEndpoint);
|
||||
|
||||
@@ -306,7 +320,7 @@ public class DataConnectionActor : UntypedActor, IWithStash, IWithTimers
|
||||
_ = _siteEventLogger.LogEventAsync(
|
||||
"connection", "Warning", null, _connectionName,
|
||||
$"Failover from {previousEndpoint} to {_activeEndpoint} (unstable connection)",
|
||||
$"Connection lasted {connectionDuration.TotalSeconds:F0}s, threshold {StableConnectionThreshold.TotalSeconds:F0}s");
|
||||
$"Connection lasted {connectionDuration.TotalSeconds:F0}s, threshold {_options.StableConnectionThreshold.TotalSeconds:F0}s");
|
||||
}
|
||||
}
|
||||
|
||||
@@ -443,6 +457,10 @@ public class DataConnectionActor : UntypedActor, IWithStash, IWithTimers
|
||||
// Wire disconnect handler on new adapter
|
||||
_adapter.Disconnected += OnAdapterDisconnected;
|
||||
|
||||
// DataConnectionLayer-011: new adapter — bump the generation so callbacks
|
||||
// from the disposed adapter are recognised as stale and dropped.
|
||||
_adapterGeneration++;
|
||||
|
||||
_log.Warning("[{0}] Failing over from {1} to {2}",
|
||||
_connectionName, previousEndpoint, _activeEndpoint);
|
||||
|
||||
@@ -487,6 +505,9 @@ public class DataConnectionActor : UntypedActor, IWithStash, IWithTimers
|
||||
|
||||
var self = Self;
|
||||
var sender = Sender;
|
||||
// DataConnectionLayer-011: capture the current adapter generation so callbacks
|
||||
// from this adapter can be distinguished from a later (post-failover) adapter.
|
||||
var generation = _adapterGeneration;
|
||||
|
||||
// Snapshot the already-subscribed tag set on the actor thread. The background
|
||||
// task below must NOT read or mutate actor state — it performs only adapter
|
||||
@@ -513,7 +534,7 @@ public class DataConnectionActor : UntypedActor, IWithStash, IWithTimers
|
||||
{
|
||||
var subId = await _adapter.SubscribeAsync(tagPath, (path, value) =>
|
||||
{
|
||||
self.Tell(new TagValueReceived(path, value));
|
||||
self.Tell(new TagValueReceived(path, value, generation));
|
||||
});
|
||||
results.Add(new SubscribeTagResult(tagPath, AlreadySubscribed: false, Success: true, subId, null));
|
||||
tagsToSeed.Add(tagPath);
|
||||
@@ -541,7 +562,7 @@ public class DataConnectionActor : UntypedActor, IWithStash, IWithTimers
|
||||
var readResult = await _adapter.ReadAsync(tagPath);
|
||||
if (readResult.Success && readResult.Value != null)
|
||||
{
|
||||
self.Tell(new TagValueReceived(tagPath, readResult.Value));
|
||||
self.Tell(new TagValueReceived(tagPath, readResult.Value, generation));
|
||||
}
|
||||
}
|
||||
catch
|
||||
@@ -676,14 +697,34 @@ public class DataConnectionActor : UntypedActor, IWithStash, IWithTimers
|
||||
_ = _adapter.UnsubscribeAsync(subId);
|
||||
_subscriptionIds.Remove(tagPath);
|
||||
_unresolvedTags.Remove(tagPath);
|
||||
_resolutionInFlight.Remove(tagPath);
|
||||
_totalSubscribed--;
|
||||
if (!_unresolvedTags.Contains(tagPath))
|
||||
_resolvedTags--;
|
||||
|
||||
// DataConnectionLayer-006: drop the tag's tracked quality so it is no
|
||||
// longer counted by PushBadQualityForAllTags (which sets _tagsBadQuality
|
||||
// from _lastTagQuality.Count). Leaving it here drifts the quality
|
||||
// counters above _totalSubscribed across disconnect cycles.
|
||||
if (_lastTagQuality.Remove(tagPath, out var droppedQuality))
|
||||
{
|
||||
switch (droppedQuality)
|
||||
{
|
||||
case QualityCode.Good: _tagsGoodQuality--; break;
|
||||
case QualityCode.Bad: _tagsBadQuality--; break;
|
||||
case QualityCode.Uncertain: _tagsUncertainQuality--; break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
_subscriptionsByInstance.Remove(request.InstanceUniqueName);
|
||||
_subscribers.Remove(request.InstanceUniqueName);
|
||||
|
||||
// DataConnectionLayer-006: keep the reported quality counters in sync after the
|
||||
// unsubscribed tags' buckets were decremented above.
|
||||
_healthCollector.UpdateTagQuality(_connectionName, _tagsGoodQuality, _tagsBadQuality, _tagsUncertainQuality);
|
||||
_healthCollector.UpdateTagResolution(_connectionName, _totalSubscribed, _resolvedTags);
|
||||
}
|
||||
|
||||
// ── Write Support (WP-11) ──
|
||||
@@ -731,16 +772,29 @@ public class DataConnectionActor : UntypedActor, IWithStash, IWithTimers
|
||||
return;
|
||||
}
|
||||
|
||||
_log.Debug("[{0}] Retrying resolution for {1} unresolved tags", _connectionName, _unresolvedTags.Count);
|
||||
|
||||
var self = Self;
|
||||
var toResolve = _unresolvedTags.ToList();
|
||||
|
||||
// DataConnectionLayer-010: only dispatch retries for tags that do not already
|
||||
// have an attempt in flight. A slow SubscribeAsync overlapping the next tick
|
||||
// would otherwise produce duplicate concurrent subscribes for the same tag.
|
||||
var toResolve = _unresolvedTags.Where(t => !_resolutionInFlight.Contains(t)).ToList();
|
||||
|
||||
if (toResolve.Count == 0)
|
||||
{
|
||||
_log.Debug("[{0}] Tag-resolution retry skipped — {1} attempt(s) still in flight",
|
||||
_connectionName, _resolutionInFlight.Count);
|
||||
return;
|
||||
}
|
||||
|
||||
_log.Debug("[{0}] Retrying resolution for {1} unresolved tags", _connectionName, toResolve.Count);
|
||||
|
||||
var generation = _adapterGeneration;
|
||||
foreach (var tagPath in toResolve)
|
||||
{
|
||||
_resolutionInFlight.Add(tagPath);
|
||||
_adapter.SubscribeAsync(tagPath, (path, value) =>
|
||||
{
|
||||
self.Tell(new TagValueReceived(path, value));
|
||||
self.Tell(new TagValueReceived(path, value, generation));
|
||||
}).ContinueWith(t =>
|
||||
{
|
||||
if (t.IsCompletedSuccessfully)
|
||||
@@ -788,13 +842,25 @@ public class DataConnectionActor : UntypedActor, IWithStash, IWithTimers
|
||||
var self = Self;
|
||||
_subscriptionIds.Clear();
|
||||
_unresolvedTags.Clear();
|
||||
_resolutionInFlight.Clear();
|
||||
_resolvedTags = 0;
|
||||
|
||||
// DataConnectionLayer-006: reset the quality tracking too. Otherwise tags
|
||||
// resolved for the first time after reconnect (never in _lastTagQuality) only
|
||||
// increment their bucket and the totals drift above _totalSubscribed. They are
|
||||
// repopulated from fresh TagValueReceived messages once subscriptions activate.
|
||||
_lastTagQuality.Clear();
|
||||
_tagsGoodQuality = 0;
|
||||
_tagsBadQuality = 0;
|
||||
_tagsUncertainQuality = 0;
|
||||
_healthCollector.UpdateTagQuality(_connectionName, _tagsGoodQuality, _tagsBadQuality, _tagsUncertainQuality);
|
||||
|
||||
var generation = _adapterGeneration;
|
||||
foreach (var tagPath in allTags)
|
||||
{
|
||||
_adapter.SubscribeAsync(tagPath, (path, value) =>
|
||||
{
|
||||
self.Tell(new TagValueReceived(path, value));
|
||||
self.Tell(new TagValueReceived(path, value, generation));
|
||||
}).ContinueWith(t =>
|
||||
{
|
||||
if (t.IsCompletedSuccessfully)
|
||||
@@ -820,6 +886,9 @@ public class DataConnectionActor : UntypedActor, IWithStash, IWithTimers
|
||||
|
||||
private void HandleTagResolutionSucceeded(TagResolutionSucceeded msg)
|
||||
{
|
||||
// DataConnectionLayer-010: the retry attempt for this tag has completed.
|
||||
_resolutionInFlight.Remove(msg.TagPath);
|
||||
|
||||
if (_unresolvedTags.Remove(msg.TagPath))
|
||||
{
|
||||
_subscriptionIds[msg.TagPath] = msg.SubscriptionId;
|
||||
@@ -839,6 +908,10 @@ public class DataConnectionActor : UntypedActor, IWithStash, IWithTimers
|
||||
_log.Debug("[{0}] Tag resolution still failing for {1}: {2}",
|
||||
_connectionName, msg.TagPath, msg.Error);
|
||||
|
||||
// DataConnectionLayer-010: the retry attempt for this tag has completed —
|
||||
// it is eligible for the next retry tick again.
|
||||
_resolutionInFlight.Remove(msg.TagPath);
|
||||
|
||||
// Track as unresolved so periodic retry picks it up
|
||||
if (_unresolvedTags.Add(msg.TagPath))
|
||||
{
|
||||
@@ -852,6 +925,16 @@ public class DataConnectionActor : UntypedActor, IWithStash, IWithTimers
|
||||
|
||||
private void HandleTagValueReceived(TagValueReceived msg)
|
||||
{
|
||||
// DataConnectionLayer-011: drop values delivered by a disposed adapter. After a
|
||||
// failover the old adapter's OPC UA SDK threads may still fire callbacks; those
|
||||
// carry a stale generation and must not be forwarded to Instance Actors.
|
||||
if (msg.AdapterGeneration != _adapterGeneration)
|
||||
{
|
||||
_log.Debug("[{0}] Dropping stale tag value for {1} from adapter generation {2} (current {3})",
|
||||
_connectionName, msg.TagPath, msg.AdapterGeneration, _adapterGeneration);
|
||||
return;
|
||||
}
|
||||
|
||||
// Fan out to all subscribed instances
|
||||
foreach (var (instanceName, tags) in _subscriptionsByInstance)
|
||||
{
|
||||
@@ -892,7 +975,7 @@ public class DataConnectionActor : UntypedActor, IWithStash, IWithTimers
|
||||
internal record AttemptConnect;
|
||||
internal record ConnectResult(bool Success, string? Error);
|
||||
internal record AdapterDisconnected;
|
||||
internal record TagValueReceived(string TagPath, TagValue Value);
|
||||
internal record TagValueReceived(string TagPath, TagValue Value, int AdapterGeneration);
|
||||
internal record TagResolutionFailed(string TagPath, string Error);
|
||||
internal record TagResolutionSucceeded(string TagPath, string SubscriptionId);
|
||||
internal record RetryTagResolution;
|
||||
|
||||
@@ -14,7 +14,10 @@ public record OpcUaConnectionOptions(
|
||||
int SamplingIntervalMs = 1000,
|
||||
int QueueSize = 10,
|
||||
string SecurityMode = "None",
|
||||
bool AutoAcceptUntrustedCerts = true,
|
||||
// DataConnectionLayer-012: secure-by-default — untrusted server certificates are
|
||||
// rejected unless an operator explicitly opts in per connection. Accepting any
|
||||
// certificate defeats the Sign / SignAndEncrypt modes against a man-in-the-middle.
|
||||
bool AutoAcceptUntrustedCerts = false,
|
||||
bool DiscardOldest = true,
|
||||
byte SubscriptionPriority = 0,
|
||||
string SubscriptionDisplayName = "ScadaLink",
|
||||
|
||||
@@ -186,10 +186,26 @@ public class OpcUaDataConnection : IDataConnection
|
||||
|
||||
public async Task<IReadOnlyDictionary<string, ReadResult>> ReadBatchAsync(IEnumerable<string> tagPaths, CancellationToken cancellationToken = default)
|
||||
{
|
||||
// DataConnectionLayer-007: a single failing tag must not abort the whole batch.
|
||||
// ReadAsync re-throws non-cancellation exceptions; catch them per tag and record
|
||||
// a failed ReadResult so the caller receives a complete result map for every
|
||||
// requested tag (the ReadResult shape already carries per-tag Success/error).
|
||||
var results = new Dictionary<string, ReadResult>();
|
||||
foreach (var tagPath in tagPaths)
|
||||
{
|
||||
results[tagPath] = await ReadAsync(tagPath, cancellationToken);
|
||||
try
|
||||
{
|
||||
results[tagPath] = await ReadAsync(tagPath, cancellationToken);
|
||||
}
|
||||
catch (OperationCanceledException)
|
||||
{
|
||||
// Cancellation aborts the whole batch — propagate it.
|
||||
throw;
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
results[tagPath] = new ReadResult(false, null, ex.Message);
|
||||
}
|
||||
}
|
||||
return results;
|
||||
}
|
||||
|
||||
@@ -1,5 +1,7 @@
|
||||
using System.Collections.Concurrent;
|
||||
using System.Security.Cryptography.X509Certificates;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using Microsoft.Extensions.Logging.Abstractions;
|
||||
using Opc.Ua;
|
||||
using Opc.Ua.Client;
|
||||
using Opc.Ua.Configuration;
|
||||
@@ -25,10 +27,12 @@ public class RealOpcUaClient : IOpcUaClient
|
||||
private volatile bool _connectionLostFired;
|
||||
private OpcUaConnectionOptions _options = new();
|
||||
private readonly OpcUaGlobalOptions _globalOptions;
|
||||
private readonly ILogger<RealOpcUaClient> _logger;
|
||||
|
||||
public RealOpcUaClient(OpcUaGlobalOptions? globalOptions = null)
|
||||
public RealOpcUaClient(OpcUaGlobalOptions? globalOptions = null, ILogger<RealOpcUaClient>? logger = null)
|
||||
{
|
||||
_globalOptions = globalOptions ?? new OpcUaGlobalOptions();
|
||||
_logger = logger ?? NullLogger<RealOpcUaClient>.Instance;
|
||||
}
|
||||
|
||||
public bool IsConnected => _session?.Connected ?? false;
|
||||
@@ -65,7 +69,16 @@ public class RealOpcUaClient : IOpcUaClient
|
||||
|
||||
await appConfig.ValidateAsync(ApplicationType.Client);
|
||||
if (opts.AutoAcceptUntrustedCerts)
|
||||
{
|
||||
// DataConnectionLayer-012: this accepts ANY server certificate, defeating
|
||||
// certificate trust enforcement. Surface a prominent warning so an operator
|
||||
// who has opted in is aware of the man-in-the-middle exposure on the link.
|
||||
_logger.LogWarning(
|
||||
"OPC UA connection to {Endpoint} has AutoAcceptUntrustedCerts enabled — every " +
|
||||
"server certificate is accepted unconditionally. This defeats Sign / " +
|
||||
"SignAndEncrypt protection against a man-in-the-middle.", endpointUrl);
|
||||
appConfig.CertificateValidator.CertificateValidation += (_, e) => e.Accept = true;
|
||||
}
|
||||
|
||||
// Discover endpoints from the server, pick the preferred security mode
|
||||
EndpointDescription? endpoint;
|
||||
|
||||
@@ -13,4 +13,11 @@ public class DataConnectionOptions
|
||||
|
||||
/// <summary>Timeout for synchronous write operations to devices.</summary>
|
||||
public TimeSpan WriteTimeout { get; set; } = TimeSpan.FromSeconds(30);
|
||||
|
||||
/// <summary>
|
||||
/// Minimum time a connection must stay up before it is considered stable.
|
||||
/// If a connection drops before this threshold, it counts as an unstable
|
||||
/// disconnect toward the failover retry count (DataConnectionLayer-009).
|
||||
/// </summary>
|
||||
public TimeSpan StableConnectionThreshold { get; set; } = TimeSpan.FromSeconds(60);
|
||||
}
|
||||
|
||||
@@ -40,6 +40,7 @@ public class DeploymentService
|
||||
private readonly CommunicationService _communicationService;
|
||||
private readonly OperationLockManager _lockManager;
|
||||
private readonly IAuditService _auditService;
|
||||
private readonly DiffService _diffService;
|
||||
private readonly DeploymentManagerOptions _options;
|
||||
private readonly ILogger<DeploymentService> _logger;
|
||||
|
||||
@@ -58,6 +59,7 @@ public class DeploymentService
|
||||
CommunicationService communicationService,
|
||||
OperationLockManager lockManager,
|
||||
IAuditService auditService,
|
||||
DiffService diffService,
|
||||
IOptions<DeploymentManagerOptions> options,
|
||||
ILogger<DeploymentService> logger)
|
||||
{
|
||||
@@ -67,6 +69,7 @@ public class DeploymentService
|
||||
_communicationService = communicationService;
|
||||
_lockManager = lockManager;
|
||||
_auditService = auditService;
|
||||
_diffService = diffService;
|
||||
_options = options.Value;
|
||||
_logger = logger;
|
||||
}
|
||||
@@ -171,24 +174,47 @@ public class DeploymentService
|
||||
|
||||
var response = await _communicationService.DeployInstanceAsync(siteId, command, cancellationToken);
|
||||
|
||||
// WP-1: Update status based on site response
|
||||
// WP-1: Update status based on site response.
|
||||
record.Status = response.Status;
|
||||
record.ErrorMessage = response.ErrorMessage;
|
||||
record.CompletedAt = DateTimeOffset.UtcNow;
|
||||
|
||||
// DeploymentManager-003: once the site has confirmed the apply,
|
||||
// commit the deployment record's terminal status BEFORE touching
|
||||
// instance state and the deployed-config snapshot. If a later write
|
||||
// (instance update / snapshot store) fails, the recorded fact that
|
||||
// the site succeeded must NOT be lost -- otherwise central reports a
|
||||
// non-Success record while the site is running the new config.
|
||||
await _repository.UpdateDeploymentRecordAsync(record, cancellationToken);
|
||||
await _repository.SaveChangesAsync(cancellationToken);
|
||||
|
||||
if (response.Status == DeploymentStatus.Success)
|
||||
{
|
||||
// WP-4: Update instance state to Enabled on successful deployment
|
||||
instance.State = InstanceState.Enabled;
|
||||
await _repository.UpdateInstanceAsync(instance, cancellationToken);
|
||||
// The site has applied the deployment. The post-success
|
||||
// persistence below is best-effort: a failure here must be
|
||||
// logged loudly for operator reconciliation but must not flip
|
||||
// the already-committed Success record back to Failed.
|
||||
try
|
||||
{
|
||||
// WP-4: Update instance state to Enabled on successful deployment
|
||||
instance.State = InstanceState.Enabled;
|
||||
await _repository.UpdateInstanceAsync(instance, cancellationToken);
|
||||
|
||||
// WP-8: Store deployed config snapshot
|
||||
await StoreDeployedSnapshotAsync(instanceId, deploymentId, revisionHash, configJson, cancellationToken);
|
||||
// WP-8: Store deployed config snapshot
|
||||
await StoreDeployedSnapshotAsync(instanceId, deploymentId, revisionHash, configJson, cancellationToken);
|
||||
|
||||
await _repository.SaveChangesAsync(cancellationToken);
|
||||
}
|
||||
catch (Exception postEx)
|
||||
{
|
||||
_logger.LogError(postEx,
|
||||
"Deployment {DeploymentId} for instance {Instance} was applied by the site and " +
|
||||
"recorded Success, but post-success persistence (instance state / config snapshot) " +
|
||||
"failed -- central and site state may diverge until reconciled",
|
||||
deploymentId, instance.UniqueName);
|
||||
}
|
||||
}
|
||||
|
||||
await _repository.SaveChangesAsync(cancellationToken);
|
||||
|
||||
// Audit log
|
||||
await _auditService.LogAsync(user, "Deploy", "Instance", instanceId.ToString(),
|
||||
instance.UniqueName, new { DeploymentId = deploymentId, Status = record.Status.ToString() },
|
||||
@@ -368,8 +394,34 @@ public class DeploymentService
|
||||
// Delete means delete: remove the instance record entirely.
|
||||
// Deployment records, snapshot, overrides, and connection bindings
|
||||
// are removed with it (see repository implementation).
|
||||
await _repository.DeleteInstanceAsync(instanceId, cancellationToken);
|
||||
await _repository.SaveChangesAsync(cancellationToken);
|
||||
//
|
||||
// DeploymentManager-004: the site has already destroyed the Instance
|
||||
// Actor and removed its config. If the central record removal now
|
||||
// fails (DB error / concurrency), the exception must NOT escape
|
||||
// uncaught -- that would leave the central record orphaned and
|
||||
// un-deletable through the normal path (a re-issued delete may fail
|
||||
// because the site no longer has the instance). Surface a distinct
|
||||
// failure so an operator can reconcile.
|
||||
try
|
||||
{
|
||||
await _repository.DeleteInstanceAsync(instanceId, cancellationToken);
|
||||
await _repository.SaveChangesAsync(cancellationToken);
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
_logger.LogError(ex,
|
||||
"Instance {Instance} was deleted at the site, but the central record could not be " +
|
||||
"removed -- the central record is now orphaned and must be reconciled manually",
|
||||
instance.UniqueName);
|
||||
|
||||
await _auditService.LogAsync(user, "DeleteOrphaned", "Instance", instanceId.ToString(),
|
||||
instance.UniqueName, new { CommandId = commandId, Error = ex.Message },
|
||||
CancellationToken.None);
|
||||
|
||||
return Result<InstanceLifecycleResponse>.Failure(
|
||||
$"The site deleted instance '{instance.UniqueName}', but the central record could not " +
|
||||
$"be removed: {ex.Message}. The central record is orphaned and must be reconciled.");
|
||||
}
|
||||
}
|
||||
|
||||
await _auditService.LogAsync(user, "Delete", "Instance", instanceId.ToString(),
|
||||
@@ -383,7 +435,12 @@ public class DeploymentService
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// WP-8: Get the deployed config snapshot and compare with current template-derived state.
|
||||
/// WP-8: Get the deployed config snapshot and compare with current
|
||||
/// template-derived state. Produces both a staleness flag and — per the
|
||||
/// design's "Diff View" — a structured <see cref="ConfigurationDiff"/> of
|
||||
/// added/removed/changed attributes, alarms, and scripts (including data
|
||||
/// connection binding changes) computed by the TemplateEngine
|
||||
/// <see cref="DiffService"/>.
|
||||
/// </summary>
|
||||
public async Task<Result<DeploymentComparisonResult>> GetDeploymentComparisonAsync(
|
||||
int instanceId,
|
||||
@@ -398,15 +455,47 @@ public class DeploymentService
|
||||
if (currentResult.IsFailure)
|
||||
return Result<DeploymentComparisonResult>.Failure($"Cannot compute current config: {currentResult.Error}");
|
||||
|
||||
var currentConfig = currentResult.Value.Configuration;
|
||||
var currentHash = currentResult.Value.RevisionHash;
|
||||
var isStale = snapshot.RevisionHash != currentHash;
|
||||
|
||||
// DeploymentManager-007: deserialize the deployed snapshot and run the
|
||||
// TemplateEngine DiffService so the result carries real
|
||||
// added/removed/changed detail, not just a hash comparison. A snapshot
|
||||
// that cannot be deserialized (corrupt / older schema) still yields the
|
||||
// hash-based staleness result, with a null diff.
|
||||
ConfigurationDiff? diff = null;
|
||||
try
|
||||
{
|
||||
var deployedConfig = JsonSerializer.Deserialize<FlattenedConfiguration>(snapshot.ConfigurationJson);
|
||||
if (deployedConfig != null)
|
||||
{
|
||||
diff = _diffService.ComputeDiff(
|
||||
deployedConfig, currentConfig, snapshot.RevisionHash, currentHash);
|
||||
}
|
||||
else
|
||||
{
|
||||
_logger.LogWarning(
|
||||
"Deployed snapshot for instance {InstanceId} deserialized to null; " +
|
||||
"returning hash-based comparison without a structured diff",
|
||||
instanceId);
|
||||
}
|
||||
}
|
||||
catch (JsonException ex)
|
||||
{
|
||||
_logger.LogWarning(ex,
|
||||
"Could not deserialize deployed snapshot for instance {InstanceId}; " +
|
||||
"returning hash-based comparison without a structured diff",
|
||||
instanceId);
|
||||
}
|
||||
|
||||
var result = new DeploymentComparisonResult(
|
||||
instanceId,
|
||||
snapshot.RevisionHash,
|
||||
currentHash,
|
||||
isStale,
|
||||
snapshot.DeployedAt);
|
||||
snapshot.DeployedAt,
|
||||
diff);
|
||||
|
||||
return Result<DeploymentComparisonResult>.Success(result);
|
||||
}
|
||||
@@ -551,9 +640,16 @@ public class DeploymentService
|
||||
/// <summary>
|
||||
/// WP-8: Result of comparing deployed vs template-derived configuration.
|
||||
/// </summary>
|
||||
/// <param name="Diff">
|
||||
/// DeploymentManager-007: structured added/removed/changed detail for
|
||||
/// attributes, alarms, and scripts. Null only when the deployed snapshot could
|
||||
/// not be deserialized (corrupt / older schema), in which case
|
||||
/// <see cref="IsStale"/> still reflects the hash comparison.
|
||||
/// </param>
|
||||
public record DeploymentComparisonResult(
|
||||
int InstanceId,
|
||||
string DeployedRevisionHash,
|
||||
string CurrentRevisionHash,
|
||||
bool IsStale,
|
||||
DateTimeOffset DeployedAt);
|
||||
DateTimeOffset DeployedAt,
|
||||
ConfigurationDiff? Diff = null);
|
||||
|
||||
@@ -6,13 +6,34 @@ namespace ScadaLink.DeploymentManager;
|
||||
/// WP-3: Per-instance operation lock. Only one mutating operation (deploy, disable, enable, delete)
|
||||
/// may be in progress per instance at a time. Different instances can proceed in parallel.
|
||||
///
|
||||
/// Implementation: ConcurrentDictionary of SemaphoreSlim(1,1) keyed by instance unique name.
|
||||
/// Lock released on completion, timeout, or failure.
|
||||
/// Implementation: ConcurrentDictionary of ref-counted SemaphoreSlim(1,1) keyed by instance
|
||||
/// unique name. The lock is released on completion, timeout, or failure.
|
||||
/// Lost on central failover (acceptable per design -- in-progress treated as failed).
|
||||
///
|
||||
/// DeploymentManager-005: each entry is ref-counted. The semaphore is created on the
|
||||
/// first acquire/wait, shared while there are waiters or a holder, and removed +
|
||||
/// <see cref="IDisposable.Dispose"/>d when the last reference is released — so the dictionary
|
||||
/// does not accumulate one kernel wait handle per distinct instance name forever.
|
||||
/// </summary>
|
||||
public class OperationLockManager
|
||||
{
|
||||
private readonly ConcurrentDictionary<string, SemaphoreSlim> _locks = new(StringComparer.Ordinal);
|
||||
private readonly object _gate = new();
|
||||
private readonly Dictionary<string, LockEntry> _locks = new(StringComparer.Ordinal);
|
||||
|
||||
/// <summary>
|
||||
/// Number of lock entries currently tracked. Used for diagnostics and to
|
||||
/// verify that semaphores are reclaimed (DeploymentManager-005).
|
||||
/// </summary>
|
||||
public int TrackedLockCount
|
||||
{
|
||||
get
|
||||
{
|
||||
lock (_gate)
|
||||
{
|
||||
return _locks.Count;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Acquires the operation lock for the given instance. Returns a disposable that releases the lock.
|
||||
@@ -20,16 +41,40 @@ public class OperationLockManager
|
||||
/// </summary>
|
||||
public async Task<IDisposable> AcquireAsync(string instanceUniqueName, TimeSpan timeout, CancellationToken cancellationToken = default)
|
||||
{
|
||||
var semaphore = _locks.GetOrAdd(instanceUniqueName, _ => new SemaphoreSlim(1, 1));
|
||||
|
||||
if (!await semaphore.WaitAsync(timeout, cancellationToken))
|
||||
// Reserve a reference (creating the entry if needed) BEFORE waiting, so a
|
||||
// concurrent waiter for the same instance shares the same semaphore and
|
||||
// the entry survives until every waiter/holder has released it.
|
||||
LockEntry entry;
|
||||
lock (_gate)
|
||||
{
|
||||
throw new TimeoutException(
|
||||
$"Could not acquire operation lock for instance '{instanceUniqueName}' within {timeout.TotalSeconds}s. " +
|
||||
"Another mutating operation is in progress.");
|
||||
if (!_locks.TryGetValue(instanceUniqueName, out entry!))
|
||||
{
|
||||
entry = new LockEntry();
|
||||
_locks[instanceUniqueName] = entry;
|
||||
}
|
||||
entry.RefCount++;
|
||||
}
|
||||
|
||||
return new LockRelease(semaphore);
|
||||
try
|
||||
{
|
||||
if (!await entry.Semaphore.WaitAsync(timeout, cancellationToken))
|
||||
{
|
||||
throw new TimeoutException(
|
||||
$"Could not acquire operation lock for instance '{instanceUniqueName}' within {timeout.TotalSeconds}s. " +
|
||||
"Another mutating operation is in progress.");
|
||||
}
|
||||
}
|
||||
catch (Exception) when (DropReferenceOnFailure(instanceUniqueName, entry))
|
||||
{
|
||||
// DropReferenceOnFailure always returns false; the filter just runs
|
||||
// the cleanup so the reservation is not leaked when WaitAsync throws
|
||||
// or times out (TimeoutException / OperationCanceledException). The
|
||||
// exception still propagates. The semaphore was NOT entered on any
|
||||
// of these paths, so only the reference is dropped.
|
||||
throw;
|
||||
}
|
||||
|
||||
return new LockRelease(this, instanceUniqueName, entry);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
@@ -37,21 +82,73 @@ public class OperationLockManager
|
||||
/// </summary>
|
||||
public bool IsLocked(string instanceUniqueName)
|
||||
{
|
||||
return _locks.TryGetValue(instanceUniqueName, out var semaphore) && semaphore.CurrentCount == 0;
|
||||
lock (_gate)
|
||||
{
|
||||
return _locks.TryGetValue(instanceUniqueName, out var entry) && entry.Semaphore.CurrentCount == 0;
|
||||
}
|
||||
}
|
||||
|
||||
private bool DropReferenceOnFailure(string instanceUniqueName, LockEntry entry)
|
||||
{
|
||||
ReleaseReference(instanceUniqueName, entry, semaphoreWasEntered: false);
|
||||
return false;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Drops one reference to the entry. When <paramref name="semaphoreWasEntered"/>
|
||||
/// is true the semaphore is released first. When the reference count reaches
|
||||
/// zero the entry is removed from the dictionary and the semaphore disposed.
|
||||
/// </summary>
|
||||
private void ReleaseReference(string instanceUniqueName, LockEntry entry, bool semaphoreWasEntered)
|
||||
{
|
||||
lock (_gate)
|
||||
{
|
||||
// Release the semaphore (handing the lock to any waiter) and drop the
|
||||
// reference under the same gate, so the dispose decision below cannot
|
||||
// race with the Release on an entry that another caller is reclaiming.
|
||||
if (semaphoreWasEntered)
|
||||
{
|
||||
entry.Semaphore.Release();
|
||||
}
|
||||
|
||||
entry.RefCount--;
|
||||
if (entry.RefCount <= 0 &&
|
||||
_locks.TryGetValue(instanceUniqueName, out var current) &&
|
||||
ReferenceEquals(current, entry))
|
||||
{
|
||||
_locks.Remove(instanceUniqueName);
|
||||
entry.Semaphore.Dispose();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private sealed class LockEntry
|
||||
{
|
||||
public readonly SemaphoreSlim Semaphore = new(1, 1);
|
||||
|
||||
/// <summary>Number of in-flight acquires (waiters + the current holder). Guarded by <see cref="_gate"/>.</summary>
|
||||
public int RefCount;
|
||||
}
|
||||
|
||||
private sealed class LockRelease : IDisposable
|
||||
{
|
||||
private readonly SemaphoreSlim _semaphore;
|
||||
private readonly OperationLockManager _owner;
|
||||
private readonly string _instanceUniqueName;
|
||||
private readonly LockEntry _entry;
|
||||
private int _disposed;
|
||||
|
||||
public LockRelease(SemaphoreSlim semaphore) => _semaphore = semaphore;
|
||||
public LockRelease(OperationLockManager owner, string instanceUniqueName, LockEntry entry)
|
||||
{
|
||||
_owner = owner;
|
||||
_instanceUniqueName = instanceUniqueName;
|
||||
_entry = entry;
|
||||
}
|
||||
|
||||
public void Dispose()
|
||||
{
|
||||
if (Interlocked.CompareExchange(ref _disposed, 1, 0) == 0)
|
||||
{
|
||||
_semaphore.Release();
|
||||
_owner.ReleaseReference(_instanceUniqueName, _entry, semaphoreWasEntered: true);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -11,6 +11,7 @@
|
||||
<PackageReference Include="Microsoft.Extensions.DependencyInjection.Abstractions" />
|
||||
<PackageReference Include="Microsoft.Extensions.Logging.Abstractions" />
|
||||
<PackageReference Include="Microsoft.Extensions.Options" />
|
||||
<PackageReference Include="Microsoft.Extensions.Options.ConfigurationExtensions" />
|
||||
</ItemGroup>
|
||||
|
||||
<ItemGroup>
|
||||
|
||||
@@ -1,9 +1,41 @@
|
||||
using Microsoft.Extensions.Configuration;
|
||||
using Microsoft.Extensions.DependencyInjection;
|
||||
|
||||
namespace ScadaLink.DeploymentManager;
|
||||
|
||||
public static class ServiceCollectionExtensions
|
||||
{
|
||||
/// <summary>
|
||||
/// Configuration section that <see cref="DeploymentManagerOptions"/> is bound to.
|
||||
/// </summary>
|
||||
public const string OptionsSection = "ScadaLink:DeploymentManager";
|
||||
|
||||
/// <summary>
|
||||
/// Registers the Deployment Manager services and binds
|
||||
/// <see cref="DeploymentManagerOptions"/> to the
|
||||
/// <see cref="OptionsSection"/> configuration section, consistent with the
|
||||
/// Options-pattern convention ("Per-component configuration via
|
||||
/// appsettings.json sections bound to options classes").
|
||||
/// </summary>
|
||||
public static IServiceCollection AddDeploymentManager(
|
||||
this IServiceCollection services,
|
||||
IConfiguration configuration)
|
||||
{
|
||||
ArgumentNullException.ThrowIfNull(configuration);
|
||||
|
||||
// DeploymentManager-008: bind the options class so the operation-lock
|
||||
// and artifact-deployment timeouts are tunable via appsettings.json.
|
||||
services.Configure<DeploymentManagerOptions>(configuration.GetSection(OptionsSection));
|
||||
return services.AddDeploymentManager();
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Registers the Deployment Manager services without binding options to
|
||||
/// configuration. <see cref="DeploymentManagerOptions"/> falls back to its
|
||||
/// declared defaults unless configured elsewhere. Prefer the
|
||||
/// <see cref="AddDeploymentManager(IServiceCollection, IConfiguration)"/>
|
||||
/// overload so the options are bound to <c>appsettings.json</c>.
|
||||
/// </summary>
|
||||
public static IServiceCollection AddDeploymentManager(this IServiceCollection services)
|
||||
{
|
||||
services.AddSingleton<OperationLockManager>();
|
||||
|
||||
@@ -45,11 +45,29 @@ public class DatabaseGateway : IDatabaseGateway
|
||||
throw new InvalidOperationException($"Database connection '{connectionName}' not found");
|
||||
}
|
||||
|
||||
var connection = new SqlConnection(definition.ConnectionString);
|
||||
await connection.OpenAsync(cancellationToken);
|
||||
var connection = CreateConnection(definition.ConnectionString);
|
||||
try
|
||||
{
|
||||
await connection.OpenAsync(cancellationToken);
|
||||
}
|
||||
catch
|
||||
{
|
||||
// OpenAsync failed (unreachable server, bad credentials, cancellation) —
|
||||
// dispose the just-created connection before the exception propagates so
|
||||
// it is not leaked (ExternalSystemGateway-010).
|
||||
await connection.DisposeAsync();
|
||||
throw;
|
||||
}
|
||||
return connection;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Creates the underlying ADO.NET connection for a connection string. Virtual so
|
||||
/// tests can substitute a connection whose <c>OpenAsync</c> fails.
|
||||
/// </summary>
|
||||
internal virtual DbConnection CreateConnection(string connectionString) =>
|
||||
new SqlConnection(connectionString);
|
||||
|
||||
/// <summary>
|
||||
/// Submits a SQL write to the store-and-forward engine for reliable delivery.
|
||||
/// </summary>
|
||||
@@ -78,12 +96,15 @@ public class DatabaseGateway : IDatabaseGateway
|
||||
Parameters = parameters
|
||||
});
|
||||
|
||||
// The per-connection retry settings are passed through verbatim — a
|
||||
// configured MaxRetries of 0 means "never retry" and must NOT be
|
||||
// collapsed to the S&F default (ExternalSystemGateway-004).
|
||||
await _storeAndForward.EnqueueAsync(
|
||||
StoreAndForwardCategory.CachedDbWrite,
|
||||
connectionName,
|
||||
payload,
|
||||
originInstanceName,
|
||||
definition.MaxRetries > 0 ? definition.MaxRetries : null,
|
||||
definition.MaxRetries,
|
||||
definition.RetryDelay > TimeSpan.Zero ? definition.RetryDelay : null);
|
||||
}
|
||||
|
||||
|
||||
@@ -113,12 +113,16 @@ public class ExternalSystemClient : IExternalSystemClient
|
||||
// attemptImmediateDelivery: false — this method already made the HTTP
|
||||
// attempt above; letting EnqueueAsync re-invoke the handler would
|
||||
// dispatch the same request a second time.
|
||||
//
|
||||
// The per-system retry settings are passed through verbatim — a
|
||||
// configured MaxRetries of 0 means "never retry" and must NOT be
|
||||
// collapsed to the S&F default (ExternalSystemGateway-004).
|
||||
await _storeAndForward.EnqueueAsync(
|
||||
StoreAndForwardCategory.ExternalSystem,
|
||||
systemName,
|
||||
payload,
|
||||
originInstanceName,
|
||||
system.MaxRetries > 0 ? system.MaxRetries : null,
|
||||
system.MaxRetries,
|
||||
system.RetryDelay > TimeSpan.Zero ? system.RetryDelay : null,
|
||||
attemptImmediateDelivery: false);
|
||||
|
||||
@@ -183,7 +187,11 @@ public class ExternalSystemClient : IExternalSystemClient
|
||||
var client = _httpClientFactory.CreateClient($"ExternalSystem_{system.Name}");
|
||||
|
||||
var url = BuildUrl(system.EndpointUrl, method.Path, parameters, method.HttpMethod);
|
||||
var request = new HttpRequestMessage(new HttpMethod(method.HttpMethod), url);
|
||||
|
||||
// The request and response own IDisposable resources (StringContent, the
|
||||
// response content stream). Dispose both, including on the exception paths
|
||||
// (ExternalSystemGateway-005).
|
||||
using var request = new HttpRequestMessage(new HttpMethod(method.HttpMethod), url);
|
||||
|
||||
// Apply authentication
|
||||
ApplyAuth(request, system);
|
||||
@@ -232,44 +240,75 @@ public class ExternalSystemClient : IExternalSystemClient
|
||||
throw ErrorClassifier.AsTransient($"Connection error to {system.Name}: {ex.Message}", ex);
|
||||
}
|
||||
|
||||
// The timeout also covers reading the response body (the design's
|
||||
// "round-trip" guarantee), so the linked token is used for the read too.
|
||||
string body;
|
||||
try
|
||||
using (response)
|
||||
{
|
||||
body = await response.Content.ReadAsStringAsync(linkedCts.Token);
|
||||
// The timeout also covers reading the response body (the design's
|
||||
// "round-trip" guarantee), so the linked token is used for the read too.
|
||||
string body;
|
||||
try
|
||||
{
|
||||
body = await response.Content.ReadAsStringAsync(linkedCts.Token);
|
||||
}
|
||||
catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested)
|
||||
{
|
||||
throw;
|
||||
}
|
||||
catch (OperationCanceledException ex) when (timeoutCts.IsCancellationRequested)
|
||||
{
|
||||
throw ErrorClassifier.AsTransient(
|
||||
$"Timeout reading response from {system.Name} after {_options.DefaultHttpTimeout.TotalSeconds:0.##}s", ex);
|
||||
}
|
||||
|
||||
if (response.IsSuccessStatusCode)
|
||||
{
|
||||
return body;
|
||||
}
|
||||
|
||||
// Bound the external error body before embedding it into a
|
||||
// script-visible message / event-log entry — a misbehaving or hostile
|
||||
// endpoint must not be able to inflate every error string
|
||||
// (ExternalSystemGateway-007).
|
||||
var errorBody = Truncate(body, MaxErrorBodyChars);
|
||||
|
||||
if (ErrorClassifier.IsTransient(response.StatusCode))
|
||||
{
|
||||
throw ErrorClassifier.AsTransient(
|
||||
$"HTTP {(int)response.StatusCode} from {system.Name}: {errorBody}");
|
||||
}
|
||||
|
||||
throw new PermanentExternalSystemException(
|
||||
$"HTTP {(int)response.StatusCode} from {system.Name}: {errorBody}",
|
||||
(int)response.StatusCode);
|
||||
}
|
||||
catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested)
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Upper bound (characters) on an external error response body echoed into a
|
||||
/// script-visible error message — see ExternalSystemGateway-007.
|
||||
/// </summary>
|
||||
private const int MaxErrorBodyChars = 2048;
|
||||
|
||||
private static string Truncate(string value, int maxChars)
|
||||
{
|
||||
if (string.IsNullOrEmpty(value) || value.Length <= maxChars)
|
||||
{
|
||||
throw;
|
||||
}
|
||||
catch (OperationCanceledException ex) when (timeoutCts.IsCancellationRequested)
|
||||
{
|
||||
throw ErrorClassifier.AsTransient(
|
||||
$"Timeout reading response from {system.Name} after {_options.DefaultHttpTimeout.TotalSeconds:0.##}s", ex);
|
||||
return value;
|
||||
}
|
||||
|
||||
if (response.IsSuccessStatusCode)
|
||||
{
|
||||
return body;
|
||||
}
|
||||
|
||||
var errorBody = body;
|
||||
|
||||
if (ErrorClassifier.IsTransient(response.StatusCode))
|
||||
{
|
||||
throw ErrorClassifier.AsTransient(
|
||||
$"HTTP {(int)response.StatusCode} from {system.Name}: {errorBody}");
|
||||
}
|
||||
|
||||
throw new PermanentExternalSystemException(
|
||||
$"HTTP {(int)response.StatusCode} from {system.Name}: {errorBody}",
|
||||
(int)response.StatusCode);
|
||||
return value.Substring(0, maxChars) + $"… [truncated, {value.Length} chars total]";
|
||||
}
|
||||
|
||||
private static string BuildUrl(string baseUrl, string path, IReadOnlyDictionary<string, object?>? parameters, string httpMethod)
|
||||
{
|
||||
var url = baseUrl.TrimEnd('/') + "/" + path.TrimStart('/');
|
||||
// A method that targets the base URL itself has an empty (or "/") path.
|
||||
// Appending a trailing "/" in that case yields ".../api/" which some
|
||||
// servers treat as a distinct resource — only append a segment when the
|
||||
// method actually defines a non-empty relative path (ExternalSystemGateway-006).
|
||||
var trimmedBase = baseUrl.TrimEnd('/');
|
||||
var trimmedPath = path.Trim().TrimStart('/');
|
||||
var url = string.IsNullOrEmpty(trimmedPath)
|
||||
? trimmedBase
|
||||
: trimmedBase + "/" + trimmedPath;
|
||||
|
||||
// For GET/DELETE, append parameters as query string
|
||||
if ((httpMethod.Equals("GET", StringComparison.OrdinalIgnoreCase) ||
|
||||
|
||||
@@ -103,17 +103,42 @@ public class CentralHealthAggregator : BackgroundService, ICentralHealthAggregat
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Bumps the last-seen timestamp for a site already known via a prior
|
||||
/// SiteHealthReport. Heartbeats from sites we have not yet received a
|
||||
/// full report from are ignored — registration only happens on report.
|
||||
/// The update is an atomic compare-and-swap of the immutable state.
|
||||
/// Bumps the last-seen timestamp for a site. If a heartbeat arrives for a
|
||||
/// site the aggregator has no state for yet (e.g. immediately after a central
|
||||
/// restart/failover, when in-memory state is empty), the site is registered
|
||||
/// as online with no <see cref="SiteHealthState.LatestReport"/> — heartbeats
|
||||
/// prove the site is reachable, so it shows online straight away rather than
|
||||
/// as "unknown" for up to a full report interval. The update is an atomic
|
||||
/// compare-and-swap of the immutable state.
|
||||
/// </summary>
|
||||
public void MarkHeartbeat(string siteId, DateTimeOffset receivedAt)
|
||||
{
|
||||
while (true)
|
||||
{
|
||||
if (!_siteStates.TryGetValue(siteId, out var existing))
|
||||
return;
|
||||
{
|
||||
// Unknown site — register it as online, awaiting its first
|
||||
// full report. LatestReport stays null until ProcessReport runs.
|
||||
var registered = new SiteHealthState
|
||||
{
|
||||
SiteId = siteId,
|
||||
LatestReport = null,
|
||||
LastReportReceivedAt = default,
|
||||
LastHeartbeatAt = receivedAt,
|
||||
LastSequenceNumber = 0,
|
||||
IsOnline = true
|
||||
};
|
||||
|
||||
if (_siteStates.TryAdd(siteId, registered))
|
||||
{
|
||||
_logger.LogInformation(
|
||||
"Site {SiteId} registered online via heartbeat (awaiting first report)", siteId);
|
||||
return;
|
||||
}
|
||||
|
||||
// Lost the race — another thread registered first; retry as an update.
|
||||
continue;
|
||||
}
|
||||
|
||||
var newHeartbeat = receivedAt > existing.LastHeartbeatAt
|
||||
? receivedAt
|
||||
@@ -163,10 +188,10 @@ public class CentralHealthAggregator : BackgroundService, ICentralHealthAggregat
|
||||
protected override async Task ExecuteAsync(CancellationToken stoppingToken)
|
||||
{
|
||||
_logger.LogInformation(
|
||||
"Central health aggregator started, offline timeout {Timeout}s",
|
||||
_options.OfflineTimeout.TotalSeconds);
|
||||
"Central health aggregator started, offline timeout {Timeout}s (central {CentralTimeout}s)",
|
||||
_options.OfflineTimeout.TotalSeconds, _options.CentralOfflineTimeout.TotalSeconds);
|
||||
|
||||
// Check at half the offline timeout interval for timely detection
|
||||
// Check at half the (shorter) offline timeout interval for timely detection
|
||||
var checkInterval = TimeSpan.FromMilliseconds(_options.OfflineTimeout.TotalMilliseconds / 2);
|
||||
using var timer = new PeriodicTimer(checkInterval);
|
||||
|
||||
@@ -189,8 +214,17 @@ public class CentralHealthAggregator : BackgroundService, ICentralHealthAggregat
|
||||
// healthy site node (cadence owned by Cluster Infrastructure /
|
||||
// SiteCommunicationActor), so OfflineTimeout only fires when no
|
||||
// node can reach central, not during single-node failovers.
|
||||
//
|
||||
// The synthetic "central" site has no heartbeat source — its only
|
||||
// signal is the 30s CentralHealthReportLoop self-report — so it gets
|
||||
// a longer grace window (CentralOfflineTimeout) to survive a single
|
||||
// skipped/late self-report.
|
||||
var timeout = kvp.Key == CentralHealthReportLoop.CentralSiteId
|
||||
? _options.CentralOfflineTimeout
|
||||
: _options.OfflineTimeout;
|
||||
|
||||
var elapsed = now - state.LastHeartbeatAt;
|
||||
if (elapsed <= _options.OfflineTimeout)
|
||||
if (elapsed <= timeout)
|
||||
continue;
|
||||
|
||||
// Atomically swap to an offline copy. If the CAS loses to a
|
||||
@@ -201,7 +235,7 @@ public class CentralHealthAggregator : BackgroundService, ICentralHealthAggregat
|
||||
{
|
||||
_logger.LogWarning(
|
||||
"Site {SiteId} marked offline — no signal for {Elapsed}s (timeout: {Timeout}s)",
|
||||
state.SiteId, elapsed.TotalSeconds, _options.OfflineTimeout.TotalSeconds);
|
||||
state.SiteId, elapsed.TotalSeconds, timeout.TotalSeconds);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -4,4 +4,17 @@ public class HealthMonitoringOptions
|
||||
{
|
||||
public TimeSpan ReportInterval { get; set; } = TimeSpan.FromSeconds(30);
|
||||
public TimeSpan OfflineTimeout { get; set; } = TimeSpan.FromMinutes(1);
|
||||
|
||||
/// <summary>
|
||||
/// Offline timeout applied to the synthetic "central" site only. Real sites
|
||||
/// emit frequent heartbeats that keep <c>LastHeartbeatAt</c> fresh, so the
|
||||
/// normal <see cref="OfflineTimeout"/> only fires on genuine total loss. The
|
||||
/// "central" self-report has no heartbeat source — its only signal is the
|
||||
/// 30s <see cref="CentralHealthReportLoop"/>, so a single skipped/late
|
||||
/// self-report (leader GC pause, brief stall, mid-failover before the new
|
||||
/// leader's loop spins up) would flap it offline under the 60s site timeout.
|
||||
/// A longer central grace gives the equivalent of "one missed report" that
|
||||
/// the design doc grants real sites. Default: 3x the report interval.
|
||||
/// </summary>
|
||||
public TimeSpan CentralOfflineTimeout { get; set; } = TimeSpan.FromMinutes(3);
|
||||
}
|
||||
|
||||
@@ -11,10 +11,13 @@ public interface ICentralHealthAggregator
|
||||
void ProcessReport(SiteHealthReport report);
|
||||
|
||||
/// <summary>
|
||||
/// Bumps the last-seen timestamp for a site already known via a prior
|
||||
/// SiteHealthReport. Used to keep a site marked online between full
|
||||
/// 30s reports when ~2s heartbeats are arriving — protects against the
|
||||
/// 60s offline threshold firing on a transiently delayed report.
|
||||
/// Bumps the last-seen timestamp for a site, keeping it marked online
|
||||
/// between full 30s reports when heartbeats are arriving — protects against
|
||||
/// the offline threshold firing on a transiently delayed report. A heartbeat
|
||||
/// for a site with no aggregator state yet (e.g. just after a central
|
||||
/// restart/failover) registers that site as online with no
|
||||
/// <see cref="SiteHealthState.LatestReport"/>, so reachable sites are not
|
||||
/// shown as "unknown" during the failover window.
|
||||
/// </summary>
|
||||
void MarkHeartbeat(string siteId, DateTimeOffset receivedAt);
|
||||
|
||||
|
||||
@@ -124,4 +124,35 @@ public class AuditServiceTests : IDisposable
|
||||
Assert.DoesNotContain(methods, m => m.Name.Contains("Update", StringComparison.OrdinalIgnoreCase));
|
||||
Assert.DoesNotContain(methods, m => m.Name.Contains("Delete", StringComparison.OrdinalIgnoreCase));
|
||||
}
|
||||
|
||||
// Self-referential POCO used to reproduce a reference cycle in afterState.
|
||||
private sealed class CyclicNode
|
||||
{
|
||||
public string Name { get; set; } = "node";
|
||||
public CyclicNode? Self { get; set; }
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task LogAsync_AfterStateWithReferenceCycle_DoesNotThrow_AndDoesNotRollBackOperation()
|
||||
{
|
||||
// Regression guard for ConfigurationDatabase-007: serializing an afterState
|
||||
// object that contains a reference cycle must not throw a JsonException —
|
||||
// that would roll back the entire business operation it is auditing.
|
||||
var node = new CyclicNode();
|
||||
node.Self = node; // reference cycle
|
||||
|
||||
var template = new Template("CyclicAuditTemplate");
|
||||
_context.Templates.Add(template);
|
||||
|
||||
// Must not throw.
|
||||
await _auditService.LogAsync("admin", "Create", "Template", "1", "CyclicAuditTemplate", node);
|
||||
|
||||
// The audited business operation must still commit successfully.
|
||||
await _context.SaveChangesAsync();
|
||||
|
||||
var audit = await _context.AuditLogEntries.SingleAsync();
|
||||
Assert.NotNull(audit.AfterStateJson);
|
||||
Assert.Contains("node", audit.AfterStateJson);
|
||||
Assert.Single(await _context.Templates.Where(t => t.Name == "CyclicAuditTemplate").ToListAsync());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -0,0 +1,61 @@
|
||||
using ScadaLink.ConfigurationDatabase;
|
||||
|
||||
namespace ScadaLink.ConfigurationDatabase.Tests;
|
||||
|
||||
public class DesignTimeDbContextFactoryTests : IDisposable
|
||||
{
|
||||
private const string EnvVar = "SCADALINK_DESIGNTIME_CONNECTIONSTRING";
|
||||
private readonly string? _originalEnv;
|
||||
|
||||
public DesignTimeDbContextFactoryTests()
|
||||
{
|
||||
_originalEnv = Environment.GetEnvironmentVariable(EnvVar);
|
||||
}
|
||||
|
||||
public void Dispose()
|
||||
{
|
||||
Environment.SetEnvironmentVariable(EnvVar, _originalEnv);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void CreateDbContext_NoConnectionStringConfigured_ThrowsClearException()
|
||||
{
|
||||
// Regression guard for ConfigurationDatabase-002: the factory must not fall back
|
||||
// to a hardcoded `sa`/password literal. With nothing configured it must fail loudly
|
||||
// with an actionable message instead of silently pointing tooling at a guessed DB.
|
||||
Environment.SetEnvironmentVariable(EnvVar, null);
|
||||
var factory = new DesignTimeDbContextFactory();
|
||||
|
||||
var ex = Assert.Throws<InvalidOperationException>(() => factory.CreateDbContext(Array.Empty<string>()));
|
||||
|
||||
Assert.Contains("connection string", ex.Message, StringComparison.OrdinalIgnoreCase);
|
||||
// The message must not leak / suggest a hardcoded `sa` credential.
|
||||
Assert.DoesNotContain("sa", ex.Message.Split(' '), StringComparer.OrdinalIgnoreCase);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void CreateDbContext_ConnectionStringFromEnvironmentVariable_IsUsed()
|
||||
{
|
||||
// The design-time connection string may be supplied via an environment variable
|
||||
// rather than a source literal.
|
||||
Environment.SetEnvironmentVariable(EnvVar,
|
||||
"Server=localhost,1433;Database=ScadaLink_Config;Trusted_Connection=True;TrustServerCertificate=True");
|
||||
var factory = new DesignTimeDbContextFactory();
|
||||
|
||||
using var context = factory.CreateDbContext(Array.Empty<string>());
|
||||
|
||||
Assert.NotNull(context);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void DesignTimeDbContextFactory_SourceContainsNoHardcodedSaCredential()
|
||||
{
|
||||
// Belt-and-braces: assert no `sa`/password literal exists in the compiled type's
|
||||
// behaviour by confirming the no-config path throws rather than connecting.
|
||||
Environment.SetEnvironmentVariable(EnvVar, null);
|
||||
var factory = new DesignTimeDbContextFactory();
|
||||
|
||||
var ex = Assert.Throws<InvalidOperationException>(() => factory.CreateDbContext(Array.Empty<string>()));
|
||||
Assert.DoesNotContain("YourPassword", ex.Message);
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,129 @@
|
||||
using Microsoft.AspNetCore.DataProtection;
|
||||
using Microsoft.EntityFrameworkCore;
|
||||
using Microsoft.EntityFrameworkCore.Diagnostics;
|
||||
using ScadaLink.Commons.Entities.ExternalSystems;
|
||||
using ScadaLink.Commons.Entities.Notifications;
|
||||
using ScadaLink.ConfigurationDatabase;
|
||||
|
||||
namespace ScadaLink.ConfigurationDatabase.Tests;
|
||||
|
||||
/// <summary>
|
||||
/// Regression guard for ConfigurationDatabase-004: secret-bearing columns
|
||||
/// (SMTP credentials, external-system auth config, database connection strings)
|
||||
/// must be encrypted at rest, not persisted verbatim.
|
||||
/// </summary>
|
||||
public class SecretEncryptionTests : IDisposable
|
||||
{
|
||||
private readonly ScadaLinkDbContext _context;
|
||||
private readonly IDataProtectionProvider _protectionProvider;
|
||||
|
||||
public SecretEncryptionTests()
|
||||
{
|
||||
_protectionProvider = new EphemeralDataProtectionProvider();
|
||||
|
||||
var options = new DbContextOptionsBuilder<ScadaLinkDbContext>()
|
||||
.UseSqlite("DataSource=:memory:")
|
||||
.ConfigureWarnings(w => w.Ignore(RelationalEventId.PendingModelChangesWarning))
|
||||
.Options;
|
||||
|
||||
_context = new ScadaLinkDbContext(options, _protectionProvider);
|
||||
_context.Database.OpenConnection();
|
||||
_context.Database.EnsureCreated();
|
||||
}
|
||||
|
||||
public void Dispose()
|
||||
{
|
||||
_context.Database.CloseConnection();
|
||||
_context.Dispose();
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task DatabaseConnectionDefinition_ConnectionString_StoredEncrypted_RoundTrips()
|
||||
{
|
||||
const string secret = "Server=db;Database=X;User Id=svc;Password=SuperSecret123!";
|
||||
var def = new DatabaseConnectionDefinition("PrimaryDb", secret);
|
||||
_context.DatabaseConnectionDefinitions.Add(def);
|
||||
await _context.SaveChangesAsync();
|
||||
_context.ChangeTracker.Clear();
|
||||
|
||||
// Raw column value must not be the plaintext secret.
|
||||
var raw = await ReadRawColumnAsync("DatabaseConnectionDefinitions", "ConnectionString", def.Id);
|
||||
Assert.NotNull(raw);
|
||||
Assert.NotEqual(secret, raw);
|
||||
Assert.DoesNotContain("SuperSecret123!", raw);
|
||||
|
||||
// Reading back through EF must transparently decrypt.
|
||||
var loaded = await _context.DatabaseConnectionDefinitions.SingleAsync(d => d.Id == def.Id);
|
||||
Assert.Equal(secret, loaded.ConnectionString);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task SmtpConfiguration_Credentials_StoredEncrypted_RoundTrips()
|
||||
{
|
||||
const string secret = "client_secret=oauth2-abc-very-secret";
|
||||
var smtp = new SmtpConfiguration("smtp.example.com", "OAuth2", "noreply@example.com")
|
||||
{
|
||||
Credentials = secret
|
||||
};
|
||||
_context.SmtpConfigurations.Add(smtp);
|
||||
await _context.SaveChangesAsync();
|
||||
_context.ChangeTracker.Clear();
|
||||
|
||||
var raw = await ReadRawColumnAsync("SmtpConfigurations", "Credentials", smtp.Id);
|
||||
Assert.NotNull(raw);
|
||||
Assert.NotEqual(secret, raw);
|
||||
Assert.DoesNotContain("oauth2-abc-very-secret", raw);
|
||||
|
||||
var loaded = await _context.SmtpConfigurations.SingleAsync(s => s.Id == smtp.Id);
|
||||
Assert.Equal(secret, loaded.Credentials);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task ExternalSystemDefinition_AuthConfiguration_StoredEncrypted_RoundTrips()
|
||||
{
|
||||
const string secret = "{\"apiKey\":\"live-key-do-not-leak\"}";
|
||||
var ext = new ExternalSystemDefinition("Erp", "https://erp.example.com", "ApiKey")
|
||||
{
|
||||
AuthConfiguration = secret
|
||||
};
|
||||
_context.ExternalSystemDefinitions.Add(ext);
|
||||
await _context.SaveChangesAsync();
|
||||
_context.ChangeTracker.Clear();
|
||||
|
||||
var raw = await ReadRawColumnAsync("ExternalSystemDefinitions", "AuthConfiguration", ext.Id);
|
||||
Assert.NotNull(raw);
|
||||
Assert.NotEqual(secret, raw);
|
||||
Assert.DoesNotContain("live-key-do-not-leak", raw);
|
||||
|
||||
var loaded = await _context.ExternalSystemDefinitions.SingleAsync(e => e.Id == ext.Id);
|
||||
Assert.Equal(secret, loaded.AuthConfiguration);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task SmtpConfiguration_NullCredentials_RoundTripsAsNull()
|
||||
{
|
||||
var smtp = new SmtpConfiguration("smtp.example.com", "None", "noreply@example.com")
|
||||
{
|
||||
Credentials = null
|
||||
};
|
||||
_context.SmtpConfigurations.Add(smtp);
|
||||
await _context.SaveChangesAsync();
|
||||
_context.ChangeTracker.Clear();
|
||||
|
||||
var loaded = await _context.SmtpConfigurations.SingleAsync(s => s.Id == smtp.Id);
|
||||
Assert.Null(loaded.Credentials);
|
||||
}
|
||||
|
||||
private async Task<string?> ReadRawColumnAsync(string table, string column, int id)
|
||||
{
|
||||
var connection = _context.Database.GetDbConnection();
|
||||
await using var cmd = connection.CreateCommand();
|
||||
cmd.CommandText = $"SELECT \"{column}\" FROM \"{table}\" WHERE \"Id\" = $id";
|
||||
var p = cmd.CreateParameter();
|
||||
p.ParameterName = "$id";
|
||||
p.Value = id;
|
||||
cmd.Parameters.Add(p);
|
||||
var result = await cmd.ExecuteScalarAsync();
|
||||
return result == null || result == DBNull.Value ? null : (string)result;
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,57 @@
|
||||
using System.Reflection;
|
||||
using Microsoft.Extensions.DependencyInjection;
|
||||
using ScadaLink.Commons.Interfaces.Repositories;
|
||||
using ScadaLink.Commons.Interfaces.Services;
|
||||
using ScadaLink.ConfigurationDatabase;
|
||||
|
||||
namespace ScadaLink.ConfigurationDatabase.Tests;
|
||||
|
||||
public class ServiceCollectionExtensionsTests
|
||||
{
|
||||
[Fact]
|
||||
public void AddConfigurationDatabase_WithConnectionString_RegistersRepositoriesAndServices()
|
||||
{
|
||||
var services = new ServiceCollection();
|
||||
|
||||
services.AddConfigurationDatabase("DataSource=:memory:");
|
||||
|
||||
Assert.Contains(services, d => d.ServiceType == typeof(ITemplateEngineRepository));
|
||||
Assert.Contains(services, d => d.ServiceType == typeof(IAuditService));
|
||||
Assert.Contains(services, d => d.ServiceType == typeof(IInstanceLocator));
|
||||
}
|
||||
|
||||
// The no-arg overload is [Obsolete(error: true)], so it cannot be referenced directly
|
||||
// from source — that is the compile-time guard. Invoke it via reflection to verify the
|
||||
// runtime defence-in-depth behaviour.
|
||||
private static MethodInfo NoArgOverload =>
|
||||
typeof(ServiceCollectionExtensions).GetMethod(
|
||||
nameof(ServiceCollectionExtensions.AddConfigurationDatabase),
|
||||
BindingFlags.Public | BindingFlags.Static,
|
||||
binder: null,
|
||||
types: new[] { typeof(IServiceCollection) },
|
||||
modifiers: null)!;
|
||||
|
||||
[Fact]
|
||||
public void AddConfigurationDatabase_NoArgOverload_FailsFastWithClearMessage()
|
||||
{
|
||||
// Regression guard for ConfigurationDatabase-003: the parameterless overload must not
|
||||
// silently register nothing. Misuse must surface immediately at wire-up time with an
|
||||
// actionable message — not later as an opaque DI resolution failure.
|
||||
var services = new ServiceCollection();
|
||||
|
||||
var invocation = Assert.Throws<TargetInvocationException>(
|
||||
() => NoArgOverload.Invoke(null, new object[] { services }));
|
||||
|
||||
var ex = Assert.IsType<InvalidOperationException>(invocation.InnerException);
|
||||
Assert.Contains("connection string", ex.Message, StringComparison.OrdinalIgnoreCase);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void AddConfigurationDatabase_NoArgOverload_IsMarkedObsoleteAsError()
|
||||
{
|
||||
// The no-op overload must be flagged so misuse is caught at compile time.
|
||||
var obsolete = NoArgOverload.GetCustomAttribute<ObsoleteAttribute>();
|
||||
Assert.NotNull(obsolete);
|
||||
Assert.True(obsolete!.IsError);
|
||||
}
|
||||
}
|
||||
@@ -372,15 +372,24 @@ public class ServiceRegistrationTests
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void AddConfigurationDatabase_NoArgs_DoesNotThrow()
|
||||
public void AddConfigurationDatabase_NoArgs_FailsFast()
|
||||
{
|
||||
// ConfigurationDatabase-003: the no-arg overload previously silently registered
|
||||
// nothing, which deferred a misconfiguration into an opaque DI failure later.
|
||||
// It is now [Obsolete(error: true)] (compile-time guard) and throws at runtime.
|
||||
// Invoked via reflection because the obsolete-error overload cannot be called
|
||||
// directly from source.
|
||||
var method = typeof(ServiceCollectionExtensions).GetMethod(
|
||||
nameof(ServiceCollectionExtensions.AddConfigurationDatabase),
|
||||
System.Reflection.BindingFlags.Public | System.Reflection.BindingFlags.Static,
|
||||
binder: null,
|
||||
types: new[] { typeof(IServiceCollection) },
|
||||
modifiers: null)!;
|
||||
var services = new ServiceCollection();
|
||||
services.AddConfigurationDatabase();
|
||||
|
||||
// Should not register DbContext (no-op for backward compatibility)
|
||||
var provider = services.BuildServiceProvider();
|
||||
var context = provider.GetService<ScadaLinkDbContext>();
|
||||
Assert.Null(context);
|
||||
var invocation = Assert.Throws<System.Reflection.TargetInvocationException>(
|
||||
() => method.Invoke(null, new object[] { services }));
|
||||
Assert.IsType<InvalidOperationException>(invocation.InnerException);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -612,6 +612,197 @@ public class DataConnectionActorTests : TestKit
|
||||
Assert.Contains("timeout", response.ErrorMessage, StringComparison.OrdinalIgnoreCase);
|
||||
}
|
||||
|
||||
// ── DataConnectionLayer-006: quality counters must not drift after unsubscribe/reconnect ──
|
||||
|
||||
[Fact]
|
||||
public async Task DCL006_DisconnectAfterUnsubscribe_BadQualityCountMatchesRemainingTags()
|
||||
{
|
||||
// Regression test for DataConnectionLayer-006. _lastTagQuality and the three
|
||||
// quality counters were never cleaned up on unsubscribe, so a tag removed via
|
||||
// HandleUnsubscribe lingered in _lastTagQuality. PushBadQualityForAllTags then
|
||||
// set _tagsBadQuality = _lastTagQuality.Count, counting the dropped tag and
|
||||
// drifting the bad-quality count above the number of currently subscribed tags.
|
||||
var callbacks = new System.Collections.Concurrent.ConcurrentDictionary<string, SubscriptionCallback>();
|
||||
var connectCount = 0;
|
||||
var reconnectGate = new TaskCompletionSource();
|
||||
// First connect succeeds; the reconnect after the disconnect hangs so the actor
|
||||
// stays in Reconnecting and ReSubscribeAll does not run before the assertion.
|
||||
_mockAdapter.ConnectAsync(Arg.Any<IDictionary<string, string>>(), Arg.Any<CancellationToken>())
|
||||
.Returns(_ => Interlocked.Increment(ref connectCount) == 1
|
||||
? Task.CompletedTask
|
||||
: reconnectGate.Task);
|
||||
_mockAdapter.Status.Returns(ConnectionHealth.Connected);
|
||||
_mockAdapter.SubscribeAsync(Arg.Any<string>(), Arg.Any<SubscriptionCallback>(), Arg.Any<CancellationToken>())
|
||||
.Returns(ci =>
|
||||
{
|
||||
callbacks[(string)ci[0]] = (SubscriptionCallback)ci[1];
|
||||
return Task.FromResult("sub-" + (string)ci[0]);
|
||||
});
|
||||
_mockAdapter.ReadAsync(Arg.Any<string>(), Arg.Any<CancellationToken>())
|
||||
.Returns(new ReadResult(false, null, null));
|
||||
|
||||
var actor = CreateConnectionActor("dcl006-drift");
|
||||
await Task.Delay(300);
|
||||
|
||||
// Two instances, one tag each.
|
||||
actor.Tell(new SubscribeTagsRequest("c1", "instA", "dcl006-drift", ["tagA"], DateTimeOffset.UtcNow));
|
||||
ExpectMsg<SubscribeTagsResponse>(TimeSpan.FromSeconds(5));
|
||||
actor.Tell(new SubscribeTagsRequest("c2", "instB", "dcl006-drift", ["tagB"], DateTimeOffset.UtcNow));
|
||||
ExpectMsg<SubscribeTagsResponse>(TimeSpan.FromSeconds(5));
|
||||
|
||||
// Push a Good value for each tag so both land in _lastTagQuality.
|
||||
AwaitCondition(() => callbacks.ContainsKey("tagA") && callbacks.ContainsKey("tagB"),
|
||||
TimeSpan.FromSeconds(3));
|
||||
callbacks["tagA"]("tagA", new TagValue(1, QualityCode.Good, DateTimeOffset.UtcNow));
|
||||
callbacks["tagB"]("tagB", new TagValue(2, QualityCode.Good, DateTimeOffset.UtcNow));
|
||||
await Task.Delay(200);
|
||||
|
||||
// Unsubscribe instance B — tagB is no longer subscribed by anyone.
|
||||
actor.Tell(new UnsubscribeTagsRequest("c3", "instB", "dcl006-drift", DateTimeOffset.UtcNow));
|
||||
await Task.Delay(200);
|
||||
|
||||
_mockHealthCollector.ClearReceivedCalls();
|
||||
|
||||
// Disconnect — PushBadQualityForAllTags runs (the reconnect hangs on the gate,
|
||||
// so the actor stays in Reconnecting and ReSubscribeAll does not run).
|
||||
RaiseDisconnected(_mockAdapter);
|
||||
await Task.Delay(300);
|
||||
|
||||
// PushBadQualityForAllTags must report exactly 1 bad tag (only tagA is still
|
||||
// subscribed). Pre-fix tagB lingered in _lastTagQuality and bad was reported as 2.
|
||||
var qualityCall = _mockHealthCollector.ReceivedCalls()
|
||||
.Where(c => c.GetMethodInfo().Name == "UpdateTagQuality")
|
||||
.FirstOrDefault();
|
||||
Assert.NotNull(qualityCall);
|
||||
var args = qualityCall!.GetArguments();
|
||||
var bad = (int)args[2]!;
|
||||
Assert.Equal(1, bad);
|
||||
|
||||
reconnectGate.SetCanceled();
|
||||
}
|
||||
|
||||
// ── DataConnectionLayer-010: tag-resolution retry must not double-dispatch ──
|
||||
|
||||
[Fact]
|
||||
public async Task DCL010_TagResolutionRetry_DoesNotIssueDuplicateConcurrentSubscribes()
|
||||
{
|
||||
// Regression test for DataConnectionLayer-010. HandleRetryTagResolution fired a
|
||||
// SubscribeAsync for every unresolved tag without removing it from _unresolvedTags
|
||||
// first. A slow SubscribeAsync overlapping the next retry tick produced duplicate
|
||||
// concurrent subscribe attempts for the same tag, leaking the first monitored
|
||||
// item / subscription id. After the fix a tag in flight is excluded from the
|
||||
// next retry until its attempt completes.
|
||||
_options.TagResolutionRetryInterval = TimeSpan.FromMilliseconds(100);
|
||||
|
||||
var subscribeGate = new TaskCompletionSource<string>();
|
||||
var subscribeCalls = 0;
|
||||
_mockAdapter.ConnectAsync(Arg.Any<IDictionary<string, string>>(), Arg.Any<CancellationToken>())
|
||||
.Returns(Task.CompletedTask);
|
||||
_mockAdapter.Status.Returns(ConnectionHealth.Connected);
|
||||
_mockAdapter.SubscribeAsync("slow/tag", Arg.Any<SubscriptionCallback>(), Arg.Any<CancellationToken>())
|
||||
.Returns(ci =>
|
||||
{
|
||||
var n = Interlocked.Increment(ref subscribeCalls);
|
||||
// First call (initial subscribe) fails genuinely → unresolved.
|
||||
if (n == 1) return Task.FromException<string>(new KeyNotFoundException("not found yet"));
|
||||
// Subsequent calls are retry attempts — block on the gate so they stay
|
||||
// in flight across multiple retry ticks.
|
||||
return subscribeGate.Task;
|
||||
});
|
||||
_mockAdapter.ReadAsync(Arg.Any<string>(), Arg.Any<CancellationToken>())
|
||||
.Returns(new ReadResult(false, null, null));
|
||||
|
||||
var actor = CreateConnectionActor("dcl010-retry");
|
||||
await Task.Delay(300);
|
||||
|
||||
actor.Tell(new SubscribeTagsRequest("c1", "inst1", "dcl010-retry", ["slow/tag"], DateTimeOffset.UtcNow));
|
||||
// Initial subscribe fails → bad-quality push then ack.
|
||||
ExpectMsg<TagValueUpdate>(TimeSpan.FromSeconds(5));
|
||||
ExpectMsg<SubscribeTagsResponse>(TimeSpan.FromSeconds(5));
|
||||
|
||||
// Let several retry ticks (100ms each) elapse while the first retry is blocked.
|
||||
await Task.Delay(600);
|
||||
|
||||
// Exactly one retry attempt should be in flight: 1 initial + 1 retry = 2 total.
|
||||
// Pre-fix, every 100ms tick dispatched another → far more than 2.
|
||||
Assert.Equal(2, Volatile.Read(ref subscribeCalls));
|
||||
|
||||
subscribeGate.SetCanceled();
|
||||
}
|
||||
|
||||
// ── DataConnectionLayer-011: stale callbacks from a disposed adapter must be dropped ──
|
||||
|
||||
[Fact]
|
||||
public async Task DCL011_StaleTagValueFromOldAdapter_IsNotForwardedAfterFailover()
|
||||
{
|
||||
// Regression test for DataConnectionLayer-011. On failover the old adapter is
|
||||
// disposed and a fresh one created, but the old adapter's subscription callbacks
|
||||
// captured Self and keep Telling TagValueReceived. With no per-adapter generation
|
||||
// tag, a value from the disposed adapter delivered after the actor is Connected
|
||||
// on the new endpoint would be forwarded to the Instance Actor, mixing
|
||||
// pre-failover device data with the active endpoint's data.
|
||||
var primaryConfig = new Dictionary<string, string> { ["Endpoint"] = "opc.tcp://primary:4840" };
|
||||
var backupConfig = new Dictionary<string, string> { ["Endpoint"] = "opc.tcp://backup:4840" };
|
||||
var primaryAdapter = Substitute.For<IDataConnection>();
|
||||
var backupAdapter = Substitute.For<IDataConnection>();
|
||||
|
||||
SubscriptionCallback? primaryCallback = null;
|
||||
|
||||
var primaryConnectCount = 0;
|
||||
primaryAdapter.ConnectAsync(Arg.Any<IDictionary<string, string>>(), Arg.Any<CancellationToken>())
|
||||
.Returns(_ => Interlocked.Increment(ref primaryConnectCount) == 1
|
||||
? Task.CompletedTask
|
||||
: Task.FromException(new Exception("Primary down")));
|
||||
primaryAdapter.Status.Returns(ConnectionHealth.Connected);
|
||||
primaryAdapter.SubscribeAsync("sensor/temp", Arg.Any<SubscriptionCallback>(), Arg.Any<CancellationToken>())
|
||||
.Returns(ci =>
|
||||
{
|
||||
primaryCallback = (SubscriptionCallback)ci[1];
|
||||
return Task.FromResult("sub-primary");
|
||||
});
|
||||
primaryAdapter.ReadAsync(Arg.Any<string>(), Arg.Any<CancellationToken>())
|
||||
.Returns(new ReadResult(false, null, null));
|
||||
|
||||
_mockFactory.Create("OpcUa", Arg.Is<IDictionary<string, string>>(d => d["Endpoint"] == "opc.tcp://backup:4840"))
|
||||
.Returns(backupAdapter);
|
||||
backupAdapter.ConnectAsync(Arg.Any<IDictionary<string, string>>(), Arg.Any<CancellationToken>())
|
||||
.Returns(Task.CompletedTask);
|
||||
backupAdapter.Status.Returns(ConnectionHealth.Connected);
|
||||
backupAdapter.SubscribeAsync(Arg.Any<string>(), Arg.Any<SubscriptionCallback>(), Arg.Any<CancellationToken>())
|
||||
.Returns("sub-backup");
|
||||
backupAdapter.ReadAsync(Arg.Any<string>(), Arg.Any<CancellationToken>())
|
||||
.Returns(new ReadResult(false, null, null));
|
||||
|
||||
var actor = CreateFailoverActor(primaryAdapter, "dcl011-stale", primaryConfig, backupConfig, failoverRetryCount: 1);
|
||||
|
||||
AwaitCondition(() => primaryConnectCount >= 1, TimeSpan.FromSeconds(2));
|
||||
await Task.Delay(200);
|
||||
|
||||
actor.Tell(new SubscribeTagsRequest("c1", TestActor.Path.Name, "dcl011-stale", ["sensor/temp"], DateTimeOffset.UtcNow));
|
||||
ExpectMsg<SubscribeTagsResponse>(TimeSpan.FromSeconds(3));
|
||||
AwaitCondition(() => primaryCallback != null, TimeSpan.FromSeconds(3));
|
||||
|
||||
// Fail over to backup.
|
||||
RaiseDisconnected(primaryAdapter);
|
||||
|
||||
// The disconnect pushes a bad-quality ConnectionQualityChanged to the subscriber.
|
||||
ExpectMsg<ConnectionQualityChanged>(TimeSpan.FromSeconds(3));
|
||||
|
||||
AwaitCondition(() =>
|
||||
backupAdapter.ReceivedCalls().Any(c => c.GetMethodInfo().Name == "SubscribeAsync"),
|
||||
TimeSpan.FromSeconds(5));
|
||||
await Task.Delay(300); // actor is Connected on backup
|
||||
|
||||
// Drain any value updates produced by the re-subscribe path.
|
||||
ExpectNoMsg(TimeSpan.FromMilliseconds(300));
|
||||
|
||||
// The disposed primary adapter's callback fires a stale value.
|
||||
primaryCallback!("sensor/temp", new TagValue(999, QualityCode.Good, DateTimeOffset.UtcNow));
|
||||
|
||||
// That stale value must NOT reach the subscriber.
|
||||
ExpectNoMsg(TimeSpan.FromSeconds(1));
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task DCL001_SubscribeWithFailedTags_CountsResolvedAndUnresolvedSeparately()
|
||||
{
|
||||
|
||||
@@ -37,6 +37,44 @@ public class RealOpcUaClientThreadSafetyTests
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// DataConnectionLayer-012: secure-by-default certificate handling.
|
||||
/// </summary>
|
||||
public class OpcUaCertificateDefaultTests
|
||||
{
|
||||
[Fact]
|
||||
public void DCL012_OpcUaConnectionOptions_AutoAcceptUntrustedCerts_DefaultsToFalse()
|
||||
{
|
||||
// Regression test for DataConnectionLayer-012. AutoAcceptUntrustedCerts defaulted
|
||||
// to true, accepting every server certificate unconditionally and defeating the
|
||||
// Sign / SignAndEncrypt security modes against an active man-in-the-middle. A
|
||||
// secure-by-default posture rejects untrusted certs unless explicitly opted in.
|
||||
var options = new OpcUaConnectionOptions();
|
||||
Assert.False(options.AutoAcceptUntrustedCerts);
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// DataConnectionLayer-009: failover-stability tunables must be configurable.
|
||||
/// </summary>
|
||||
public class DataConnectionOptionsStabilityTests
|
||||
{
|
||||
[Fact]
|
||||
public void DCL009_StableConnectionThreshold_IsConfigurable_WithSixtySecondDefault()
|
||||
{
|
||||
// Regression test for DataConnectionLayer-009. The unstable-disconnect failover
|
||||
// path used a hard-coded 60s StableConnectionThreshold constant inside
|
||||
// DataConnectionActor. It must live on DataConnectionOptions like the other
|
||||
// tunables (ReconnectInterval, TagResolutionRetryInterval, WriteTimeout) so it
|
||||
// is configurable via appsettings.json.
|
||||
var options = new DataConnectionOptions();
|
||||
Assert.Equal(TimeSpan.FromSeconds(60), options.StableConnectionThreshold);
|
||||
|
||||
options.StableConnectionThreshold = TimeSpan.FromSeconds(30);
|
||||
Assert.Equal(TimeSpan.FromSeconds(30), options.StableConnectionThreshold);
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// WP-7: Tests for OPC UA adapter.
|
||||
/// </summary>
|
||||
@@ -162,6 +200,36 @@ public class OpcUaDataConnectionTests
|
||||
Assert.All(results.Values, r => Assert.True(r.Success));
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task DCL007_ReadBatch_ReturnsPerTagResults_WhenOneTagFails()
|
||||
{
|
||||
// Regression test for DataConnectionLayer-007. ReadBatchAsync looped calling
|
||||
// ReadAsync per tag; ReadAsync re-throws any non-cancellation exception, so a
|
||||
// single failing tag aborted the whole batch and the caller got NO results for
|
||||
// the tags that did read successfully — even though ReadResult already carries
|
||||
// a per-tag Success/ErrorMessage shape. After the fix the batch catches per-tag
|
||||
// exceptions and returns a complete map.
|
||||
_mockClient.IsConnected.Returns(true);
|
||||
_mockClient.ReadValueAsync("good1", Arg.Any<CancellationToken>())
|
||||
.Returns((1.0, DateTime.UtcNow, 0u));
|
||||
_mockClient.ReadValueAsync("bad", Arg.Any<CancellationToken>())
|
||||
.Returns<(object?, DateTime, uint)>(_ => throw new InvalidOperationException("node not found"));
|
||||
_mockClient.ReadValueAsync("good2", Arg.Any<CancellationToken>())
|
||||
.Returns((2.0, DateTime.UtcNow, 0u));
|
||||
|
||||
await _adapter.ConnectAsync(new Dictionary<string, string>());
|
||||
|
||||
var results = await _adapter.ReadBatchAsync(["good1", "bad", "good2"]);
|
||||
|
||||
// Every requested tag is present in the result map.
|
||||
Assert.Equal(3, results.Count);
|
||||
Assert.True(results["good1"].Success);
|
||||
Assert.True(results["good2"].Success);
|
||||
// The failing tag is reported as a failed ReadResult, not by aborting the batch.
|
||||
Assert.False(results["bad"].Success);
|
||||
Assert.NotNull(results["bad"].ErrorMessage);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task NotConnected_ThrowsOnOperations()
|
||||
{
|
||||
|
||||
@@ -13,6 +13,7 @@ using ScadaLink.Commons.Types;
|
||||
using ScadaLink.Commons.Types.Enums;
|
||||
using ScadaLink.Commons.Types.Flattening;
|
||||
using ScadaLink.Communication;
|
||||
using ScadaLink.TemplateEngine.Flattening;
|
||||
|
||||
namespace ScadaLink.DeploymentManager.Tests;
|
||||
|
||||
@@ -45,7 +46,8 @@ public class DeploymentServiceTests : TestKit
|
||||
|
||||
var siteRepo = Substitute.For<ISiteRepository>();
|
||||
_service = new DeploymentService(
|
||||
_repo, siteRepo, _pipeline, _comms, _lockManager, _audit, options,
|
||||
_repo, siteRepo, _pipeline, _comms, _lockManager, _audit,
|
||||
new DiffService(), options,
|
||||
NullLogger<DeploymentService>.Instance);
|
||||
}
|
||||
|
||||
@@ -276,6 +278,34 @@ public class DeploymentServiceTests : TestKit
|
||||
Assert.Contains("not found", result.Error);
|
||||
}
|
||||
|
||||
// ── DeploymentManager-004: site-success but central-delete-failure must not escape uncaught ──
|
||||
|
||||
[Fact]
|
||||
public async Task DeleteInstanceAsync_SiteSucceeds_CentralDeleteFails_ReturnsDistinctFailure()
|
||||
{
|
||||
// The site destroys the Instance Actor and removes its config (response
|
||||
// Success), but the central record removal throws. The exception must
|
||||
// NOT propagate uncaught -- it must be surfaced as a distinct failure so
|
||||
// an operator can reconcile the orphaned central record.
|
||||
var instance = new Instance("OrphanInst") { Id = 30, SiteId = 1, State = InstanceState.Enabled };
|
||||
_repo.GetInstanceByIdAsync(30, Arg.Any<CancellationToken>()).Returns(instance);
|
||||
|
||||
_repo.DeleteInstanceAsync(30, Arg.Any<CancellationToken>())
|
||||
.Returns<Task>(_ => throw new InvalidOperationException("db unavailable"));
|
||||
|
||||
var commActor = Sys.ActorOf(Props.Create(() =>
|
||||
new ReconcileProbeActor(siteHash: "sha256:x", failQuery: false)));
|
||||
var service = CreateServiceWithCommActor(commActor);
|
||||
|
||||
var result = await service.DeleteInstanceAsync(30, "admin");
|
||||
|
||||
// The failure is surfaced (not thrown) and clearly says the site
|
||||
// succeeded but the central record could not be removed.
|
||||
Assert.True(result.IsFailure);
|
||||
Assert.Contains("site", result.Error, StringComparison.OrdinalIgnoreCase);
|
||||
Assert.Contains("central", result.Error, StringComparison.OrdinalIgnoreCase);
|
||||
}
|
||||
|
||||
// ── WP-8: Deployment comparison ──
|
||||
|
||||
[Fact]
|
||||
@@ -331,6 +361,51 @@ public class DeploymentServiceTests : TestKit
|
||||
Assert.True(result.Value.IsStale);
|
||||
}
|
||||
|
||||
// ── DeploymentManager-007: comparison must produce a structured diff ──
|
||||
|
||||
[Fact]
|
||||
public async Task GetDeploymentComparisonAsync_ProducesStructuredDiff()
|
||||
{
|
||||
// The deployed snapshot has one attribute; the current template-derived
|
||||
// config has a different attribute. The comparison must surface a real
|
||||
// Added/Removed diff via the TemplateEngine DiffService, not just a
|
||||
// boolean staleness flag.
|
||||
var deployedConfig = new FlattenedConfiguration
|
||||
{
|
||||
InstanceUniqueName = "DiffInst",
|
||||
Attributes = [new ResolvedAttribute { CanonicalName = "OldAttr", DataType = "Int" }]
|
||||
};
|
||||
var snapshot = new DeployedConfigSnapshot(
|
||||
"dep1", "sha256:old", System.Text.Json.JsonSerializer.Serialize(deployedConfig))
|
||||
{
|
||||
InstanceId = 40,
|
||||
DeployedAt = DateTimeOffset.UtcNow
|
||||
};
|
||||
_repo.GetDeployedSnapshotByInstanceIdAsync(40, Arg.Any<CancellationToken>()).Returns(snapshot);
|
||||
|
||||
var currentConfig = new FlattenedConfiguration
|
||||
{
|
||||
InstanceUniqueName = "DiffInst",
|
||||
Attributes = [new ResolvedAttribute { CanonicalName = "NewAttr", DataType = "Int" }]
|
||||
};
|
||||
_pipeline.FlattenAndValidateAsync(40, Arg.Any<CancellationToken>())
|
||||
.Returns(Result<FlatteningPipelineResult>.Success(
|
||||
new FlatteningPipelineResult(currentConfig, "sha256:new", ValidationResult.Success())));
|
||||
|
||||
var result = await _service.GetDeploymentComparisonAsync(40);
|
||||
|
||||
Assert.True(result.IsSuccess);
|
||||
Assert.True(result.Value.IsStale);
|
||||
|
||||
// A structured diff is present with the added and removed attributes.
|
||||
Assert.NotNull(result.Value.Diff);
|
||||
Assert.True(result.Value.Diff!.HasChanges);
|
||||
Assert.Contains(result.Value.Diff.AttributeChanges,
|
||||
c => c.CanonicalName == "NewAttr" && c.ChangeType == DiffChangeType.Added);
|
||||
Assert.Contains(result.Value.Diff.AttributeChanges,
|
||||
c => c.CanonicalName == "OldAttr" && c.ChangeType == DiffChangeType.Removed);
|
||||
}
|
||||
|
||||
// ── WP-2: GetDeploymentStatusAsync ──
|
||||
|
||||
[Fact]
|
||||
@@ -352,8 +427,11 @@ public class DeploymentServiceTests : TestKit
|
||||
// ── Audit logging ──
|
||||
|
||||
[Fact]
|
||||
public async Task DeployInstanceAsync_AuditLogs()
|
||||
public async Task DeployInstanceAsync_FlatteningFails_DoesNotReachAudit()
|
||||
{
|
||||
// DeploymentManager-011: this test previously asserted nothing. A
|
||||
// flatten failure returns before any site communication, so no audit
|
||||
// entry is written.
|
||||
var instance = new Instance("TestInst") { Id = 1, SiteId = 1, State = InstanceState.NotDeployed };
|
||||
_repo.GetInstanceByIdAsync(1).Returns(instance);
|
||||
|
||||
@@ -362,8 +440,120 @@ public class DeploymentServiceTests : TestKit
|
||||
|
||||
await _service.DeployInstanceAsync(1, "admin");
|
||||
|
||||
// Failure case does not reach audit (returns before communication)
|
||||
// The audit is only logged after communication succeeds/fails
|
||||
await _audit.DidNotReceive().LogAsync(
|
||||
Arg.Any<string>(), Arg.Any<string>(), Arg.Any<string>(), Arg.Any<string>(),
|
||||
Arg.Any<string>(), Arg.Any<object>(), Arg.Any<CancellationToken>());
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task DeployInstanceAsync_SiteSucceeds_WritesDeployAuditEntry()
|
||||
{
|
||||
// DeploymentManager-011: a successful deployment must write a "Deploy"
|
||||
// audit entry referencing the deployed instance.
|
||||
var instance = new Instance("AuditInst") { Id = 50, SiteId = 1, State = InstanceState.NotDeployed };
|
||||
_repo.GetInstanceByIdAsync(50, Arg.Any<CancellationToken>()).Returns(instance);
|
||||
SetupValidPipeline(50, "AuditInst", "sha256:target");
|
||||
_repo.GetCurrentDeploymentStatusAsync(50, Arg.Any<CancellationToken>())
|
||||
.Returns((DeploymentRecord?)null);
|
||||
|
||||
var commActor = Sys.ActorOf(Props.Create(() =>
|
||||
new ReconcileProbeActor(siteHash: "sha256:target", failQuery: false)));
|
||||
var service = CreateServiceWithCommActor(commActor);
|
||||
|
||||
var result = await service.DeployInstanceAsync(50, "admin");
|
||||
|
||||
Assert.True(result.IsSuccess);
|
||||
await _audit.Received().LogAsync(
|
||||
"admin", "Deploy", "Instance", "50", "AuditInst",
|
||||
Arg.Any<object>(), Arg.Any<CancellationToken>());
|
||||
}
|
||||
|
||||
// ── DeploymentManager-011: lifecycle success paths ──
|
||||
|
||||
[Fact]
|
||||
public async Task DisableInstanceAsync_SiteSucceeds_SetsDisabledStateAndAudits()
|
||||
{
|
||||
var instance = new Instance("DisInst") { Id = 51, SiteId = 1, State = InstanceState.Enabled };
|
||||
_repo.GetInstanceByIdAsync(51, Arg.Any<CancellationToken>()).Returns(instance);
|
||||
|
||||
var commActor = Sys.ActorOf(Props.Create(() =>
|
||||
new ReconcileProbeActor(siteHash: "x", failQuery: false)));
|
||||
var service = CreateServiceWithCommActor(commActor);
|
||||
|
||||
var result = await service.DisableInstanceAsync(51, "admin");
|
||||
|
||||
Assert.True(result.IsSuccess);
|
||||
Assert.Equal(InstanceState.Disabled, instance.State);
|
||||
await _repo.Received().UpdateInstanceAsync(instance, Arg.Any<CancellationToken>());
|
||||
await _audit.Received().LogAsync(
|
||||
"admin", "Disable", "Instance", "51", "DisInst",
|
||||
Arg.Any<object>(), Arg.Any<CancellationToken>());
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task EnableInstanceAsync_SiteSucceeds_SetsEnabledStateAndAudits()
|
||||
{
|
||||
var instance = new Instance("EnInst") { Id = 52, SiteId = 1, State = InstanceState.Disabled };
|
||||
_repo.GetInstanceByIdAsync(52, Arg.Any<CancellationToken>()).Returns(instance);
|
||||
|
||||
var commActor = Sys.ActorOf(Props.Create(() =>
|
||||
new ReconcileProbeActor(siteHash: "x", failQuery: false)));
|
||||
var service = CreateServiceWithCommActor(commActor);
|
||||
|
||||
var result = await service.EnableInstanceAsync(52, "admin");
|
||||
|
||||
Assert.True(result.IsSuccess);
|
||||
Assert.Equal(InstanceState.Enabled, instance.State);
|
||||
await _repo.Received().UpdateInstanceAsync(instance, Arg.Any<CancellationToken>());
|
||||
await _audit.Received().LogAsync(
|
||||
"admin", "Enable", "Instance", "52", "EnInst",
|
||||
Arg.Any<object>(), Arg.Any<CancellationToken>());
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task DeleteInstanceAsync_SiteSucceeds_RemovesRecordAndAudits()
|
||||
{
|
||||
var instance = new Instance("DelInst") { Id = 53, SiteId = 1, State = InstanceState.Enabled };
|
||||
_repo.GetInstanceByIdAsync(53, Arg.Any<CancellationToken>()).Returns(instance);
|
||||
|
||||
var commActor = Sys.ActorOf(Props.Create(() =>
|
||||
new ReconcileProbeActor(siteHash: "x", failQuery: false)));
|
||||
var service = CreateServiceWithCommActor(commActor);
|
||||
|
||||
var result = await service.DeleteInstanceAsync(53, "admin");
|
||||
|
||||
Assert.True(result.IsSuccess);
|
||||
await _repo.Received().DeleteInstanceAsync(53, Arg.Any<CancellationToken>());
|
||||
await _audit.Received().LogAsync(
|
||||
"admin", "Delete", "Instance", "53", "DelInst",
|
||||
Arg.Any<object>(), Arg.Any<CancellationToken>());
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task DeployInstanceAsync_SameInstance_OperationLockSerializesConcurrentDeploys()
|
||||
{
|
||||
// DeploymentManager-011: two concurrent deploys of the SAME instance
|
||||
// must be serialized by the per-instance operation lock — the site sees
|
||||
// them one at a time, never overlapping.
|
||||
var instance = new Instance("LockInst") { Id = 54, SiteId = 1, State = InstanceState.Enabled };
|
||||
_repo.GetInstanceByIdAsync(54, Arg.Any<CancellationToken>()).Returns(instance);
|
||||
SetupValidPipeline(54, "LockInst", "sha256:target");
|
||||
_repo.GetCurrentDeploymentStatusAsync(54, Arg.Any<CancellationToken>())
|
||||
.Returns((DeploymentRecord?)null);
|
||||
|
||||
var commActor = Sys.ActorOf(Props.Create(() =>
|
||||
new SerializationProbeActor()));
|
||||
var service = CreateServiceWithCommActor(commActor);
|
||||
|
||||
var deploy1 = service.DeployInstanceAsync(54, "admin");
|
||||
var deploy2 = service.DeployInstanceAsync(54, "admin");
|
||||
var results = await Task.WhenAll(deploy1, deploy2);
|
||||
|
||||
Assert.True(results[0].IsSuccess);
|
||||
Assert.True(results[1].IsSuccess);
|
||||
// The probe records the maximum concurrency observed; the lock must
|
||||
// keep it at 1 for a single instance.
|
||||
Assert.Equal(1, SerializationProbeActor.MaxConcurrent);
|
||||
}
|
||||
|
||||
// ── DeploymentManager-006: query-the-site-before-redeploy idempotency ──
|
||||
@@ -386,6 +576,7 @@ public class DeploymentServiceTests : TestKit
|
||||
var siteRepo = Substitute.For<ISiteRepository>();
|
||||
return new DeploymentService(
|
||||
_repo, siteRepo, _pipeline, comms, _lockManager, _audit,
|
||||
new DiffService(),
|
||||
Options.Create(new DeploymentManagerOptions { OperationLockTimeout = TimeSpan.FromSeconds(5) }),
|
||||
NullLogger<DeploymentService>.Instance);
|
||||
}
|
||||
@@ -572,6 +763,109 @@ public class DeploymentServiceTests : TestKit
|
||||
Assert.Equal(1, ReconcileProbeActor.DeployCount);
|
||||
}
|
||||
|
||||
// ── DeploymentManager-003: post-success persistence must commit the Success status ──
|
||||
|
||||
[Fact]
|
||||
public async Task DeployInstanceAsync_SiteSucceeds_SnapshotWriteFails_RecordStillCommittedSuccess()
|
||||
{
|
||||
// The site applies the deployment (response Success), but storing the
|
||||
// deployed-config snapshot afterwards throws. The deployment record's
|
||||
// Success status MUST still be durably committed -- otherwise central
|
||||
// and site diverge: the site runs the new config while central shows a
|
||||
// non-Success record forever.
|
||||
var instance = new Instance("SnapFailInst") { Id = 20, SiteId = 1, State = InstanceState.NotDeployed };
|
||||
_repo.GetInstanceByIdAsync(20, Arg.Any<CancellationToken>()).Returns(instance);
|
||||
SetupValidPipeline(20, "SnapFailInst", "sha256:target");
|
||||
_repo.GetCurrentDeploymentStatusAsync(20, Arg.Any<CancellationToken>())
|
||||
.Returns((DeploymentRecord?)null);
|
||||
|
||||
DeploymentRecord? captured = null;
|
||||
await _repo.AddDeploymentRecordAsync(
|
||||
Arg.Do<DeploymentRecord>(r => captured = r), Arg.Any<CancellationToken>());
|
||||
|
||||
// The snapshot store throws.
|
||||
_repo.GetDeployedSnapshotByInstanceIdAsync(20, Arg.Any<CancellationToken>())
|
||||
.Returns((DeployedConfigSnapshot?)null);
|
||||
_repo.AddDeployedSnapshotAsync(Arg.Any<DeployedConfigSnapshot>(), Arg.Any<CancellationToken>())
|
||||
.Returns<Task>(_ => throw new InvalidOperationException("snapshot store unavailable"));
|
||||
|
||||
var commActor = Sys.ActorOf(Props.Create(() =>
|
||||
new ReconcileProbeActor(siteHash: "sha256:target", failQuery: false)));
|
||||
var service = CreateServiceWithCommActor(commActor);
|
||||
|
||||
var result = await service.DeployInstanceAsync(20, "admin");
|
||||
|
||||
// The site succeeded -> the deployment is reported successful.
|
||||
Assert.True(result.IsSuccess);
|
||||
Assert.NotNull(captured);
|
||||
Assert.Equal(DeploymentStatus.Success, captured!.Status);
|
||||
|
||||
// The Success status was committed (a SaveChanges happened with the
|
||||
// record in Success state) BEFORE the snapshot write was attempted.
|
||||
await _repo.Received().UpdateDeploymentRecordAsync(
|
||||
Arg.Is<DeploymentRecord>(r => r.Status == DeploymentStatus.Success),
|
||||
Arg.Any<CancellationToken>());
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Stand-in CentralCommunicationActor that measures deploy concurrency. It
|
||||
/// defers each deploy reply via the scheduler, so if two deploys for the
|
||||
/// same instance were NOT serialized by the operation lock their windows
|
||||
/// would overlap and <see cref="MaxConcurrent"/> would exceed 1.
|
||||
/// </summary>
|
||||
private class SerializationProbeActor : ReceiveActor, IWithTimers
|
||||
{
|
||||
public static int MaxConcurrent;
|
||||
private static int _current;
|
||||
private static readonly object Gate = new();
|
||||
|
||||
public ITimerScheduler Timers { get; set; } = null!;
|
||||
|
||||
public SerializationProbeActor()
|
||||
{
|
||||
MaxConcurrent = 0;
|
||||
_current = 0;
|
||||
|
||||
Receive<SiteEnvelope>(env =>
|
||||
{
|
||||
if (env.Message is DeployInstanceCommand d)
|
||||
{
|
||||
lock (Gate)
|
||||
{
|
||||
_current++;
|
||||
if (_current > MaxConcurrent) MaxConcurrent = _current;
|
||||
}
|
||||
|
||||
var replyTo = Sender;
|
||||
// Defer the reply so the deploy "window" stays open long
|
||||
// enough for a non-serialized second deploy to overlap.
|
||||
Timers.StartSingleTimer(
|
||||
d.DeploymentId,
|
||||
new CompleteDeploy(d, replyTo),
|
||||
TimeSpan.FromMilliseconds(150));
|
||||
}
|
||||
else if (env.Message is DeploymentStateQueryRequest q)
|
||||
{
|
||||
Sender.Tell(new DeploymentStateQueryResponse(
|
||||
q.CorrelationId, q.InstanceUniqueName, false, null, null, DateTimeOffset.UtcNow));
|
||||
}
|
||||
});
|
||||
|
||||
Receive<CompleteDeploy>(c =>
|
||||
{
|
||||
lock (Gate)
|
||||
{
|
||||
_current--;
|
||||
}
|
||||
c.ReplyTo.Tell(new DeploymentStatusResponse(
|
||||
c.Command.DeploymentId, c.Command.InstanceUniqueName,
|
||||
DeploymentStatus.Success, null, DateTimeOffset.UtcNow));
|
||||
});
|
||||
}
|
||||
|
||||
private sealed record CompleteDeploy(DeployInstanceCommand Command, IActorRef ReplyTo);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Stand-in CentralCommunicationActor for reconciliation tests. Counts the
|
||||
/// site queries and deploy commands it receives, answers queries with a
|
||||
@@ -610,6 +904,21 @@ public class DeploymentServiceTests : TestKit
|
||||
d.DeploymentId, d.InstanceUniqueName,
|
||||
DeploymentStatus.Success, null, DateTimeOffset.UtcNow));
|
||||
break;
|
||||
|
||||
case DisableInstanceCommand dis:
|
||||
Sender.Tell(new InstanceLifecycleResponse(
|
||||
dis.CommandId, dis.InstanceUniqueName, true, null, DateTimeOffset.UtcNow));
|
||||
break;
|
||||
|
||||
case EnableInstanceCommand en:
|
||||
Sender.Tell(new InstanceLifecycleResponse(
|
||||
en.CommandId, en.InstanceUniqueName, true, null, DateTimeOffset.UtcNow));
|
||||
break;
|
||||
|
||||
case DeleteInstanceCommand del:
|
||||
Sender.Tell(new InstanceLifecycleResponse(
|
||||
del.CommandId, del.InstanceUniqueName, true, null, DateTimeOffset.UtcNow));
|
||||
break;
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
@@ -92,4 +92,52 @@ public class OperationLockManagerTests
|
||||
|
||||
await Task.WhenAll(tasks);
|
||||
}
|
||||
|
||||
// ── DeploymentManager-005: semaphore must not leak ──
|
||||
|
||||
[Fact]
|
||||
public async Task AcquireAsync_ReleasedLock_RemovesSemaphoreEntry()
|
||||
{
|
||||
// A semaphore that is created, used, and fully released must not be
|
||||
// retained — otherwise every distinct instance name leaks a
|
||||
// SemaphoreSlim (a kernel handle) for the life of the process.
|
||||
using (await _lockManager.AcquireAsync("transient-inst", TimeSpan.FromSeconds(5)))
|
||||
{
|
||||
Assert.Equal(1, _lockManager.TrackedLockCount);
|
||||
}
|
||||
|
||||
// After release, the entry is reclaimed.
|
||||
Assert.Equal(0, _lockManager.TrackedLockCount);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task AcquireAsync_ManyDistinctInstances_DoesNotAccumulateSemaphores()
|
||||
{
|
||||
// Simulates the long-running central process: many instances are
|
||||
// deployed/disabled over time. Their semaphores must be reclaimed.
|
||||
for (var i = 0; i < 100; i++)
|
||||
{
|
||||
using var handle = await _lockManager.AcquireAsync($"churn-{i}", TimeSpan.FromSeconds(5));
|
||||
}
|
||||
|
||||
Assert.Equal(0, _lockManager.TrackedLockCount);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task AcquireAsync_ContendedLock_KeepsSemaphoreUntilLastReleaseThenReclaims()
|
||||
{
|
||||
// While a second caller is waiting, the semaphore must survive the
|
||||
// first release; only when the last holder releases is it reclaimed.
|
||||
var first = await _lockManager.AcquireAsync("contended", TimeSpan.FromSeconds(5));
|
||||
|
||||
var secondAcquire = _lockManager.AcquireAsync("contended", TimeSpan.FromSeconds(5));
|
||||
|
||||
first.Dispose(); // hands the lock to the waiter; entry must NOT be removed
|
||||
var second = await secondAcquire;
|
||||
|
||||
Assert.Equal(1, _lockManager.TrackedLockCount);
|
||||
second.Dispose();
|
||||
|
||||
Assert.Equal(0, _lockManager.TrackedLockCount);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -0,0 +1,49 @@
|
||||
using Microsoft.Extensions.Configuration;
|
||||
using Microsoft.Extensions.DependencyInjection;
|
||||
using Microsoft.Extensions.Options;
|
||||
|
||||
namespace ScadaLink.DeploymentManager.Tests;
|
||||
|
||||
/// <summary>
|
||||
/// DeploymentManager-008: DeploymentManagerOptions must be bound to the
|
||||
/// "ScadaLink:DeploymentManager" configuration section, consistent with the
|
||||
/// Options-pattern convention used by the other components.
|
||||
/// </summary>
|
||||
public class ServiceCollectionExtensionsTests
|
||||
{
|
||||
[Fact]
|
||||
public void AddDeploymentManager_WithConfiguration_BindsDeploymentManagerOptions()
|
||||
{
|
||||
var configuration = new ConfigurationBuilder()
|
||||
.AddInMemoryCollection(new Dictionary<string, string?>
|
||||
{
|
||||
["ScadaLink:DeploymentManager:OperationLockTimeout"] = "00:00:09",
|
||||
["ScadaLink:DeploymentManager:ArtifactDeploymentTimeoutPerSite"] = "00:03:00"
|
||||
})
|
||||
.Build();
|
||||
|
||||
var services = new ServiceCollection();
|
||||
services.AddDeploymentManager(configuration);
|
||||
|
||||
using var provider = services.BuildServiceProvider();
|
||||
var options = provider.GetRequiredService<IOptions<DeploymentManagerOptions>>().Value;
|
||||
|
||||
Assert.Equal(TimeSpan.FromSeconds(9), options.OperationLockTimeout);
|
||||
Assert.Equal(TimeSpan.FromMinutes(3), options.ArtifactDeploymentTimeoutPerSite);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void AddDeploymentManager_WithConfiguration_MissingSection_UsesDefaults()
|
||||
{
|
||||
var configuration = new ConfigurationBuilder().Build();
|
||||
|
||||
var services = new ServiceCollection();
|
||||
services.AddDeploymentManager(configuration);
|
||||
|
||||
using var provider = services.BuildServiceProvider();
|
||||
var options = provider.GetRequiredService<IOptions<DeploymentManagerOptions>>().Value;
|
||||
|
||||
// No section present -> the option-class defaults are retained.
|
||||
Assert.Equal(TimeSpan.FromSeconds(5), options.OperationLockTimeout);
|
||||
}
|
||||
}
|
||||
@@ -1,3 +1,5 @@
|
||||
using System.Data;
|
||||
using System.Data.Common;
|
||||
using Microsoft.Extensions.Logging.Abstractions;
|
||||
using NSubstitute;
|
||||
using ScadaLink.Commons.Entities.ExternalSystems;
|
||||
@@ -75,4 +77,66 @@ public class DatabaseGatewayTests
|
||||
|
||||
Assert.False(delivered); // permanent — the S&F engine parks the message
|
||||
}
|
||||
|
||||
// ── ExternalSystemGateway-010: SqlConnection must not leak when OpenAsync fails ──
|
||||
|
||||
[Fact]
|
||||
public async Task GetConnection_OpenFails_DisposesConnectionBeforeRethrowing()
|
||||
{
|
||||
var conn = new DatabaseConnectionDefinition("testDb", "Server=localhost;Database=test") { Id = 1 };
|
||||
_repository.GetAllDatabaseConnectionsAsync().Returns(new List<DatabaseConnectionDefinition> { conn });
|
||||
|
||||
var fake = new ThrowingDbConnection();
|
||||
var gateway = new ConnectionFactoryStubGateway(_repository, fake);
|
||||
|
||||
await Assert.ThrowsAsync<InvalidOperationException>(
|
||||
() => gateway.GetConnectionAsync("testDb"));
|
||||
|
||||
Assert.True(fake.WasDisposed, "The SqlConnection was leaked — it must be disposed when OpenAsync fails");
|
||||
}
|
||||
|
||||
/// <summary>Test gateway that substitutes the connection factory with a stub.</summary>
|
||||
private sealed class ConnectionFactoryStubGateway : DatabaseGateway
|
||||
{
|
||||
private readonly DbConnection _connection;
|
||||
|
||||
public ConnectionFactoryStubGateway(IExternalSystemRepository repository, DbConnection connection)
|
||||
: base(repository, NullLogger<DatabaseGateway>.Instance) => _connection = connection;
|
||||
|
||||
internal override DbConnection CreateConnection(string connectionString) => _connection;
|
||||
}
|
||||
|
||||
/// <summary>A DbConnection whose OpenAsync always fails, tracking whether it was disposed.</summary>
|
||||
private sealed class ThrowingDbConnection : DbConnection
|
||||
{
|
||||
public bool WasDisposed { get; private set; }
|
||||
|
||||
public override Task OpenAsync(CancellationToken cancellationToken) =>
|
||||
throw new InvalidOperationException("simulated open failure");
|
||||
public override void Open() => throw new InvalidOperationException("simulated open failure");
|
||||
|
||||
protected override void Dispose(bool disposing)
|
||||
{
|
||||
if (disposing) WasDisposed = true;
|
||||
base.Dispose(disposing);
|
||||
}
|
||||
|
||||
public override ValueTask DisposeAsync()
|
||||
{
|
||||
WasDisposed = true;
|
||||
return base.DisposeAsync();
|
||||
}
|
||||
|
||||
// Unused abstract members.
|
||||
[System.Diagnostics.CodeAnalysis.AllowNull]
|
||||
public override string ConnectionString { get; set; } = string.Empty;
|
||||
public override string Database => string.Empty;
|
||||
public override string DataSource => string.Empty;
|
||||
public override string ServerVersion => string.Empty;
|
||||
public override ConnectionState State => ConnectionState.Closed;
|
||||
public override void ChangeDatabase(string databaseName) => throw new NotSupportedException();
|
||||
public override void Close() { }
|
||||
protected override DbTransaction BeginDbTransaction(IsolationLevel il) => throw new NotSupportedException();
|
||||
protected override DbCommand CreateDbCommand() => throw new NotSupportedException();
|
||||
}
|
||||
}
|
||||
|
||||
@@ -328,6 +328,281 @@ public class ExternalSystemClientTests
|
||||
() => client.CallAsync("TestAPI", "getData", cancellationToken: cts.Token));
|
||||
}
|
||||
|
||||
// ── ExternalSystemGateway-004: per-system retry settings honoured for cached calls ──
|
||||
|
||||
[Fact]
|
||||
public async Task CachedCall_TransientFailure_BuffersWithSystemRetrySettings()
|
||||
{
|
||||
var system = new ExternalSystemDefinition("TestAPI", "https://api.example.com", "none")
|
||||
{
|
||||
Id = 1,
|
||||
MaxRetries = 7,
|
||||
RetryDelay = TimeSpan.FromSeconds(42),
|
||||
};
|
||||
var method = new ExternalSystemMethod("postData", "POST", "/post") { Id = 1, ExternalSystemDefinitionId = 1 };
|
||||
_repository.GetAllExternalSystemsAsync(Arg.Any<CancellationToken>())
|
||||
.Returns(new List<ExternalSystemDefinition> { system });
|
||||
_repository.GetMethodsByExternalSystemIdAsync(1, Arg.Any<CancellationToken>())
|
||||
.Returns(new List<ExternalSystemMethod> { method });
|
||||
|
||||
var httpClient = new HttpClient(new MockHttpMessageHandler(HttpStatusCode.InternalServerError, "boom"));
|
||||
_httpClientFactory.CreateClient(Arg.Any<string>()).Returns(httpClient);
|
||||
|
||||
var dbName = $"EsgRetry_{Guid.NewGuid():N}";
|
||||
var connStr = $"Data Source={dbName};Mode=Memory;Cache=Shared";
|
||||
using var keepAlive = new SqliteConnection(connStr);
|
||||
keepAlive.Open();
|
||||
var storage = new StoreAndForwardStorage(connStr, NullLogger<StoreAndForwardStorage>.Instance);
|
||||
await storage.InitializeAsync();
|
||||
// S&F defaults deliberately different from the system's settings.
|
||||
var sfOptions = new StoreAndForwardOptions
|
||||
{
|
||||
DefaultMaxRetries = 3,
|
||||
DefaultRetryInterval = TimeSpan.FromMinutes(10),
|
||||
RetryTimerInterval = TimeSpan.FromMinutes(10),
|
||||
};
|
||||
var sf = new StoreAndForwardService(storage, sfOptions, NullLogger<StoreAndForwardService>.Instance);
|
||||
|
||||
var client = new ExternalSystemClient(
|
||||
_httpClientFactory, _repository, NullLogger<ExternalSystemClient>.Instance,
|
||||
storeAndForward: sf);
|
||||
|
||||
var result = await client.CachedCallAsync("TestAPI", "postData");
|
||||
Assert.True(result.WasBuffered);
|
||||
|
||||
var depth = await storage.GetBufferDepthByCategoryAsync();
|
||||
Assert.Equal(1, depth[ScadaLink.Commons.Types.Enums.StoreAndForwardCategory.ExternalSystem]);
|
||||
|
||||
var (maxRetries, retryIntervalMs) = ReadBufferedRetrySettings(connStr);
|
||||
Assert.Equal(7, maxRetries);
|
||||
Assert.Equal((long)TimeSpan.FromSeconds(42).TotalMilliseconds, retryIntervalMs);
|
||||
}
|
||||
|
||||
private static (int MaxRetries, long RetryIntervalMs) ReadBufferedRetrySettings(string connStr)
|
||||
{
|
||||
using var conn = new SqliteConnection(connStr);
|
||||
conn.Open();
|
||||
using var cmd = conn.CreateCommand();
|
||||
cmd.CommandText = "SELECT max_retries, retry_interval_ms FROM sf_messages";
|
||||
using var reader = cmd.ExecuteReader();
|
||||
Assert.True(reader.Read(), "expected exactly one buffered message");
|
||||
var result = (reader.GetInt32(0), reader.GetInt64(1));
|
||||
Assert.False(reader.Read(), "expected exactly one buffered message");
|
||||
return result;
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task CachedCall_TransientFailure_ZeroMaxRetriesIsHonouredNotTreatedAsUnset()
|
||||
{
|
||||
// MaxRetries == 0 must mean "never retry", not "fall back to the S&F default".
|
||||
var system = new ExternalSystemDefinition("TestAPI", "https://api.example.com", "none")
|
||||
{
|
||||
Id = 1,
|
||||
MaxRetries = 0,
|
||||
RetryDelay = TimeSpan.FromSeconds(5),
|
||||
};
|
||||
var method = new ExternalSystemMethod("postData", "POST", "/post") { Id = 1, ExternalSystemDefinitionId = 1 };
|
||||
_repository.GetAllExternalSystemsAsync(Arg.Any<CancellationToken>())
|
||||
.Returns(new List<ExternalSystemDefinition> { system });
|
||||
_repository.GetMethodsByExternalSystemIdAsync(1, Arg.Any<CancellationToken>())
|
||||
.Returns(new List<ExternalSystemMethod> { method });
|
||||
|
||||
var httpClient = new HttpClient(new MockHttpMessageHandler(HttpStatusCode.InternalServerError, "boom"));
|
||||
_httpClientFactory.CreateClient(Arg.Any<string>()).Returns(httpClient);
|
||||
|
||||
var dbName = $"EsgRetryZero_{Guid.NewGuid():N}";
|
||||
var connStr = $"Data Source={dbName};Mode=Memory;Cache=Shared";
|
||||
using var keepAlive = new SqliteConnection(connStr);
|
||||
keepAlive.Open();
|
||||
var storage = new StoreAndForwardStorage(connStr, NullLogger<StoreAndForwardStorage>.Instance);
|
||||
await storage.InitializeAsync();
|
||||
var sfOptions = new StoreAndForwardOptions
|
||||
{
|
||||
DefaultMaxRetries = 99,
|
||||
DefaultRetryInterval = TimeSpan.FromMinutes(10),
|
||||
RetryTimerInterval = TimeSpan.FromMinutes(10),
|
||||
};
|
||||
var sf = new StoreAndForwardService(storage, sfOptions, NullLogger<StoreAndForwardService>.Instance);
|
||||
|
||||
var client = new ExternalSystemClient(
|
||||
_httpClientFactory, _repository, NullLogger<ExternalSystemClient>.Instance,
|
||||
storeAndForward: sf);
|
||||
|
||||
await client.CachedCallAsync("TestAPI", "postData");
|
||||
|
||||
var (maxRetries, _) = ReadBufferedRetrySettings(connStr);
|
||||
Assert.Equal(0, maxRetries); // honoured — not the default of 99
|
||||
}
|
||||
|
||||
// ── ExternalSystemGateway-005: HttpRequestMessage / HttpResponseMessage disposal ──
|
||||
|
||||
[Fact]
|
||||
public async Task Call_SuccessfulHttp_DisposesRequestAndResponse()
|
||||
{
|
||||
var system = new ExternalSystemDefinition("TestAPI", "https://api.example.com", "none") { Id = 1 };
|
||||
var method = new ExternalSystemMethod("getData", "GET", "/data") { Id = 1, ExternalSystemDefinitionId = 1 };
|
||||
_repository.GetAllExternalSystemsAsync(Arg.Any<CancellationToken>())
|
||||
.Returns(new List<ExternalSystemDefinition> { system });
|
||||
_repository.GetMethodsByExternalSystemIdAsync(1, Arg.Any<CancellationToken>())
|
||||
.Returns(new List<ExternalSystemMethod> { method });
|
||||
|
||||
var handler = new DisposalTrackingHandler(HttpStatusCode.OK, "{\"ok\":true}");
|
||||
var httpClient = new HttpClient(handler);
|
||||
_httpClientFactory.CreateClient(Arg.Any<string>()).Returns(httpClient);
|
||||
|
||||
var client = new ExternalSystemClient(
|
||||
_httpClientFactory, _repository, NullLogger<ExternalSystemClient>.Instance);
|
||||
|
||||
await client.CallAsync("TestAPI", "getData");
|
||||
|
||||
Assert.True(handler.RequestDisposed, "HttpRequestMessage was not disposed");
|
||||
Assert.True(handler.ResponseContentDisposed, "HttpResponseMessage content was not disposed");
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task Call_PermanentFailure_StillDisposesRequestAndResponse()
|
||||
{
|
||||
var system = new ExternalSystemDefinition("TestAPI", "https://api.example.com", "none") { Id = 1 };
|
||||
var method = new ExternalSystemMethod("badMethod", "POST", "/bad") { Id = 1, ExternalSystemDefinitionId = 1 };
|
||||
_repository.GetAllExternalSystemsAsync(Arg.Any<CancellationToken>())
|
||||
.Returns(new List<ExternalSystemDefinition> { system });
|
||||
_repository.GetMethodsByExternalSystemIdAsync(1, Arg.Any<CancellationToken>())
|
||||
.Returns(new List<ExternalSystemMethod> { method });
|
||||
|
||||
var handler = new DisposalTrackingHandler(HttpStatusCode.BadRequest, "bad request");
|
||||
var httpClient = new HttpClient(handler);
|
||||
_httpClientFactory.CreateClient(Arg.Any<string>()).Returns(httpClient);
|
||||
|
||||
var client = new ExternalSystemClient(
|
||||
_httpClientFactory, _repository, NullLogger<ExternalSystemClient>.Instance);
|
||||
|
||||
await client.CallAsync("TestAPI", "badMethod");
|
||||
|
||||
Assert.True(handler.RequestDisposed, "HttpRequestMessage was not disposed on the error path");
|
||||
Assert.True(handler.ResponseContentDisposed, "HttpResponseMessage content was not disposed on the error path");
|
||||
}
|
||||
|
||||
// ── ExternalSystemGateway-006: BuildUrl — empty path must not append a trailing slash ──
|
||||
|
||||
[Fact]
|
||||
public async Task Call_MethodWithEmptyPath_DoesNotAppendTrailingSlash()
|
||||
{
|
||||
var system = new ExternalSystemDefinition("TestAPI", "https://api.example.com/api", "none") { Id = 1 };
|
||||
var method = new ExternalSystemMethod("root", "GET", "") { Id = 1, ExternalSystemDefinitionId = 1 };
|
||||
_repository.GetAllExternalSystemsAsync(Arg.Any<CancellationToken>())
|
||||
.Returns(new List<ExternalSystemDefinition> { system });
|
||||
_repository.GetMethodsByExternalSystemIdAsync(1, Arg.Any<CancellationToken>())
|
||||
.Returns(new List<ExternalSystemMethod> { method });
|
||||
|
||||
var handler = new RequestCapturingHandler(HttpStatusCode.OK, "{}");
|
||||
var httpClient = new HttpClient(handler);
|
||||
_httpClientFactory.CreateClient(Arg.Any<string>()).Returns(httpClient);
|
||||
|
||||
var client = new ExternalSystemClient(
|
||||
_httpClientFactory, _repository, NullLogger<ExternalSystemClient>.Instance);
|
||||
|
||||
await client.CallAsync("TestAPI", "root");
|
||||
|
||||
Assert.Equal("https://api.example.com/api", handler.LastUri!.ToString());
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task Call_MethodWithPath_BuildsExpectedUrl()
|
||||
{
|
||||
var system = new ExternalSystemDefinition("TestAPI", "https://api.example.com/api", "none") { Id = 1 };
|
||||
var method = new ExternalSystemMethod("getData", "GET", "/data") { Id = 1, ExternalSystemDefinitionId = 1 };
|
||||
_repository.GetAllExternalSystemsAsync(Arg.Any<CancellationToken>())
|
||||
.Returns(new List<ExternalSystemDefinition> { system });
|
||||
_repository.GetMethodsByExternalSystemIdAsync(1, Arg.Any<CancellationToken>())
|
||||
.Returns(new List<ExternalSystemMethod> { method });
|
||||
|
||||
var handler = new RequestCapturingHandler(HttpStatusCode.OK, "{}");
|
||||
var httpClient = new HttpClient(handler);
|
||||
_httpClientFactory.CreateClient(Arg.Any<string>()).Returns(httpClient);
|
||||
|
||||
var client = new ExternalSystemClient(
|
||||
_httpClientFactory, _repository, NullLogger<ExternalSystemClient>.Instance);
|
||||
|
||||
await client.CallAsync("TestAPI", "getData");
|
||||
|
||||
Assert.Equal("https://api.example.com/api/data", handler.LastUri!.ToString());
|
||||
}
|
||||
|
||||
// ── ExternalSystemGateway-007: external error body must be truncated, not echoed verbatim ──
|
||||
|
||||
[Fact]
|
||||
public async Task Call_PermanentFailureWithHugeErrorBody_TruncatesErrorMessage()
|
||||
{
|
||||
var system = new ExternalSystemDefinition("TestAPI", "https://api.example.com", "none") { Id = 1 };
|
||||
var method = new ExternalSystemMethod("badMethod", "POST", "/bad") { Id = 1, ExternalSystemDefinitionId = 1 };
|
||||
_repository.GetAllExternalSystemsAsync(Arg.Any<CancellationToken>())
|
||||
.Returns(new List<ExternalSystemDefinition> { system });
|
||||
_repository.GetMethodsByExternalSystemIdAsync(1, Arg.Any<CancellationToken>())
|
||||
.Returns(new List<ExternalSystemMethod> { method });
|
||||
|
||||
var hugeBody = new string('X', 500_000);
|
||||
var handler = new MockHttpMessageHandler(HttpStatusCode.BadRequest, hugeBody);
|
||||
var httpClient = new HttpClient(handler);
|
||||
_httpClientFactory.CreateClient(Arg.Any<string>()).Returns(httpClient);
|
||||
|
||||
var client = new ExternalSystemClient(
|
||||
_httpClientFactory, _repository, NullLogger<ExternalSystemClient>.Instance);
|
||||
|
||||
var result = await client.CallAsync("TestAPI", "badMethod");
|
||||
|
||||
Assert.False(result.Success);
|
||||
// The error message must be bounded — a misbehaving endpoint cannot inflate
|
||||
// every script-visible error string / event-log entry.
|
||||
Assert.True(result.ErrorMessage!.Length < 4096,
|
||||
$"Error message was {result.ErrorMessage.Length} chars — expected it to be truncated");
|
||||
}
|
||||
|
||||
// ── ExternalSystemGateway-008: cancellation of a CachedCall must not be buffered ──
|
||||
|
||||
[Fact]
|
||||
public async Task CachedCall_CallerCancellation_IsNotBufferedAsTransient()
|
||||
{
|
||||
var system = new ExternalSystemDefinition("TestAPI", "https://api.example.com", "none") { Id = 1 };
|
||||
var method = new ExternalSystemMethod("postData", "POST", "/post") { Id = 1, ExternalSystemDefinitionId = 1 };
|
||||
_repository.GetAllExternalSystemsAsync(Arg.Any<CancellationToken>())
|
||||
.Returns(new List<ExternalSystemDefinition> { system });
|
||||
_repository.GetMethodsByExternalSystemIdAsync(1, Arg.Any<CancellationToken>())
|
||||
.Returns(new List<ExternalSystemMethod> { method });
|
||||
|
||||
var httpClient = new HttpClient(new HangingHttpMessageHandler(TimeSpan.FromMinutes(10)));
|
||||
_httpClientFactory.CreateClient(Arg.Any<string>()).Returns(httpClient);
|
||||
|
||||
var dbName = $"EsgCancel_{Guid.NewGuid():N}";
|
||||
var connStr = $"Data Source={dbName};Mode=Memory;Cache=Shared";
|
||||
using var keepAlive = new SqliteConnection(connStr);
|
||||
keepAlive.Open();
|
||||
var storage = new StoreAndForwardStorage(connStr, NullLogger<StoreAndForwardStorage>.Instance);
|
||||
await storage.InitializeAsync();
|
||||
var sfOptions = new StoreAndForwardOptions
|
||||
{
|
||||
DefaultRetryInterval = TimeSpan.FromMinutes(10),
|
||||
RetryTimerInterval = TimeSpan.FromMinutes(10),
|
||||
};
|
||||
var sf = new StoreAndForwardService(storage, sfOptions, NullLogger<StoreAndForwardService>.Instance);
|
||||
|
||||
var options = new ExternalSystemGatewayOptions { DefaultHttpTimeout = TimeSpan.FromMinutes(5) };
|
||||
var client = new ExternalSystemClient(
|
||||
_httpClientFactory, _repository, NullLogger<ExternalSystemClient>.Instance,
|
||||
storeAndForward: sf,
|
||||
options: Microsoft.Extensions.Options.Options.Create(options));
|
||||
|
||||
using var cts = new CancellationTokenSource(TimeSpan.FromMilliseconds(200));
|
||||
|
||||
// Caller asked to abandon the work — it must NOT be buffered for retry.
|
||||
await Assert.ThrowsAnyAsync<OperationCanceledException>(
|
||||
() => client.CachedCallAsync("TestAPI", "postData", cancellationToken: cts.Token));
|
||||
|
||||
var depth = await storage.GetBufferDepthByCategoryAsync();
|
||||
Assert.False(
|
||||
depth.TryGetValue(ScadaLink.Commons.Types.Enums.StoreAndForwardCategory.ExternalSystem, out var n) && n > 0,
|
||||
"A caller-cancelled CachedCall must not be buffered for retry");
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Test helper: mock HTTP message handler.
|
||||
/// </summary>
|
||||
@@ -351,6 +626,72 @@ public class ExternalSystemClientTests
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Test helper: tracks disposal of the request and the response content.
|
||||
/// </summary>
|
||||
private class DisposalTrackingHandler : HttpMessageHandler
|
||||
{
|
||||
private readonly HttpStatusCode _statusCode;
|
||||
private readonly string _body;
|
||||
|
||||
public DisposalTrackingHandler(HttpStatusCode statusCode, string body)
|
||||
{
|
||||
_statusCode = statusCode;
|
||||
_body = body;
|
||||
}
|
||||
|
||||
public bool RequestDisposed { get; private set; }
|
||||
public bool ResponseContentDisposed { get; private set; }
|
||||
|
||||
protected override Task<HttpResponseMessage> SendAsync(HttpRequestMessage request, CancellationToken cancellationToken)
|
||||
{
|
||||
request.Content = new TrackingContent(string.Empty, () => RequestDisposed = true);
|
||||
var response = new HttpResponseMessage(_statusCode)
|
||||
{
|
||||
Content = new TrackingContent(_body, () => ResponseContentDisposed = true)
|
||||
};
|
||||
return Task.FromResult(response);
|
||||
}
|
||||
|
||||
private sealed class TrackingContent : StringContent
|
||||
{
|
||||
private readonly Action _onDispose;
|
||||
public TrackingContent(string content, Action onDispose) : base(content) => _onDispose = onDispose;
|
||||
|
||||
protected override void Dispose(bool disposing)
|
||||
{
|
||||
if (disposing) _onDispose();
|
||||
base.Dispose(disposing);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Test helper: captures the request URI of the last request.
|
||||
/// </summary>
|
||||
private class RequestCapturingHandler : HttpMessageHandler
|
||||
{
|
||||
private readonly HttpStatusCode _statusCode;
|
||||
private readonly string _body;
|
||||
|
||||
public RequestCapturingHandler(HttpStatusCode statusCode, string body)
|
||||
{
|
||||
_statusCode = statusCode;
|
||||
_body = body;
|
||||
}
|
||||
|
||||
public Uri? LastUri { get; private set; }
|
||||
|
||||
protected override Task<HttpResponseMessage> SendAsync(HttpRequestMessage request, CancellationToken cancellationToken)
|
||||
{
|
||||
LastUri = request.RequestUri;
|
||||
return Task.FromResult(new HttpResponseMessage(_statusCode)
|
||||
{
|
||||
Content = new StringContent(_body)
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Test helper: an HTTP handler that hangs until cancelled (simulates a slow/hung system).
|
||||
/// </summary>
|
||||
|
||||
@@ -219,6 +219,94 @@ public class CentralHealthAggregatorTests
|
||||
Assert.True(final.IsOnline);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// HealthMonitoring-007 regression: a heartbeat for a site that has not yet
|
||||
/// sent a full report (e.g. immediately after a central restart/failover, when
|
||||
/// the aggregator's in-memory state is empty) must register the site as online
|
||||
/// rather than being silently discarded. Otherwise reachable sites show as
|
||||
/// "unknown" for up to a full report interval during the failover window.
|
||||
/// </summary>
|
||||
[Fact]
|
||||
public void MarkHeartbeat_RegistersUnknownSite_AsOnlineAwaitingReport()
|
||||
{
|
||||
var now = _timeProvider.GetUtcNow();
|
||||
|
||||
_aggregator.MarkHeartbeat("site-new", now);
|
||||
|
||||
var state = _aggregator.GetSiteState("site-new");
|
||||
Assert.NotNull(state);
|
||||
Assert.True(state.IsOnline);
|
||||
Assert.Null(state.LatestReport);
|
||||
Assert.Equal(now, state.LastHeartbeatAt);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void MarkHeartbeat_KeepsSiteOnline_BetweenReports()
|
||||
{
|
||||
_aggregator.ProcessReport(MakeReport("site-1", 1));
|
||||
|
||||
// Time advances past the offline timeout, but heartbeats keep arriving.
|
||||
_timeProvider.Advance(TimeSpan.FromSeconds(45));
|
||||
_aggregator.MarkHeartbeat("site-1", _timeProvider.GetUtcNow());
|
||||
_timeProvider.Advance(TimeSpan.FromSeconds(45));
|
||||
_aggregator.MarkHeartbeat("site-1", _timeProvider.GetUtcNow());
|
||||
|
||||
_aggregator.CheckForOfflineSites();
|
||||
|
||||
Assert.True(_aggregator.GetSiteState("site-1")!.IsOnline);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void MarkHeartbeat_BringsOfflineSiteBackOnline()
|
||||
{
|
||||
_aggregator.ProcessReport(MakeReport("site-1", 1));
|
||||
|
||||
_timeProvider.Advance(TimeSpan.FromSeconds(61));
|
||||
_aggregator.CheckForOfflineSites();
|
||||
Assert.False(_aggregator.GetSiteState("site-1")!.IsOnline);
|
||||
|
||||
_aggregator.MarkHeartbeat("site-1", _timeProvider.GetUtcNow());
|
||||
Assert.True(_aggregator.GetSiteState("site-1")!.IsOnline);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// HealthMonitoring-005 regression: the synthetic "central" site has no
|
||||
/// heartbeat source — its LastHeartbeatAt is only bumped by the 30s
|
||||
/// CentralHealthReportLoop self-report. A single skipped/late self-report
|
||||
/// (leader GC pause, brief stall, mid-failover) would leave it with no signal
|
||||
/// for >60s and flap it offline even though the central cluster is healthy.
|
||||
/// The "central" keyspace entry must get a longer offline grace than real sites.
|
||||
/// </summary>
|
||||
[Fact]
|
||||
public void OfflineDetection_CentralSite_HasLongerGraceThanRealSites()
|
||||
{
|
||||
_aggregator.ProcessReport(MakeReport(CentralHealthReportLoop.CentralSiteId, 1));
|
||||
_aggregator.ProcessReport(MakeReport("site-1", 1));
|
||||
|
||||
// One missed central self-report (~30s) plus the normal 60s site timeout:
|
||||
// a real site would already be offline here, but central must not be —
|
||||
// it only gets one self-report every 30s, so 60s is barely two reports.
|
||||
_timeProvider.Advance(TimeSpan.FromSeconds(75));
|
||||
_aggregator.CheckForOfflineSites();
|
||||
|
||||
Assert.False(_aggregator.GetSiteState("site-1")!.IsOnline);
|
||||
Assert.True(
|
||||
_aggregator.GetSiteState(CentralHealthReportLoop.CentralSiteId)!.IsOnline,
|
||||
"central must survive a single missed self-report");
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void OfflineDetection_CentralSite_StillGoesOfflineOnGenuineLoss()
|
||||
{
|
||||
_aggregator.ProcessReport(MakeReport(CentralHealthReportLoop.CentralSiteId, 1));
|
||||
|
||||
// Well beyond even the central grace window — genuine total loss.
|
||||
_timeProvider.Advance(TimeSpan.FromMinutes(10));
|
||||
_aggregator.CheckForOfflineSites();
|
||||
|
||||
Assert.False(_aggregator.GetSiteState(CentralHealthReportLoop.CentralSiteId)!.IsOnline);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void SequenceNumberReset_RejectedUntilExceedsPrevMax()
|
||||
{
|
||||
|
||||
@@ -0,0 +1,134 @@
|
||||
using Microsoft.Extensions.Logging.Abstractions;
|
||||
using Microsoft.Extensions.Options;
|
||||
using ScadaLink.Commons.Messages.Health;
|
||||
|
||||
namespace ScadaLink.HealthMonitoring.Tests;
|
||||
|
||||
/// <summary>
|
||||
/// HealthMonitoring-009 regression: the central self-report loop had no test
|
||||
/// coverage at all. These tests exercise leader-only gating (SelfIsPrimary),
|
||||
/// self-report generation for siteId="central", and monotonic sequence
|
||||
/// assignment.
|
||||
/// </summary>
|
||||
public class CentralHealthReportLoopTests
|
||||
{
|
||||
private sealed class FakeClusterNodeProvider : IClusterNodeProvider
|
||||
{
|
||||
public bool SelfIsPrimary { get; set; }
|
||||
public IReadOnlyList<NodeStatus> Nodes { get; set; } = [];
|
||||
public IReadOnlyList<NodeStatus> GetClusterNodes() => Nodes;
|
||||
}
|
||||
|
||||
private sealed class RecordingAggregator : ICentralHealthAggregator
|
||||
{
|
||||
public List<SiteHealthReport> Processed { get; } = [];
|
||||
public void ProcessReport(SiteHealthReport report) => Processed.Add(report);
|
||||
public void MarkHeartbeat(string siteId, DateTimeOffset receivedAt) { }
|
||||
public IReadOnlyDictionary<string, SiteHealthState> GetAllSiteStates() =>
|
||||
new Dictionary<string, SiteHealthState>();
|
||||
public SiteHealthState? GetSiteState(string siteId) => null;
|
||||
}
|
||||
|
||||
private static async Task RunLoopBriefly(CentralHealthReportLoop loop, int runForMs)
|
||||
{
|
||||
using var cts = new CancellationTokenSource(TimeSpan.FromMilliseconds(runForMs + 100));
|
||||
try
|
||||
{
|
||||
await loop.StartAsync(cts.Token);
|
||||
await Task.Delay(runForMs, CancellationToken.None);
|
||||
await loop.StopAsync(CancellationToken.None);
|
||||
}
|
||||
catch (OperationCanceledException) { }
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task GeneratesCentralReports_WhenSelfIsPrimary()
|
||||
{
|
||||
var collector = new SiteHealthCollector();
|
||||
var aggregator = new RecordingAggregator();
|
||||
var clusterNodes = new FakeClusterNodeProvider { SelfIsPrimary = true };
|
||||
var options = Options.Create(new HealthMonitoringOptions
|
||||
{
|
||||
ReportInterval = TimeSpan.FromMilliseconds(50)
|
||||
});
|
||||
|
||||
var loop = new CentralHealthReportLoop(
|
||||
collector, aggregator, clusterNodes, options,
|
||||
NullLogger<CentralHealthReportLoop>.Instance);
|
||||
|
||||
await RunLoopBriefly(loop, 250);
|
||||
|
||||
Assert.NotEmpty(aggregator.Processed);
|
||||
Assert.All(aggregator.Processed,
|
||||
r => Assert.Equal(CentralHealthReportLoop.CentralSiteId, r.SiteId));
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task GeneratesNoReports_WhenNotPrimary()
|
||||
{
|
||||
var collector = new SiteHealthCollector();
|
||||
var aggregator = new RecordingAggregator();
|
||||
var clusterNodes = new FakeClusterNodeProvider { SelfIsPrimary = false };
|
||||
var options = Options.Create(new HealthMonitoringOptions
|
||||
{
|
||||
ReportInterval = TimeSpan.FromMilliseconds(50)
|
||||
});
|
||||
|
||||
var loop = new CentralHealthReportLoop(
|
||||
collector, aggregator, clusterNodes, options,
|
||||
NullLogger<CentralHealthReportLoop>.Instance);
|
||||
|
||||
await RunLoopBriefly(loop, 250);
|
||||
|
||||
Assert.Empty(aggregator.Processed);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task AssignsMonotonicSequenceNumbers()
|
||||
{
|
||||
var collector = new SiteHealthCollector();
|
||||
var aggregator = new RecordingAggregator();
|
||||
var clusterNodes = new FakeClusterNodeProvider { SelfIsPrimary = true };
|
||||
var options = Options.Create(new HealthMonitoringOptions
|
||||
{
|
||||
ReportInterval = TimeSpan.FromMilliseconds(50)
|
||||
});
|
||||
|
||||
var loop = new CentralHealthReportLoop(
|
||||
collector, aggregator, clusterNodes, options,
|
||||
NullLogger<CentralHealthReportLoop>.Instance);
|
||||
|
||||
await RunLoopBriefly(loop, 300);
|
||||
|
||||
Assert.True(aggregator.Processed.Count >= 2,
|
||||
$"Expected at least 2 reports, got {aggregator.Processed.Count}");
|
||||
for (int i = 1; i < aggregator.Processed.Count; i++)
|
||||
{
|
||||
Assert.True(
|
||||
aggregator.Processed[i].SequenceNumber > aggregator.Processed[i - 1].SequenceNumber,
|
||||
$"Sequence numbers not strictly increasing at index {i}");
|
||||
}
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task SetsActiveNodeFlag_EvenWhenNotPrimary()
|
||||
{
|
||||
// The loop must still report the node's role to the collector when it is
|
||||
// the standby, so the standby's own node card shows the correct role.
|
||||
var collector = new SiteHealthCollector();
|
||||
var aggregator = new RecordingAggregator();
|
||||
var clusterNodes = new FakeClusterNodeProvider { SelfIsPrimary = false };
|
||||
var options = Options.Create(new HealthMonitoringOptions
|
||||
{
|
||||
ReportInterval = TimeSpan.FromMilliseconds(50)
|
||||
});
|
||||
|
||||
var loop = new CentralHealthReportLoop(
|
||||
collector, aggregator, clusterNodes, options,
|
||||
NullLogger<CentralHealthReportLoop>.Instance);
|
||||
|
||||
await RunLoopBriefly(loop, 150);
|
||||
|
||||
Assert.False(collector.IsActiveNode);
|
||||
}
|
||||
}
|
||||
@@ -138,6 +138,111 @@ public class SiteHealthCollectorTests
|
||||
Assert.Equal(0, report.SequenceNumber);
|
||||
}
|
||||
|
||||
// HealthMonitoring-009 regression: the remaining collector setters had no
|
||||
// "reflected in report" coverage. The following tests verify each setter's
|
||||
// value reaches CollectReport output.
|
||||
|
||||
[Fact]
|
||||
public void SetClusterNodes_ReflectedInReport()
|
||||
{
|
||||
var nodes = new List<ScadaLink.Commons.Messages.Health.NodeStatus>
|
||||
{
|
||||
new("node-a", true, "Active"),
|
||||
new("node-b", true, "Standby")
|
||||
};
|
||||
_collector.SetClusterNodes(nodes);
|
||||
|
||||
var report = _collector.CollectReport("site-1");
|
||||
|
||||
Assert.NotNull(report.ClusterNodes);
|
||||
Assert.Equal(2, report.ClusterNodes!.Count);
|
||||
Assert.Equal("node-a", report.ClusterNodes[0].Hostname);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void SetInstanceCounts_ReflectedInReport()
|
||||
{
|
||||
_collector.SetInstanceCounts(deployed: 10, enabled: 7, disabled: 3);
|
||||
|
||||
var report = _collector.CollectReport("site-1");
|
||||
|
||||
Assert.Equal(10, report.DeployedInstanceCount);
|
||||
Assert.Equal(7, report.EnabledInstanceCount);
|
||||
Assert.Equal(3, report.DisabledInstanceCount);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void SetParkedMessageCount_ReflectedInReport()
|
||||
{
|
||||
_collector.SetParkedMessageCount(42);
|
||||
|
||||
var report = _collector.CollectReport("site-1");
|
||||
|
||||
Assert.Equal(42, report.ParkedMessageCount);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void SetNodeHostname_ReflectedInReport()
|
||||
{
|
||||
_collector.SetNodeHostname("site-host-1");
|
||||
|
||||
var report = _collector.CollectReport("site-1");
|
||||
|
||||
Assert.Equal("site-host-1", report.NodeHostname);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void SetActiveNode_ReflectedInNodeRole()
|
||||
{
|
||||
_collector.SetActiveNode(true);
|
||||
Assert.Equal("Active", _collector.CollectReport("site-1").NodeRole);
|
||||
Assert.True(_collector.IsActiveNode);
|
||||
|
||||
_collector.SetActiveNode(false);
|
||||
Assert.Equal("Standby", _collector.CollectReport("site-1").NodeRole);
|
||||
Assert.False(_collector.IsActiveNode);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void UpdateTagQuality_ReflectedInReport()
|
||||
{
|
||||
_collector.UpdateTagQuality("opc-1", good: 80, bad: 15, uncertain: 5);
|
||||
|
||||
var report = _collector.CollectReport("site-1");
|
||||
|
||||
Assert.NotNull(report.DataConnectionTagQuality);
|
||||
var quality = report.DataConnectionTagQuality!["opc-1"];
|
||||
Assert.Equal(80, quality.Good);
|
||||
Assert.Equal(15, quality.Bad);
|
||||
Assert.Equal(5, quality.Uncertain);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void UpdateConnectionEndpoint_ReflectedInReport()
|
||||
{
|
||||
_collector.UpdateConnectionEndpoint("opc-1", "opc.tcp://plc-1:4840");
|
||||
|
||||
var report = _collector.CollectReport("site-1");
|
||||
|
||||
Assert.NotNull(report.DataConnectionEndpoints);
|
||||
Assert.Equal("opc.tcp://plc-1:4840", report.DataConnectionEndpoints!["opc-1"]);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void SetStoreAndForwardDepths_ReflectedInReport()
|
||||
{
|
||||
_collector.SetStoreAndForwardDepths(new Dictionary<string, int>
|
||||
{
|
||||
["ExternalSystem"] = 5,
|
||||
["Notification"] = 2
|
||||
});
|
||||
|
||||
var report = _collector.CollectReport("site-1");
|
||||
|
||||
Assert.Equal(5, report.StoreAndForwardBufferDepths["ExternalSystem"]);
|
||||
Assert.Equal(2, report.StoreAndForwardBufferDepths["Notification"]);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task ThreadSafety_ConcurrentIncrements()
|
||||
{
|
||||
|
||||
Reference in New Issue
Block a user