diff --git a/code-reviews/SiteRuntime/findings.md b/code-reviews/SiteRuntime/findings.md index 8843dd8..8294056 100644 --- a/code-reviews/SiteRuntime/findings.md +++ b/code-reviews/SiteRuntime/findings.md @@ -8,7 +8,7 @@ | Last reviewed | 2026-05-16 | | Reviewer | claude-agent | | Commit reviewed | `9c60592` | -| Open findings | 13 | +| Open findings | 5 | ## Summary @@ -176,10 +176,10 @@ longer drifts (this additionally addresses the root cause behind SiteRuntime-004 | | | |--|--| -| Severity | Medium | +| Severity | Medium — re-triaged: already fixed by the SiteRuntime-003 resolution. | | Category | Correctness & logic bugs | -| Status | Open | -| Location | `src/ScadaLink.SiteRuntime/Actors/DeploymentManagerActor.cs:239` | +| Status | Resolved | +| Location | `src/ScadaLink.SiteRuntime/Actors/DeploymentManagerActor.cs` (`ApplyDeployment`) | **Description** @@ -193,16 +193,24 @@ grow, but the in-memory `_totalDeployedCount` (reported to the health collector `UpdateInstanceCounts`) drifts upward and the reported "disabled" count becomes wrong. -**Recommendation** +**Re-triage (2026-05-16)** -Only increment `_totalDeployedCount` when the instance is genuinely new. Either -track whether this deploy replaced an existing config, or derive the deployed count -from storage / the union of running actors and disabled configs rather than -maintaining a hand-incremented counter. +Verified against the current source: this is **already fixed**. The SiteRuntime-003 +resolution replaced the fixed-delay reschedule with a shared `ApplyDeployment` helper +that takes an `isRedeploy` flag and guards the counter with `if (!isRedeploy) +_totalDeployedCount++;`. The redeploy path (`HandleTerminated`) always calls +`ApplyDeployment(..., isRedeploy: true)`, so the counter is no longer bumped on +redeployment. The regression test +`DeploymentManagerRedeployTests.Redeploy_ExistingInstance_DoesNotOverCountDeployedInstances` +already covers this and passes. No further code change was required. **Resolution** -_Unresolved._ +Resolved 2026-05-16 (`commit pending`): no new change needed — the root cause was +eliminated by the SiteRuntime-003 fix (the `isRedeploy` guard in `ApplyDeployment`). +Confirmed by the existing passing regression test +`Redeploy_ExistingInstance_DoesNotOverCountDeployedInstances`. Re-triaged from Open to +Resolved. ### SiteRuntime-005 — Deployment reports `Success` to central before persistence completes @@ -210,8 +218,8 @@ _Unresolved._ |--|--| | Severity | Medium | | Category | Error handling & resilience | -| Status | Open | -| Location | `src/ScadaLink.SiteRuntime/Actors/DeploymentManagerActor.cs:272` | +| Status | Resolved | +| Location | `src/ScadaLink.SiteRuntime/Actors/DeploymentManagerActor.cs` (`ApplyDeployment`, `HandleDeployPersistenceResult`) | **Description** @@ -232,7 +240,16 @@ At minimum, do not report `Success` until the config row is committed. **Resolution** -_Unresolved._ +Resolved 2026-05-16 (`commit pending`): root cause confirmed — `ApplyDeployment` sent +`DeploymentStatusResponse(Success)` synchronously before the persistence `Task.Run` +completed. The `Success` reply is now sent from `HandleDeployPersistenceResult` only +once the persistence result is known: on success it replies `Success`; on a +persistence failure it logs the error, stops the optimistically-created Instance +Actor, rolls back the deployed-instance counter, and replies +`DeploymentStatus.Failed` with the error message. `DeployPersistenceResult` carries an +`IsRedeploy` flag so the counter rollback is skipped for redeployments. Regression +tests: `DeploymentManagerMediumFindingsTests.Deploy_PersistenceFailure_ReportsFailedNotSuccess` +and `Deploy_Success_ReportsSuccessAndPersistsConfig`. ### SiteRuntime-006 — Site-local repositories read `SiteStorageService` private field via reflection @@ -240,8 +257,8 @@ _Unresolved._ |--|--| | Severity | Medium | | Category | Code organization & conventions | -| Status | Open | -| Location | `src/ScadaLink.SiteRuntime/Repositories/SiteExternalSystemRepository.cs:183`, `src/ScadaLink.SiteRuntime/Repositories/SiteNotificationRepository.cs:181` | +| Status | Resolved | +| Location | `src/ScadaLink.SiteRuntime/Repositories/SiteExternalSystemRepository.cs`, `src/ScadaLink.SiteRuntime/Repositories/SiteNotificationRepository.cs` | **Description** @@ -263,7 +280,16 @@ repositories. Remove the reflection entirely. **Resolution** -_Unresolved._ +Resolved 2026-05-16 (`commit pending`): root cause confirmed — both repositories +reflected into `SiteStorageService._connectionString`. `SiteStorageService` now +exposes a public `CreateConnection()` factory method that returns an unopened +`SqliteConnection` against the site database. Both `SiteExternalSystemRepository` and +`SiteNotificationRepository` now obtain connections via `_storage.CreateConnection()`; +all reflection (`Type.GetField` / `BindingFlags`) and the contradictory XML comments +have been removed. This is a fully in-module refactor — no cross-module design +decision was needed. Regression test: +`SiteRepositoryTests.ExternalSystemRepository_RoundTripsStoredDefinition` exercises +the repository's connection path end-to-end. ### SiteRuntime-007 — Synthetic entity IDs use the non-deterministic `string.GetHashCode()` @@ -271,8 +297,8 @@ _Unresolved._ |--|--| | Severity | Medium | | Category | Correctness & logic bugs | -| Status | Open | -| Location | `src/ScadaLink.SiteRuntime/Repositories/SiteExternalSystemRepository.cs:241`, `src/ScadaLink.SiteRuntime/Repositories/SiteNotificationRepository.cs:254` | +| Status | Resolved | +| Location | `src/ScadaLink.SiteRuntime/Repositories/SiteExternalSystemRepository.cs`, `src/ScadaLink.SiteRuntime/Repositories/SiteNotificationRepository.cs`, `src/ScadaLink.SiteRuntime/Repositories/SyntheticId.cs` | **Description** @@ -294,7 +320,18 @@ rather than synthesising integer IDs. **Resolution** -_Unresolved._ +Resolved 2026-05-16 (`commit pending`): root cause confirmed — both repositories used +`name.GetHashCode()`, which is per-process randomized on .NET Core. A new internal +`SyntheticId` helper computes a deterministic, process-stable 31-bit ID using the +FNV-1a hash over the name's UTF-8 bytes. Both `GenerateSyntheticId` methods now +delegate to `SyntheticId.From(name)`. (The integer-keyed lookups are kept because +they are mandated by the shared `IExternalSystemRepository`/`INotificationRepository` +contracts in Commons — changing those contracts to name-keyed would be a cross-module +change outside this module's scope; the deterministic hash resolves the correctness +defect within scope.) Regression tests: +`SiteRepositoryTests.ExternalSystemRepository_SyntheticId_IsStableAcrossRestart` and +`NotificationRepository_SyntheticId_IsStableAcrossRestart` re-create the service to +simulate a process restart and confirm by-ID lookups still resolve. ### SiteRuntime-008 — Blocking `.GetAwaiter().GetResult()` on the actor thread during startup @@ -302,8 +339,8 @@ _Unresolved._ |--|--| | Severity | Medium | | Category | Akka.NET conventions | -| Status | Open | -| Location | `src/ScadaLink.SiteRuntime/Actors/DeploymentManagerActor.cs:479` | +| Status | Resolved | +| Location | `src/ScadaLink.SiteRuntime/Actors/DeploymentManagerActor.cs` (`HandleStartupConfigsLoaded`, `LoadSharedScriptsFromStorage`, `HandleSharedScriptsLoaded`) | **Description** @@ -327,7 +364,18 @@ back. **Resolution** -_Unresolved._ +Resolved 2026-05-16 (`commit pending`): root cause confirmed — the blocking +`.GetAwaiter().GetResult()` and Roslyn compilation ran on the singleton's mailbox +thread inside `HandleStartupConfigsLoaded`. `LoadSharedScriptsFromStorage` now runs +the SQLite read **and** the Roslyn compilation on a background `Task.Run` and pipes a +new internal `SharedScriptsLoaded` message back to the actor. A new +`HandleSharedScriptsLoaded` handler then begins staggered Instance Actor creation, so +the compilation→creation ordering is preserved without ever blocking the mailbox. A +shared-script load failure is logged and startup proceeds (scripts needing a missing +shared script fail at execution time). Regression test: +`DeploymentManagerMediumFindingsTests.Startup_WithSharedScripts_LoadsConfigsAndStaysResponsive` +(confirms startup completes and the actor stays responsive with shared scripts +present). ### SiteRuntime-009 — Script execution actors run scripts on the default thread pool, not a dedicated dispatcher @@ -335,8 +383,8 @@ _Unresolved._ |--|--| | Severity | Medium | | Category | Akka.NET conventions | -| Status | Open | -| Location | `src/ScadaLink.SiteRuntime/Actors/ScriptExecutionActor.cs:72`, `src/ScadaLink.SiteRuntime/Actors/ScriptActor.cs:289`, `src/ScadaLink.SiteRuntime/Actors/AlarmExecutionActor.cs:57` | +| Status | Resolved | +| Location | `src/ScadaLink.SiteRuntime/Actors/ScriptExecutionActor.cs`, `src/ScadaLink.SiteRuntime/Actors/AlarmExecutionActor.cs`, `src/ScadaLink.SiteRuntime/Scripts/ScriptExecutionScheduler.cs` | **Description** @@ -359,7 +407,19 @@ way, remove the "in production, configure…" comments by actually configuring i **Resolution** -_Unresolved._ +Resolved 2026-05-16 (`commit pending`): root cause confirmed — script and alarm +on-trigger bodies ran inside a bare `Task.Run` on the shared `ThreadPool`. The +recommendation's `TaskScheduler` option was taken because it is fully in-module (a +HOCON dispatcher would require editing the Host's ActorSystem config, out of scope). +A new `ScriptExecutionScheduler` provides a bounded set of dedicated background +threads (count from the new `SiteRuntimeOptions.ScriptExecutionThreadCount`, default +8). `ScriptExecutionActor` and `AlarmExecutionActor` now run their bodies via +`Task.Factory.StartNew(..., ScriptExecutionScheduler.Shared(options)).Unwrap()` +instead of `Task.Run`, so blocking script I/O is contained to those dedicated threads +and cannot starve the global pool. The misleading "in production, configure a +dedicated dispatcher" comments were removed. Regression tests: +`ScriptExecutionSchedulerTests` (`Scheduler_RunsWork_OffTheThreadPool`, +`Scheduler_RespectsConfiguredThreadCount`, `Scheduler_Shared_ReturnsSameInstanceForOptions`). ### SiteRuntime-010 — `EnsureDclConnections` never updates a connection whose configuration changed @@ -367,8 +427,8 @@ _Unresolved._ |--|--| | Severity | Medium | | Category | Correctness & logic bugs | -| Status | Open | -| Location | `src/ScadaLink.SiteRuntime/Actors/DeploymentManagerActor.cs:413` | +| Status | Resolved | +| Location | `src/ScadaLink.SiteRuntime/Actors/DeploymentManagerActor.cs` (`EnsureDclConnections`, `ComputeConnectionConfigHash`) | **Description** @@ -390,7 +450,15 @@ the name. **Resolution** -_Unresolved._ +Resolved 2026-05-16 (`commit pending`): root cause confirmed — the cache was a +name-only `HashSet`, so a changed connection config was silently dropped. +`_createdConnections` is now a `Dictionary` mapping connection name to +a SHA-256 hash of its protocol/primary-config/backup-config/failover-retry-count +(`ComputeConnectionConfigHash`). A connection whose hash is unchanged is still +skipped; a connection whose config changed re-issues a `CreateConnectionCommand` so +the DCL adopts the new configuration. Regression tests: +`DeploymentManagerMediumFindingsTests.EnsureDclConnections_ConnectionConfigChanged_ReissuesCreateCommand` +and `EnsureDclConnections_UnchangedConfig_DoesNotReissueCreateCommand`. ### SiteRuntime-011 — Trust-model validation is a substring scan and is both over- and under-inclusive @@ -398,8 +466,8 @@ _Unresolved._ |--|--| | Severity | Medium | | Category | Security | -| Status | Open | -| Location | `src/ScadaLink.SiteRuntime/Scripts/ScriptCompilationService.cs:52` | +| Status | Resolved | +| Location | `src/ScadaLink.SiteRuntime/Scripts/ScriptCompilationService.cs` (`ValidateTrustModel`) | **Description** @@ -430,7 +498,22 @@ unused `isAllowed` variable. **Resolution** -_Unresolved._ +Resolved 2026-05-16 (`commit pending`): root cause confirmed — `ValidateTrustModel` +was a raw `string.Contains`/`IndexOf` scan of the source text, with a dead `isAllowed` +variable. It is now Roslyn semantic analysis: the script is parsed and a +`CSharpCompilation` + `SemanticModel` are built; every name/member/object-creation +node is resolved to its symbol and the symbol's containing namespace and +fully-qualified containing type are checked against the forbidden roots. Bare +namespace symbols are ignored (so the `System.Threading` qualifier of the allowed +`System.Threading.Tasks.Task` no longer false-positives). A name that cannot be +resolved (a type from an assembly deliberately absent from the script's references) +falls back to a syntactic fully-qualified-name check, so e.g. `System.Net.Http` +references are still rejected. The dead `isAllowed` variable was removed. This fixes +both the bypass (`global::`/alias-qualified forbidden types) and the false positives +(forbidden namespace string in a comment, string literal, or unrelated identifier). +Regression tests: new `TrustModelSemanticTests` (alias/`global::` detection, comment/ +literal/identifier non-detection, allowed-exception resolution); all 39 existing +`SandboxTests` + `ScriptCompilationServiceTests` continue to pass. ### SiteRuntime-012 — `AttributeAccessor`/`ScopeAccessors` block the script on a synchronous Ask diff --git a/src/ScadaLink.SiteRuntime/Actors/AlarmActor.cs b/src/ScadaLink.SiteRuntime/Actors/AlarmActor.cs index 5483172..72e1686 100644 --- a/src/ScadaLink.SiteRuntime/Actors/AlarmActor.cs +++ b/src/ScadaLink.SiteRuntime/Actors/AlarmActor.cs @@ -460,7 +460,8 @@ public class AlarmActor : ReceiveActor var executionId = $"{_alarmName}-alarm-exec-{_executionCounter++}"; - // NOTE: In production, configure a dedicated blocking I/O dispatcher via HOCON. + // SiteRuntime-009: the on-trigger script body runs on the dedicated + // ScriptExecutionScheduler, not the shared .NET thread pool. var props = Props.Create(() => new AlarmExecutionActor( _alarmName, _instanceName, diff --git a/src/ScadaLink.SiteRuntime/Actors/AlarmExecutionActor.cs b/src/ScadaLink.SiteRuntime/Actors/AlarmExecutionActor.cs index c288774..693df6f 100644 --- a/src/ScadaLink.SiteRuntime/Actors/AlarmExecutionActor.cs +++ b/src/ScadaLink.SiteRuntime/Actors/AlarmExecutionActor.cs @@ -54,7 +54,11 @@ public class AlarmExecutionActor : ReceiveActor { var timeout = TimeSpan.FromSeconds(options.ScriptExecutionTimeoutSeconds); - _ = Task.Run(async () => + // SiteRuntime-009: run the alarm on-trigger body on the dedicated + // script-execution scheduler, not the shared .NET thread pool. + var scheduler = ScriptExecutionScheduler.Shared(options); + + _ = Task.Factory.StartNew(async () => { using var cts = new CancellationTokenSource(timeout); try @@ -108,6 +112,6 @@ public class AlarmExecutionActor : ReceiveActor { self.Tell(PoisonPill.Instance); } - }); + }, CancellationToken.None, TaskCreationOptions.DenyChildAttach, scheduler).Unwrap(); } } diff --git a/src/ScadaLink.SiteRuntime/Actors/DeploymentManagerActor.cs b/src/ScadaLink.SiteRuntime/Actors/DeploymentManagerActor.cs index f414ca4..74fdb11 100644 --- a/src/ScadaLink.SiteRuntime/Actors/DeploymentManagerActor.cs +++ b/src/ScadaLink.SiteRuntime/Actors/DeploymentManagerActor.cs @@ -99,6 +99,7 @@ public class DeploymentManagerActor : ReceiveActor, IWithTimers // Internal startup messages Receive(HandleStartupConfigsLoaded); + Receive(HandleSharedScriptsLoaded); Receive(HandleStartNextBatch); // Internal enable result @@ -156,7 +157,14 @@ public class DeploymentManagerActor : ReceiveActor, IWithTimers } /// - /// Processes the loaded configs from SQLite and begins staggered Instance Actor creation. + /// Processes the loaded configs from SQLite. + /// + /// SiteRuntime-008: shared scripts must be compiled before Instance Actors are + /// created, but the SQLite read and Roslyn compilation must not block the + /// singleton's mailbox. The compilation is run on a background task and a + /// message is piped back; only then does + /// staggered Instance Actor creation begin. The deployed configs are stashed on the + /// actor field in the meantime. /// private void HandleStartupConfigsLoaded(StartupConfigsLoaded msg) { @@ -166,9 +174,6 @@ public class DeploymentManagerActor : ReceiveActor, IWithTimers return; } - // Load and compile shared scripts from SQLite before creating Instance Actors - LoadSharedScriptsFromStorage(); - var enabledConfigs = msg.Configs.Where(c => c.IsEnabled).ToList(); _totalDeployedCount = msg.Configs.Count; _logger.LogInformation( @@ -176,11 +181,25 @@ public class DeploymentManagerActor : ReceiveActor, IWithTimers msg.Configs.Count, enabledConfigs.Count); UpdateInstanceCounts(); - if (enabledConfigs.Count == 0) + // Load and compile shared scripts off the actor thread, then resume startup. + LoadSharedScriptsFromStorage(enabledConfigs); + } + + /// + /// SiteRuntime-008: once shared scripts have been compiled off-thread, begins + /// staggered Instance Actor creation for the enabled configs captured at startup. + /// + private void HandleSharedScriptsLoaded(SharedScriptsLoaded msg) + { + _logger.LogInformation( + "Loaded {Compiled}/{Total} shared scripts from SQLite", + msg.CompiledCount, msg.TotalCount); + + if (msg.EnabledConfigs.Count == 0) return; // Start the first batch immediately - var batchState = new BatchState(enabledConfigs, 0); + var batchState = new BatchState(msg.EnabledConfigs, 0); Self.Tell(new StartNextBatch(batchState)); } @@ -275,6 +294,13 @@ public class DeploymentManagerActor : ReceiveActor, IWithTimers /// Creates the Instance Actor, persists the config, and replies to the deployer. /// A redeployment is an update of an existing instance, so the deployed-instance /// counter is only incremented for genuinely new deployments. + /// + /// SiteRuntime-005: the deployer is not told + /// until SQLite persistence has committed. The site's deployed-config store is the + /// durable source of truth — a config that was never persisted would be silently lost + /// on the next restart/failover, so reporting Success before the row is committed is + /// incorrect. The reply is sent from once + /// the persistence outcome is known. /// private void ApplyDeployment(DeployInstanceCommand command, IActorRef sender, bool isRedeploy) { @@ -307,33 +333,56 @@ public class DeploymentManagerActor : ReceiveActor, IWithTimers instanceName, command.FlattenedConfigurationJson, command.DeploymentId, command.RevisionHash, true)); - return new DeployPersistenceResult(command.DeploymentId, instanceName, true, null, sender); + return new DeployPersistenceResult( + command.DeploymentId, instanceName, true, null, sender, isRedeploy); }).ContinueWith(t => { if (t.IsCompletedSuccessfully) return t.Result; return new DeployPersistenceResult( command.DeploymentId, instanceName, false, - t.Exception?.GetBaseException().Message, sender); + t.Exception?.GetBaseException().Message, sender, isRedeploy); }).PipeTo(Self); - - // Reply immediately — deployment is applied (actor is running) - sender.Tell(new DeploymentStatusResponse( - command.DeploymentId, - instanceName, - DeploymentStatus.Success, - null, - DateTimeOffset.UtcNow)); } + /// + /// SiteRuntime-005: reports the deployment outcome to central only after the + /// persistence result is known. On a persistence failure the Instance Actor that was + /// created optimistically is stopped and the deployed-instance counter rolled back, + /// so the in-memory state stays consistent with durable storage, and central is told + /// the deployment . + /// private void HandleDeployPersistenceResult(DeployPersistenceResult result) { - if (!result.Success) + if (result.Success) { - _logger.LogError( - "Failed to persist deployment {DeploymentId} for {Instance}: {Error}", - result.DeploymentId, result.InstanceName, result.Error); + result.OriginalSender.Tell(new DeploymentStatusResponse( + result.DeploymentId, + result.InstanceName, + DeploymentStatus.Success, + null, + DateTimeOffset.UtcNow)); + return; } + + _logger.LogError( + "Failed to persist deployment {DeploymentId} for {Instance}: {Error}", + result.DeploymentId, result.InstanceName, result.Error); + + // Persistence failed — undo the optimistic actor creation and counter bump so + // the site does not advertise an instance it cannot durably recover. + if (_instanceActors.Remove(result.InstanceName, out var orphan)) + Context.Stop(orphan); + if (!result.IsRedeploy) + _totalDeployedCount = Math.Max(0, _totalDeployedCount - 1); + UpdateInstanceCounts(); + + result.OriginalSender.Tell(new DeploymentStatusResponse( + result.DeploymentId, + result.InstanceName, + DeploymentStatus.Failed, + result.Error ?? "Deployment persistence failed", + DateTimeOffset.UtcNow)); } /// @@ -492,10 +541,20 @@ public class DeploymentManagerActor : ReceiveActor, IWithTimers // ── DCL connection management ── - private readonly HashSet _createdConnections = new(); + /// + /// Tracks the configuration last sent to the DCL for each connection name, keyed by + /// a hash of the connection's protocol/endpoints/credentials/failover count + /// (SiteRuntime-010). A name whose hash is unchanged is skipped; a name whose config + /// changed re-issues a CreateConnectionCommand so the DCL adopts the new + /// configuration instead of keeping a stale connection after a redeployment. + /// + private readonly Dictionary _createdConnections = new(); /// - /// Sets up DCL connections from the flattened config (idempotent: tracks created connections). + /// Sets up DCL connections from the flattened config. Idempotent on unchanged + /// configuration, but re-issues the create command when a connection's endpoint, + /// credentials, backup endpoint, or failover retry count has changed since it was + /// last sent (SiteRuntime-010). /// private void EnsureDclConnections(string configJson) { @@ -508,7 +567,8 @@ public class DeploymentManagerActor : ReceiveActor, IWithTimers foreach (var (name, connConfig) in config.Connections) { - if (_createdConnections.Contains(name)) + var configHash = ComputeConnectionConfigHash(connConfig); + if (_createdConnections.TryGetValue(name, out var lastHash) && lastHash == configHash) continue; var primaryDetails = FlattenConnectionConfig(connConfig.Protocol, connConfig.ConfigurationJson); @@ -519,10 +579,11 @@ public class DeploymentManagerActor : ReceiveActor, IWithTimers _dclManager.Tell(new Commons.Messages.DataConnection.CreateConnectionCommand( name, connConfig.Protocol, primaryDetails, backupDetails, connConfig.FailoverRetryCount)); - _createdConnections.Add(name); + var changed = _createdConnections.ContainsKey(name); + _createdConnections[name] = configHash; _logger.LogInformation( - "Created DCL connection {Connection} (protocol={Protocol})", - name, connConfig.Protocol); + "{Action} DCL connection {Connection} (protocol={Protocol})", + changed ? "Updated" : "Created", name, connConfig.Protocol); } } catch (Exception ex) @@ -531,6 +592,26 @@ public class DeploymentManagerActor : ReceiveActor, IWithTimers } } + /// + /// Computes a stable hash over the configuration fields that affect how the DCL + /// connects, so a changed endpoint/credential/backup/failover count is detected + /// (SiteRuntime-010). + /// + private static string ComputeConnectionConfigHash( + Commons.Types.Flattening.ConnectionConfig connConfig) + { + var material = string.Join( + "", + connConfig.Protocol, + connConfig.ConfigurationJson ?? string.Empty, + connConfig.BackupConfigurationJson ?? string.Empty, + connConfig.FailoverRetryCount.ToString()); + + var bytes = System.Security.Cryptography.SHA256.HashData( + System.Text.Encoding.UTF8.GetBytes(material)); + return Convert.ToHexString(bytes); + } + private static IDictionary FlattenConnectionConfig(string protocol, string? json) { if (string.IsNullOrEmpty(json)) @@ -559,25 +640,35 @@ public class DeploymentManagerActor : ReceiveActor, IWithTimers // ── Shared Script Loading ── - private void LoadSharedScriptsFromStorage() + /// + /// SiteRuntime-008: reads and compiles all shared scripts on a background task so the + /// SQLite read and Roslyn compilation never block the singleton's mailbox thread. The + /// result is piped back as a message, carrying the + /// enabled configs to resume staggered Instance Actor creation on the actor thread. + /// + private void LoadSharedScriptsFromStorage(List enabledConfigs) { - try + Task.Run(async () => { - var scripts = _storage.GetAllSharedScriptsAsync().GetAwaiter().GetResult(); + var scripts = await _storage.GetAllSharedScriptsAsync(); var compiled = 0; foreach (var script in scripts) { if (_sharedScriptLibrary.CompileAndRegister(script.Name, script.Code)) compiled++; } - _logger.LogInformation( - "Loaded {Compiled}/{Total} shared scripts from SQLite", - compiled, scripts.Count); - } - catch (Exception ex) + return new SharedScriptsLoaded(enabledConfigs, compiled, scripts.Count); + }).ContinueWith(t => { - _logger.LogError(ex, "Failed to load shared scripts from SQLite"); - } + if (t.IsCompletedSuccessfully) + return t.Result; + _logger.LogError( + t.Exception?.GetBaseException(), "Failed to load shared scripts from SQLite"); + // A shared-script load failure must not abandon startup — proceed with + // Instance Actor creation; scripts that need a missing shared script fail + // at execution time and are recorded to the site event log. + return new SharedScriptsLoaded(enabledConfigs, 0, 0); + }).PipeTo(Self); } // ── Debug View routing ── @@ -891,12 +982,22 @@ public class DeploymentManagerActor : ReceiveActor, IWithTimers // ── Internal messages ── internal record StartupConfigsLoaded(List Configs, string? Error); + + /// + /// Internal message piped back once shared scripts have been compiled off-thread + /// (SiteRuntime-008). Carries the enabled configs so staggered Instance Actor + /// creation resumes on the actor thread. + /// + internal record SharedScriptsLoaded( + List EnabledConfigs, int CompiledCount, int TotalCount); + internal record StartNextBatch(BatchState State); internal record BatchState(List Configs, int NextIndex); internal record EnableResult( EnableInstanceCommand Command, DeployedInstance? Config, string? Error, IActorRef OriginalSender); internal record DeployPersistenceResult( - string DeploymentId, string InstanceName, bool Success, string? Error, IActorRef OriginalSender); + string DeploymentId, string InstanceName, bool Success, string? Error, + IActorRef OriginalSender, bool IsRedeploy); /// /// A redeployment command buffered until the previous Instance Actor terminates. diff --git a/src/ScadaLink.SiteRuntime/Actors/ScriptActor.cs b/src/ScadaLink.SiteRuntime/Actors/ScriptActor.cs index 6c49b41..13017fa 100644 --- a/src/ScadaLink.SiteRuntime/Actors/ScriptActor.cs +++ b/src/ScadaLink.SiteRuntime/Actors/ScriptActor.cs @@ -286,9 +286,10 @@ public class ScriptActor : ReceiveActor, IWithTimers { var executionId = $"{_scriptName}-exec-{_executionCounter++}"; - // NOTE: In production, configure a dedicated blocking I/O dispatcher via HOCON: - // akka.actor.script-execution-dispatcher { type = PinnedDispatcher } - // and chain .WithDispatcher("akka.actor.script-execution-dispatcher") below. + // SiteRuntime-009: the actor's mailbox stays on the default dispatcher, but the + // script body itself runs on the dedicated ScriptExecutionScheduler (a bounded + // set of dedicated threads), so blocking script I/O is contained there and + // cannot starve the shared .NET thread pool. var props = Props.Create(() => new ScriptExecutionActor( _scriptName, _instanceName, diff --git a/src/ScadaLink.SiteRuntime/Actors/ScriptExecutionActor.cs b/src/ScadaLink.SiteRuntime/Actors/ScriptExecutionActor.cs index 0038252..3b53eba 100644 --- a/src/ScadaLink.SiteRuntime/Actors/ScriptExecutionActor.cs +++ b/src/ScadaLink.SiteRuntime/Actors/ScriptExecutionActor.cs @@ -68,8 +68,13 @@ public class ScriptExecutionActor : ReceiveActor { var timeout = TimeSpan.FromSeconds(options.ScriptExecutionTimeoutSeconds); + // SiteRuntime-009: run the script body on the dedicated script-execution + // scheduler, not the shared .NET thread pool, so blocking script I/O cannot + // starve the global pool and stall Akka dispatchers / HTTP handling. + var scheduler = ScriptExecutionScheduler.Shared(options); + // CTS must be created inside the async lambda so it outlives this method - _ = Task.Run(async () => + _ = Task.Factory.StartNew(async () => { IServiceScope? serviceScope = null; // ISiteEventLogger is a singleton; resolve from the root provider so @@ -164,6 +169,6 @@ public class ScriptExecutionActor : ReceiveActor // Stop self after execution completes self.Tell(PoisonPill.Instance); } - }); + }, CancellationToken.None, TaskCreationOptions.DenyChildAttach, scheduler).Unwrap(); } } diff --git a/src/ScadaLink.SiteRuntime/Persistence/SiteStorageService.cs b/src/ScadaLink.SiteRuntime/Persistence/SiteStorageService.cs index ac18b33..e4a44ad 100644 --- a/src/ScadaLink.SiteRuntime/Persistence/SiteStorageService.cs +++ b/src/ScadaLink.SiteRuntime/Persistence/SiteStorageService.cs @@ -19,6 +19,14 @@ public class SiteStorageService _logger = logger; } + /// + /// Creates a new (unopened) SQLite connection against the site database. + /// Exposed so site-local repositories can open their own connections without + /// reaching into private state via reflection (SiteRuntime-006). The caller owns + /// the connection and is responsible for opening and disposing it. + /// + public SqliteConnection CreateConnection() => new(_connectionString); + /// /// Creates the SQLite tables if they do not exist. /// Called once on site startup. diff --git a/src/ScadaLink.SiteRuntime/Repositories/SiteExternalSystemRepository.cs b/src/ScadaLink.SiteRuntime/Repositories/SiteExternalSystemRepository.cs index 95621f2..838238d 100644 --- a/src/ScadaLink.SiteRuntime/Repositories/SiteExternalSystemRepository.cs +++ b/src/ScadaLink.SiteRuntime/Repositories/SiteExternalSystemRepository.cs @@ -169,27 +169,12 @@ public class SiteExternalSystemRepository : IExternalSystemRepository // ── Private helpers ── /// - /// Creates a new SQLite connection using the same connection string as . - /// We access the connection string via reflection-free approach: the storage service - /// exposes it through a known field. Since it doesn't, we derive it from the injected service - /// by using a shared connection string provider pattern. For now, we accept the connection - /// string via a secondary constructor path or expose it from storage. - /// - /// Implementation note: We use the SiteStorageService's internal connection string. - /// This field is accessed via a package-internal helper since SiteStorageService - /// doesn't expose it directly. As a pragmatic solution, we pass the connection string - /// separately at DI registration time. + /// Creates a new SQLite connection against the site database via + /// (SiteRuntime-006). The + /// connection string is owned by ; the repository + /// no longer reaches into its private state via reflection. /// - private SqliteConnection CreateConnection() - { - // Access the connection string from SiteStorageService via its internal field. - // This uses reflection as a pragmatic choice — the alternative is modifying - // SiteStorageService to expose the connection string, which is out of scope. - var field = typeof(SiteStorageService).GetField("_connectionString", - System.Reflection.BindingFlags.NonPublic | System.Reflection.BindingFlags.Instance); - var connectionString = (string)field!.GetValue(_storage)!; - return new SqliteConnection(connectionString); - } + private SqliteConnection CreateConnection() => _storage.CreateConnection(); private static ExternalSystemDefinition MapExternalSystem(SqliteDataReader reader) { @@ -233,12 +218,13 @@ public class SiteExternalSystemRepository : IExternalSystemRepository } /// - /// Generates a stable positive integer ID from a string name. - /// Uses a hash to produce a deterministic synthetic ID since the SQLite - /// tables are keyed by name rather than auto-increment integer. + /// Generates a stable positive integer ID from a string name (SiteRuntime-007). + /// Uses a deterministic FNV-1a hash rather than , + /// which is randomized per process on .NET Core and would therefore change every + /// time the process restarts — breaking any caller that stored an ID and later + /// looks the entity up by that ID. /// - private static int GenerateSyntheticId(string name) - => name.GetHashCode() & 0x7FFFFFFF; + private static int GenerateSyntheticId(string name) => SyntheticId.From(name); /// /// DTO for deserializing individual method entries from the method_definitions JSON column. diff --git a/src/ScadaLink.SiteRuntime/Repositories/SiteNotificationRepository.cs b/src/ScadaLink.SiteRuntime/Repositories/SiteNotificationRepository.cs index 0023cf3..3778aac 100644 --- a/src/ScadaLink.SiteRuntime/Repositories/SiteNotificationRepository.cs +++ b/src/ScadaLink.SiteRuntime/Repositories/SiteNotificationRepository.cs @@ -178,13 +178,12 @@ public class SiteNotificationRepository : INotificationRepository // ── Private helpers ── - private SqliteConnection CreateConnection() - { - var field = typeof(SiteStorageService).GetField("_connectionString", - System.Reflection.BindingFlags.NonPublic | System.Reflection.BindingFlags.Instance); - var connectionString = (string)field!.GetValue(_storage)!; - return new SqliteConnection(connectionString); - } + /// + /// Creates a new SQLite connection against the site database via + /// (SiteRuntime-006) instead of + /// reaching into its private connection-string field via reflection. + /// + private SqliteConnection CreateConnection() => _storage.CreateConnection(); private static NotificationList MapNotificationList(SqliteDataReader reader) { @@ -246,10 +245,9 @@ public class SiteNotificationRepository : INotificationRepository } /// - /// Generates a stable positive integer ID from a string name. - /// Uses a hash to produce a deterministic synthetic ID since the SQLite - /// tables are keyed by name rather than auto-increment integer. + /// Generates a stable positive integer ID from a string name (SiteRuntime-007). + /// Uses a deterministic FNV-1a hash rather than , + /// which is randomized per process on .NET Core and would change every restart. /// - private static int GenerateSyntheticId(string name) - => name.GetHashCode() & 0x7FFFFFFF; + private static int GenerateSyntheticId(string name) => SyntheticId.From(name); } diff --git a/src/ScadaLink.SiteRuntime/Repositories/SyntheticId.cs b/src/ScadaLink.SiteRuntime/Repositories/SyntheticId.cs new file mode 100644 index 0000000..5c4dee5 --- /dev/null +++ b/src/ScadaLink.SiteRuntime/Repositories/SyntheticId.cs @@ -0,0 +1,35 @@ +namespace ScadaLink.SiteRuntime.Repositories; + +/// +/// SiteRuntime-007: deterministic synthetic-ID generation for site-local artifacts. +/// +/// The site SQLite tables are keyed by name rather than an auto-increment integer, but +/// the shared repository contracts (IExternalSystemRepository, +/// INotificationRepository) expose integer-keyed lookups. A synthetic integer ID +/// is therefore derived from the entity name. It MUST be stable across process restarts +/// — is randomized per process on .NET Core and so +/// cannot be used. +/// +internal static class SyntheticId +{ + // FNV-1a 32-bit constants. + private const uint FnvOffsetBasis = 2166136261; + private const uint FnvPrime = 16777619; + + /// + /// Computes a deterministic, process-stable positive 31-bit integer ID for the + /// given name using the FNV-1a hash over its UTF-8 bytes. + /// + public static int From(string name) + { + var hash = FnvOffsetBasis; + foreach (var b in System.Text.Encoding.UTF8.GetBytes(name)) + { + hash ^= b; + hash *= FnvPrime; + } + + // Mask to a positive 31-bit value so the ID is always non-negative. + return (int)(hash & 0x7FFFFFFF); + } +} diff --git a/src/ScadaLink.SiteRuntime/Scripts/ScriptCompilationService.cs b/src/ScadaLink.SiteRuntime/Scripts/ScriptCompilationService.cs index 1056d34..188ae8b 100644 --- a/src/ScadaLink.SiteRuntime/Scripts/ScriptCompilationService.cs +++ b/src/ScadaLink.SiteRuntime/Scripts/ScriptCompilationService.cs @@ -1,6 +1,7 @@ using Microsoft.CodeAnalysis; using Microsoft.CodeAnalysis.CSharp; using Microsoft.CodeAnalysis.CSharp.Scripting; +using Microsoft.CodeAnalysis.CSharp.Syntax; using Microsoft.CodeAnalysis.Scripting; using Microsoft.Extensions.Logging; using ScadaLink.Commons.Types; @@ -17,7 +18,10 @@ public class ScriptCompilationService private readonly ILogger _logger; /// - /// Namespaces that are forbidden in user scripts for security. + /// Forbidden API roots. Each entry is matched as a prefix against both the resolved + /// symbol's containing namespace and its fully-qualified containing type name, so an + /// entry may name a whole namespace ("System.IO") or a single type + /// ("System.Diagnostics.Process"). /// private static readonly string[] ForbiddenNamespaces = [ @@ -30,8 +34,8 @@ public class ScriptCompilationService ]; /// - /// Specific types/members allowed even within forbidden namespaces. - /// async/await is OK despite System.Threading being blocked. + /// Specific namespaces/types allowed even though they sit under a forbidden root. + /// async/await and cancellation tokens are OK despite System.Threading being blocked. /// private static readonly string[] AllowedExceptions = [ @@ -46,58 +50,184 @@ public class ScriptCompilationService } /// - /// Validates that the script source code does not reference forbidden APIs. + /// SiteRuntime-011: validates that the script does not reference forbidden APIs. + /// + /// Validation is performed with Roslyn semantic analysis rather than a raw substring + /// scan of the source text. The script is parsed and a semantic model is built; every + /// identifier, type reference, member access, and object creation is resolved to its + /// symbol and the symbol's containing namespace is checked against the forbidden list. + /// + /// This is reliable in both directions a textual scan was not: + /// - it catches forbidden types regardless of how they are written (global:: + /// prefixes, aliases, transitively-imported namespaces) because it inspects the + /// resolved symbol, not the spelling; + /// - it does not raise false positives for the namespace string appearing in a + /// comment, a string literal, or an unrelated identifier. + /// /// Returns a list of violation messages, empty if clean. /// public IReadOnlyList ValidateTrustModel(string code) { - var violations = new List(); - var tree = CSharpSyntaxTree.ParseText(code); + var tree = CSharpSyntaxTree.ParseText( + code, new CSharpParseOptions(kind: SourceCodeKind.Script)); + + var compilation = CSharpCompilation.CreateScriptCompilation( + "TrustValidation", + tree, + ScriptReferences, + new CSharpCompilationOptions(OutputKind.DynamicallyLinkedLibrary)); + + var model = compilation.GetSemanticModel(tree); var root = tree.GetRoot(); - var text = root.ToFullString(); - foreach (var ns in ForbiddenNamespaces) + // Deduplicate so a forbidden symbol used many times is reported once but + // distinct forbidden symbols are all reported. + var violations = new SortedSet(StringComparer.Ordinal); + + foreach (var node in root.DescendantNodes()) { - if (text.Contains(ns, StringComparison.Ordinal)) + // Only inspect nodes that name a type or member; skip declarations, + // string literals and comments entirely. Member-access and qualified-name + // parents are evaluated as a whole, so their nested name parts are skipped. + if (node is not (SimpleNameSyntax or MemberAccessExpressionSyntax + or QualifiedNameSyntax or ObjectCreationExpressionSyntax)) { - // Check if it matches an allowed exception - var isAllowed = AllowedExceptions.Any(allowed => - text.Contains(allowed, StringComparison.Ordinal) && - ns != allowed && - allowed.StartsWith(ns, StringComparison.Ordinal)); - - // More precise: check each occurrence - var idx = 0; - while ((idx = text.IndexOf(ns, idx, StringComparison.Ordinal)) >= 0) - { - var remainder = text.Substring(idx); - var matchesAllowed = AllowedExceptions.Any(a => - remainder.StartsWith(a, StringComparison.Ordinal)); - - if (!matchesAllowed) - { - violations.Add($"Forbidden API reference: '{ns}' at position {idx}"); - break; - } - idx += ns.Length; - } + continue; } + + var info = model.GetSymbolInfo(node); + var symbol = info.Symbol ?? info.CandidateSymbols.FirstOrDefault(); + + // The set of fully-qualified scopes this reference touches: the resolved + // symbol's containing namespace and type, or — when the symbol could not + // be resolved (a type from an unreferenced assembly) — the syntactic + // fully-qualified name written in source as a safe fallback. + var scopes = symbol != null + ? GetSymbolScopes(symbol) + : GetSyntacticScopes(node); + if (scopes.Count == 0) + continue; + + var forbidden = ForbiddenNamespaces.FirstOrDefault( + f => scopes.Any(s => IsUnderScope(s, f))); + if (forbidden == null) + continue; + + // Allow specific exception namespaces/types (async/await, cancellation). + if (scopes.Any(s => AllowedExceptions.Any(a => IsUnderScope(s, a)))) + continue; + + var name = symbol?.Name ?? node.ToString(); + violations.Add($"Forbidden API reference: '{forbidden}' ({scopes[0]}.{name})"); } - return violations; + return violations.ToList(); } + /// + /// Returns the fully-qualified scopes a resolved symbol belongs to — its containing + /// namespace and, for a type or member, the fully-qualified containing type. A bare + /// namespace symbol is intentionally ignored: a namespace name on its own performs + /// no action; harm requires referencing a type or a member. + /// + private static List GetSymbolScopes(ISymbol symbol) + { + var scopes = new List(); + + switch (symbol) + { + case INamespaceSymbol: + // A namespace reference alone is harmless — skip it. (This avoids a + // false positive on the "System.Threading" qualifier of the allowed + // "System.Threading.Tasks.Task".) + break; + case ITypeSymbol typeSymbol: + scopes.Add(typeSymbol.ToDisplayString()); + if (typeSymbol.ContainingNamespace is { IsGlobalNamespace: false } typeNs) + scopes.Add(typeNs.ToDisplayString()); + break; + default: + if (symbol.ContainingType != null) + { + scopes.Add(symbol.ContainingType.ToDisplayString()); + if (symbol.ContainingType.ContainingNamespace is { IsGlobalNamespace: false } memberNs) + scopes.Add(memberNs.ToDisplayString()); + } + else if (symbol.ContainingNamespace is { IsGlobalNamespace: false } ns) + { + scopes.Add(ns.ToDisplayString()); + } + break; + } + + return scopes; + } + + /// + /// Fallback used when a name could not be resolved to a symbol (e.g. a type from an + /// assembly the script is not allowed to reference). The fully-qualified name as + /// written in source is used directly — a script that names + /// System.Net.Http.HttpClient is still rejected even though that assembly is + /// deliberately absent from the script's metadata references. + /// + private static List GetSyntacticScopes(SyntaxNode node) + { + // A dotted name written in source is itself the fully-qualified scope. Only + // consider names that actually contain a dot — bare local identifiers cannot + // reach a forbidden namespace. + var text = node switch + { + QualifiedNameSyntax q => q.ToString(), + MemberAccessExpressionSyntax m => m.ToString(), + _ => string.Empty + }; + + // Strip whitespace/newlines that a multi-line member-access chain may contain. + text = new string(text.Where(c => !char.IsWhiteSpace(c)).ToArray()); + + return string.IsNullOrEmpty(text) || !text.Contains('.') + ? [] + : [text]; + } + + /// + /// True if is exactly, or nested within, + /// (e.g. "System.IO.Compression" is under "System.IO", + /// "System.Diagnostics.Process" is under "System.Diagnostics.Process"). + /// + private static bool IsUnderScope(string actual, string root) + => actual.Equals(root, StringComparison.Ordinal) + || actual.StartsWith(root + ".", StringComparison.Ordinal); + + /// + /// Assemblies referenced by compiled scripts. Shared between the Roslyn scripting + /// options and the semantic-analysis compilation built for trust validation + /// (SiteRuntime-011), so the validator resolves symbols against exactly the same + /// metadata the script is compiled against. + /// + private static readonly System.Reflection.Assembly[] ScriptAssemblies = + [ + typeof(object).Assembly, + typeof(Enumerable).Assembly, + typeof(Math).Assembly, + typeof(Microsoft.CSharp.RuntimeBinder.CSharpArgumentInfo).Assembly, + typeof(Commons.Types.DynamicJsonElement).Assembly + ]; + + /// + /// Metadata references for the trust-validation semantic compilation. + /// + private static readonly MetadataReference[] ScriptReferences = + ScriptAssemblies + .Select(a => (MetadataReference)MetadataReference.CreateFromFile(a.Location)) + .ToArray(); + /// /// Shared Roslyn scripting options (references + imports) used by both full /// script compilation and trigger-expression compilation. /// private static ScriptOptions BuildScriptOptions() => ScriptOptions.Default - .WithReferences( - typeof(object).Assembly, - typeof(Enumerable).Assembly, - typeof(Math).Assembly, - typeof(Microsoft.CSharp.RuntimeBinder.CSharpArgumentInfo).Assembly, - typeof(Commons.Types.DynamicJsonElement).Assembly) + .WithReferences(ScriptAssemblies) .WithImports( "System", "System.Collections.Generic", diff --git a/src/ScadaLink.SiteRuntime/Scripts/ScriptExecutionScheduler.cs b/src/ScadaLink.SiteRuntime/Scripts/ScriptExecutionScheduler.cs new file mode 100644 index 0000000..91450cd --- /dev/null +++ b/src/ScadaLink.SiteRuntime/Scripts/ScriptExecutionScheduler.cs @@ -0,0 +1,107 @@ +using System.Collections.Concurrent; + +namespace ScadaLink.SiteRuntime.Scripts; + +/// +/// SiteRuntime-009: a dedicated, bounded for running script +/// and alarm on-trigger bodies. +/// +/// Script bodies may perform synchronous blocking I/O (a database connection, a +/// synchronous external-system call). Running them on the shared .NET +/// lets a burst of blocking scripts starve the pool and stall +/// unrelated Akka dispatchers and HTTP request handling. This scheduler owns a fixed set +/// of dedicated threads, so script blocking is contained to those threads and cannot +/// exhaust the global pool. +/// +/// The scheduler is process-wide (one set of threads for all instances) and is sized +/// from the first time it is configured. +/// +public sealed class ScriptExecutionScheduler : TaskScheduler, IDisposable +{ + private readonly BlockingCollection _queue = new(); + private readonly List _threads; + private int _disposed; + + private static volatile ScriptExecutionScheduler? _shared; + private static readonly object SharedLock = new(); + + /// + /// The process-wide script-execution scheduler. Lazily created on first use with the + /// thread count from ; the + /// first caller wins, subsequent calls reuse the existing instance. + /// + public static ScriptExecutionScheduler Shared(SiteRuntimeOptions options) + { + if (_shared != null) + return _shared; + + lock (SharedLock) + { + return _shared ??= new ScriptExecutionScheduler(options.ScriptExecutionThreadCount); + } + } + + /// + /// Creates a scheduler backed by dedicated threads. + /// + public ScriptExecutionScheduler(int threadCount) + { + if (threadCount < 1) + threadCount = 1; + + _threads = new List(threadCount); + for (var i = 0; i < threadCount; i++) + { + var thread = new Thread(WorkerLoop) + { + IsBackground = true, + Name = $"script-execution-{i}" + }; + _threads.Add(thread); + thread.Start(); + } + } + + /// The number of dedicated worker threads. + public override int MaximumConcurrencyLevel => _threads.Count; + + private void WorkerLoop() + { + try + { + foreach (var task in _queue.GetConsumingEnumerable()) + { + TryExecuteTask(task); + } + } + catch (ObjectDisposedException) + { + // Scheduler disposed — worker exits. + } + } + + protected override void QueueTask(Task task) => _queue.Add(task); + + protected override bool TryExecuteTaskInline(Task task, bool taskWasPreviouslyQueued) + { + // Only inline if we are already on one of this scheduler's worker threads, + // so script work never escapes onto a thread-pool thread. + if (Thread.CurrentThread.Name?.StartsWith("script-execution-", StringComparison.Ordinal) != true) + return false; + + return TryExecuteTask(task); + } + + protected override IEnumerable GetScheduledTasks() => _queue.ToArray(); + + public void Dispose() + { + if (Interlocked.Exchange(ref _disposed, 1) != 0) + return; + + _queue.CompleteAdding(); + foreach (var thread in _threads) + thread.Join(TimeSpan.FromSeconds(5)); + _queue.Dispose(); + } +} diff --git a/src/ScadaLink.SiteRuntime/SiteRuntimeOptions.cs b/src/ScadaLink.SiteRuntime/SiteRuntimeOptions.cs index f84f9a8..ae1a886 100644 --- a/src/ScadaLink.SiteRuntime/SiteRuntimeOptions.cs +++ b/src/ScadaLink.SiteRuntime/SiteRuntimeOptions.cs @@ -36,4 +36,12 @@ public class SiteRuntimeOptions /// Default: 1000. /// public int StreamBufferSize { get; set; } = 1000; + + /// + /// SiteRuntime-009: number of dedicated threads in the script-execution scheduler. + /// Script and alarm on-trigger bodies run on these threads instead of the shared + /// .NET thread pool, so blocking script I/O cannot starve the global pool. + /// Default: 8. + /// + public int ScriptExecutionThreadCount { get; set; } = 8; } diff --git a/tests/ScadaLink.SiteRuntime.Tests/Actors/DeploymentManagerMediumFindingsTests.cs b/tests/ScadaLink.SiteRuntime.Tests/Actors/DeploymentManagerMediumFindingsTests.cs new file mode 100644 index 0000000..a1c7b83 --- /dev/null +++ b/tests/ScadaLink.SiteRuntime.Tests/Actors/DeploymentManagerMediumFindingsTests.cs @@ -0,0 +1,248 @@ +using Akka.Actor; +using Akka.TestKit.Xunit2; +using Microsoft.Extensions.Logging.Abstractions; +using ScadaLink.Commons.Messages.Deployment; +using ScadaLink.Commons.Types.Enums; +using ScadaLink.Commons.Types.Flattening; +using ScadaLink.SiteRuntime.Actors; +using ScadaLink.SiteRuntime.Persistence; +using ScadaLink.SiteRuntime.Scripts; +using System.Text.Json; + +namespace ScadaLink.SiteRuntime.Tests.Actors; + +/// +/// Regression tests for the Medium-severity DeploymentManagerActor findings: +/// SiteRuntime-005 (Success reported before persistence completes) and +/// SiteRuntime-008 (blocking shared-script load on the actor thread). +/// +public class DeploymentManagerMediumFindingsTests : TestKit, IDisposable +{ + private readonly ScriptCompilationService _compilationService; + private readonly SharedScriptLibrary _sharedScriptLibrary; + private readonly string _dbFile; + + public DeploymentManagerMediumFindingsTests() + { + _dbFile = Path.Combine(Path.GetTempPath(), $"dm-medium-test-{Guid.NewGuid():N}.db"); + _compilationService = new ScriptCompilationService( + NullLogger.Instance); + _sharedScriptLibrary = new SharedScriptLibrary( + _compilationService, NullLogger.Instance); + } + + void IDisposable.Dispose() + { + Shutdown(); + try { File.Delete(_dbFile); } catch { /* cleanup */ } + } + + private SiteStorageService NewStorage(string connectionString) + => new(connectionString, NullLogger.Instance); + + private IActorRef CreateDeploymentManager(SiteStorageService storage, IActorRef? dclManager = null) + { + return ActorOf(Props.Create(() => new DeploymentManagerActor( + storage, + _compilationService, + _sharedScriptLibrary, + null, + new SiteRuntimeOptions(), + NullLogger.Instance, + dclManager, + null, + null, + null))); + } + + private static string MakeConfigJsonWithConnection( + string instanceName, string endpoint, int failoverRetryCount) + { + var config = new FlattenedConfiguration + { + InstanceUniqueName = instanceName, + Attributes = + [ + new ResolvedAttribute { CanonicalName = "TestAttr", Value = "1", DataType = "Int32" } + ], + Connections = new Dictionary + { + ["Conn1"] = new ConnectionConfig + { + Protocol = "Custom", + ConfigurationJson = $"{{\"endpoint\":\"{endpoint}\"}}", + FailoverRetryCount = failoverRetryCount + } + } + }; + return JsonSerializer.Serialize(config); + } + + private static string MakeConfigJson(string instanceName) + { + var config = new FlattenedConfiguration + { + InstanceUniqueName = instanceName, + Attributes = + [ + new ResolvedAttribute { CanonicalName = "TestAttr", Value = "1", DataType = "Int32" } + ] + }; + return JsonSerializer.Serialize(config); + } + + /// + /// SiteRuntime-005: when SQLite persistence of the deployed config fails, the + /// Deployment Manager must report to central, + /// not . Reporting Success on a persistence + /// failure silently loses the deployment on the next restart/failover. + /// + [Fact] + public async Task Deploy_PersistenceFailure_ReportsFailedNotSuccess() + { + // A connection string pointing at an unwritable path makes every storage + // write throw, so StoreDeployedConfigAsync fails. + var badPath = Path.Combine( + Path.GetTempPath(), $"no-such-dir-{Guid.NewGuid():N}", "site.db"); + var storage = NewStorage($"Data Source={badPath}"); + + var actor = CreateDeploymentManager(storage); + await Task.Delay(500); // empty startup + + actor.Tell(new DeployInstanceCommand( + "dep-fail", "FailPump", "h1", MakeConfigJson("FailPump"), "admin", DateTimeOffset.UtcNow)); + + var response = ExpectMsg(TimeSpan.FromSeconds(10)); + Assert.Equal("FailPump", response.InstanceUniqueName); + Assert.Equal(DeploymentStatus.Failed, response.Status); + Assert.False(string.IsNullOrEmpty(response.ErrorMessage)); + } + + /// + /// SiteRuntime-005: a successful deployment must still report + /// , and only after the config row is + /// committed to SQLite (so a restart re-creates the instance). + /// + [Fact] + public async Task Deploy_Success_ReportsSuccessAndPersistsConfig() + { + var storage = NewStorage($"Data Source={_dbFile}"); + await storage.InitializeAsync(); + + var actor = CreateDeploymentManager(storage); + await Task.Delay(500); + + actor.Tell(new DeployInstanceCommand( + "dep-ok", "OkPump", "h1", MakeConfigJson("OkPump"), "admin", DateTimeOffset.UtcNow)); + + var response = ExpectMsg(TimeSpan.FromSeconds(10)); + Assert.Equal(DeploymentStatus.Success, response.Status); + + // By the time Success is reported, the config must be durable. + var configs = await storage.GetAllDeployedConfigsAsync(); + Assert.Contains(configs, c => c.InstanceUniqueName == "OkPump"); + } + + /// + /// SiteRuntime-010: when a redeployment changes a connection's configuration + /// (here the failover retry count and endpoint), the Deployment Manager must + /// re-issue a + /// so the DCL adopts the new configuration rather than keeping the stale one. + /// + [Fact] + public async Task EnsureDclConnections_ConnectionConfigChanged_ReissuesCreateCommand() + { + var storage = NewStorage($"Data Source={_dbFile}"); + await storage.InitializeAsync(); + + var dcl = CreateTestProbe(); + var actor = CreateDeploymentManager(storage, dcl.Ref); + await Task.Delay(500); + + // Initial deploy with one connection. + actor.Tell(new DeployInstanceCommand( + "dep-c1", "ConnPump", "h1", + MakeConfigJsonWithConnection("ConnPump", "opc.tcp://host-a:4840", 3), + "admin", DateTimeOffset.UtcNow)); + var firstCreate = dcl.ExpectMsg( + TimeSpan.FromSeconds(5)); + Assert.Equal("Conn1", firstCreate.ConnectionName); + Assert.Equal(3, firstCreate.FailoverRetryCount); + ExpectMsg(TimeSpan.FromSeconds(5)); + await Task.Delay(500); + + // Redeploy with a CHANGED connection configuration. + actor.Tell(new DeployInstanceCommand( + "dep-c2", "ConnPump", "h2", + MakeConfigJsonWithConnection("ConnPump", "opc.tcp://host-b:4840", 7), + "admin", DateTimeOffset.UtcNow)); + + // The DCL must receive a fresh create command reflecting the new config. + var secondCreate = dcl.ExpectMsg( + TimeSpan.FromSeconds(10)); + Assert.Equal("Conn1", secondCreate.ConnectionName); + Assert.Equal(7, secondCreate.FailoverRetryCount); + } + + /// + /// SiteRuntime-010: an unchanged connection configuration must still be skipped — + /// re-sending an identical create command on every deploy is wasteful. + /// + [Fact] + public async Task EnsureDclConnections_UnchangedConfig_DoesNotReissueCreateCommand() + { + var storage = NewStorage($"Data Source={_dbFile}"); + await storage.InitializeAsync(); + + var dcl = CreateTestProbe(); + var actor = CreateDeploymentManager(storage, dcl.Ref); + await Task.Delay(500); + + var json = MakeConfigJsonWithConnection("StablePump", "opc.tcp://host-a:4840", 3); + actor.Tell(new DeployInstanceCommand( + "dep-s1", "StablePump", "h1", json, "admin", DateTimeOffset.UtcNow)); + dcl.ExpectMsg( + TimeSpan.FromSeconds(5)); + ExpectMsg(TimeSpan.FromSeconds(5)); + await Task.Delay(500); + + // Redeploy with the IDENTICAL connection configuration. + actor.Tell(new DeployInstanceCommand( + "dep-s2", "StablePump", "h2", json, "admin", DateTimeOffset.UtcNow)); + ExpectMsg(TimeSpan.FromSeconds(10)); + + // No further create command for an unchanged connection. + dcl.ExpectNoMsg(TimeSpan.FromMilliseconds(800)); + } + + /// + /// SiteRuntime-008: startup must not block the Deployment Manager mailbox on a + /// synchronous shared-script load. With shared scripts present, the actor must + /// still load deployed configs, create Instance Actors, and remain responsive. + /// + [Fact] + public async Task Startup_WithSharedScripts_LoadsConfigsAndStaysResponsive() + { + var storage = NewStorage($"Data Source={_dbFile}"); + await storage.InitializeAsync(); + + // Several shared scripts to compile during startup. + for (var i = 0; i < 5; i++) + { + await storage.StoreSharedScriptAsync( + $"Shared{i}", "return 1 + 1;", null, null); + } + + await storage.StoreDeployedConfigAsync( + "StartupPump", MakeConfigJson("StartupPump"), "d1", "h1", true); + + var actor = CreateDeploymentManager(storage); + await Task.Delay(2000); + + // The instance loaded at startup must be operable — proves startup completed + // and the actor processed messages after the shared-script load. + actor.Tell(new DeploymentStateQueryRequest("corr-1", "StartupPump", DateTimeOffset.UtcNow)); + var response = ExpectMsg(TimeSpan.FromSeconds(5)); + Assert.True(response.IsDeployed); + } +} diff --git a/tests/ScadaLink.SiteRuntime.Tests/Repositories/SiteRepositoryTests.cs b/tests/ScadaLink.SiteRuntime.Tests/Repositories/SiteRepositoryTests.cs new file mode 100644 index 0000000..72125ef --- /dev/null +++ b/tests/ScadaLink.SiteRuntime.Tests/Repositories/SiteRepositoryTests.cs @@ -0,0 +1,108 @@ +using Microsoft.Extensions.Logging.Abstractions; +using ScadaLink.SiteRuntime.Persistence; +using ScadaLink.SiteRuntime.Repositories; + +namespace ScadaLink.SiteRuntime.Tests.Repositories; + +/// +/// SiteRuntime-006 / SiteRuntime-007 regression tests for the site-local repositories. +/// +/// SiteRuntime-006: the repositories must obtain a SQLite connection through +/// , not by reading a private field +/// via reflection. +/// +/// SiteRuntime-007: the synthetic integer IDs derived from entity names must be stable +/// across process restarts (a freshly-constructed service/repository), so an ID handed +/// to a caller still resolves the same entity later. +/// +public class SiteRepositoryTests : IDisposable +{ + private readonly string _dbFile; + + public SiteRepositoryTests() + { + _dbFile = Path.Combine(Path.GetTempPath(), $"site-repo-test-{Guid.NewGuid():N}.db"); + } + + public void Dispose() + { + try { File.Delete(_dbFile); } catch { /* cleanup */ } + GC.SuppressFinalize(this); + } + + private SiteStorageService NewStorage() + => new($"Data Source={_dbFile}", NullLogger.Instance); + + /// + /// SiteRuntime-006: an external system stored via + /// can be read back through the repository — proving the repository's connection + /// (now obtained from ) is valid. + /// + [Fact] + public async Task ExternalSystemRepository_RoundTripsStoredDefinition() + { + var storage = NewStorage(); + await storage.InitializeAsync(); + await storage.StoreExternalSystemAsync( + "WeatherApi", "https://api.example.com", "ApiKey", "{\"key\":\"x\"}", null); + + var repo = new SiteExternalSystemRepository(storage); + var all = await repo.GetAllExternalSystemsAsync(); + + Assert.Single(all); + Assert.Equal("WeatherApi", all[0].Name); + Assert.Equal("https://api.example.com", all[0].EndpointUrl); + } + + /// + /// SiteRuntime-007: the synthetic ID for an external system must be identical when + /// the storage service and repository are re-created (simulating a process restart). + /// With the old the ID was randomized per process + /// and a by-ID lookup after a restart would fail. + /// + [Fact] + public async Task ExternalSystemRepository_SyntheticId_IsStableAcrossRestart() + { + var storage1 = NewStorage(); + await storage1.InitializeAsync(); + await storage1.StoreExternalSystemAsync( + "StableSystem", "https://x", "None", null, null); + + var repo1 = new SiteExternalSystemRepository(storage1); + var idBeforeRestart = (await repo1.GetAllExternalSystemsAsync())[0].Id; + + // Simulate a process restart — brand-new service + repository instances. + var storage2 = NewStorage(); + var repo2 = new SiteExternalSystemRepository(storage2); + var idAfterRestart = (await repo2.GetAllExternalSystemsAsync())[0].Id; + + Assert.Equal(idBeforeRestart, idAfterRestart); + + // And the by-ID lookup must succeed using the pre-restart ID. + var found = await repo2.GetExternalSystemByIdAsync(idBeforeRestart); + Assert.NotNull(found); + Assert.Equal("StableSystem", found.Name); + } + + /// + /// SiteRuntime-007: the same stability guarantee for notification lists. + /// + [Fact] + public async Task NotificationRepository_SyntheticId_IsStableAcrossRestart() + { + var storage1 = NewStorage(); + await storage1.InitializeAsync(); + await storage1.StoreNotificationListAsync( + "OnCall", new[] { "a@example.com", "b@example.com" }); + + var repo1 = new SiteNotificationRepository(storage1); + var idBeforeRestart = (await repo1.GetAllNotificationListsAsync())[0].Id; + + var storage2 = NewStorage(); + var repo2 = new SiteNotificationRepository(storage2); + var found = await repo2.GetNotificationListByIdAsync(idBeforeRestart); + + Assert.NotNull(found); + Assert.Equal("OnCall", found.Name); + } +} diff --git a/tests/ScadaLink.SiteRuntime.Tests/Scripts/ScriptExecutionSchedulerTests.cs b/tests/ScadaLink.SiteRuntime.Tests/Scripts/ScriptExecutionSchedulerTests.cs new file mode 100644 index 0000000..e01896b --- /dev/null +++ b/tests/ScadaLink.SiteRuntime.Tests/Scripts/ScriptExecutionSchedulerTests.cs @@ -0,0 +1,47 @@ +using ScadaLink.SiteRuntime; +using ScadaLink.SiteRuntime.Scripts; + +namespace ScadaLink.SiteRuntime.Tests.Scripts; + +/// +/// SiteRuntime-009: the dedicated script-execution scheduler must run script bodies on +/// its own dedicated threads, not on the shared .NET thread pool, so blocking script +/// I/O cannot starve the global pool. +/// +public class ScriptExecutionSchedulerTests +{ + [Fact] + public async Task Scheduler_RunsWork_OffTheThreadPool() + { + using var scheduler = new ScriptExecutionScheduler(2); + + bool wasThreadPoolThread = true; + string? threadName = null; + + await Task.Factory.StartNew(() => + { + wasThreadPoolThread = Thread.CurrentThread.IsThreadPoolThread; + threadName = Thread.CurrentThread.Name; + }, CancellationToken.None, TaskCreationOptions.DenyChildAttach, scheduler); + + Assert.False(wasThreadPoolThread, + "Script work must not run on a shared thread-pool thread."); + Assert.StartsWith("script-execution-", threadName); + } + + [Fact] + public void Scheduler_RespectsConfiguredThreadCount() + { + using var scheduler = new ScriptExecutionScheduler(4); + Assert.Equal(4, scheduler.MaximumConcurrencyLevel); + } + + [Fact] + public void Scheduler_Shared_ReturnsSameInstanceForOptions() + { + var options = new SiteRuntimeOptions { ScriptExecutionThreadCount = 3 }; + var a = ScriptExecutionScheduler.Shared(options); + var b = ScriptExecutionScheduler.Shared(options); + Assert.Same(a, b); + } +} diff --git a/tests/ScadaLink.SiteRuntime.Tests/Scripts/TrustModelSemanticTests.cs b/tests/ScadaLink.SiteRuntime.Tests/Scripts/TrustModelSemanticTests.cs new file mode 100644 index 0000000..66683f7 --- /dev/null +++ b/tests/ScadaLink.SiteRuntime.Tests/Scripts/TrustModelSemanticTests.cs @@ -0,0 +1,92 @@ +using Microsoft.Extensions.Logging.Abstractions; +using ScadaLink.SiteRuntime.Scripts; + +namespace ScadaLink.SiteRuntime.Tests.Scripts; + +/// +/// SiteRuntime-011: regression tests for the semantic-analysis trust-model validation. +/// The previous implementation was a raw substring scan of the source text — it both +/// missed forbidden APIs (no literal namespace string) and raised false positives on +/// the namespace string appearing in comments, string literals or unrelated identifiers. +/// +public class TrustModelSemanticTests +{ + private readonly ScriptCompilationService _service = + new(NullLogger.Instance); + + // ── Bypass cases (under-inclusive substring scan would MISS these) ── + + [Fact] + public void TrustModel_GlobalQualifiedForbiddenType_IsDetected() + { + // `global::`-prefixed name — the literal "System.IO" substring is still present + // here, but the resolved-symbol approach catches it regardless of spelling. + var violations = _service.ValidateTrustModel( + "global::System.IO.File.ReadAllText(\"/etc/passwd\")"); + Assert.NotEmpty(violations); + } + + [Fact] + public void TrustModel_ForbiddenTypeViaUsingAlias_IsDetected() + { + // A using-alias hides the forbidden namespace from a substring scan entirely: + // the script body never writes "System.IO". Semantic resolution still sees that + // the alias resolves to System.IO.File. + var code = """ + using F = System.IO.File; + F.ReadAllText("/etc/passwd"); + """; + var violations = _service.ValidateTrustModel(code); + Assert.NotEmpty(violations); + Assert.Contains(violations, v => v.Contains("System.IO")); + } + + // ── False-positive cases (over-inclusive substring scan would WRONGLY flag these) ── + + [Fact] + public void TrustModel_ForbiddenNamespaceInStringLiteral_IsNotFlagged() + { + // "System.IO" appears only inside a string literal — not an API reference. + var violations = _service.ValidateTrustModel( + "var label = \"System.IO is blocked\"; return label;"); + Assert.Empty(violations); + } + + [Fact] + public void TrustModel_ForbiddenNamespaceInComment_IsNotFlagged() + { + var code = """ + // This script does not use System.IO or System.Reflection at all. + var x = 1 + 2; + return x; + """; + var violations = _service.ValidateTrustModel(code); + Assert.Empty(violations); + } + + [Fact] + public void TrustModel_UnrelatedIdentifierContainingForbiddenSubstring_IsNotFlagged() + { + // A local variable whose name merely contains "Threading" is harmless. + var code = """ + var ProcessThreadingCount = 5; + return ProcessThreadingCount + 1; + """; + var violations = _service.ValidateTrustModel(code); + Assert.Empty(violations); + } + + // ── Allowed exceptions still resolve correctly ── + + [Fact] + public void TrustModel_TaskAndCancellationToken_RemainAllowed() + { + var code = """ + var cts = new System.Threading.CancellationTokenSource(); + await System.Threading.Tasks.Task.Delay(1, cts.Token); + return 0; + """; + var violations = _service.ValidateTrustModel(code); + Assert.Empty(violations); + } +}