diff --git a/code-reviews/Host/findings.md b/code-reviews/Host/findings.md index 0d78d15..567e5bc 100644 --- a/code-reviews/Host/findings.md +++ b/code-reviews/Host/findings.md @@ -8,7 +8,7 @@ | Last reviewed | 2026-05-16 | | Reviewer | claude-agent | | Commit reviewed | `9c60592` | -| Open findings | 8 | +| Open findings | 1 | ## Summary @@ -250,7 +250,7 @@ appropriately before the fix and pass after. |--|--| | Severity | Low | | Category | Concurrency & thread safety | -| Status | Open | +| Status | Resolved | | Location | `src/ScadaLink.Host/Actors/AkkaHostedService.cs:345` | **Description** @@ -270,7 +270,20 @@ storeAndForwardService.StartAsync(cancellationToken)`. Propagate the **Resolution** -_Unresolved._ +Resolved 2026-05-16 (commit `pending`). Root cause confirmed: `RegisterSiteActors` +called `storeAndForwardService.StartAsync().GetAwaiter().GetResult()` synchronously +inside the `IHostedService.StartAsync` path, blocking a startup thread and wrapping +any failure in `AggregateException`. Fix: `AkkaHostedService.StartAsync` is now +genuinely `async Task`; the site path was renamed `RegisterSiteActorsAsync` and made +`async`; the store-and-forward init is now `await storeAndForwardService.StartAsync()` +(the service's own signature, in another module, takes no `CancellationToken`, so the +token is honoured via a `cancellationToken.ThrowIfCancellationRequested()` checkpoint +immediately before the await). No blocking, no sync-context deadlock risk, and the +original exception type now surfaces. No dedicated regression test: the only +observable change is non-blocking/exception-unwrapping behaviour on a path that +requires the full site DI graph plus a real SQLite store; the existing +`HostStartupTests.SiteRole_*` and `CompositionRootTests` continue to exercise the +async startup pipeline and remain green (175 passed). ### Host-006 — HOCON assembled by unescaped string interpolation @@ -278,7 +291,7 @@ _Unresolved._ |--|--| | Severity | Low | | Category | Akka.NET conventions | -| Status | Open | +| Status | Resolved | | Location | `src/ScadaLink.Host/Actors/AkkaHostedService.cs:70-108` | **Description** @@ -302,7 +315,25 @@ and seed nodes) before substitution. **Resolution** -_Unresolved._ +Resolved 2026-05-16 (commit `pending`). Root cause confirmed: the Akka HOCON was +built by an interpolated string injecting `NodeHostname`, `SeedNodes`, roles and +`SplitBrainResolverStrategy` directly into the document text with no escaping — a +value containing a quote, backslash or whitespace would corrupt or misparse the +document. Re-triaged the recommendation: switching wholesale to the typed +`Akka.Hosting` builder is a larger refactor than this Low finding warrants and would +touch how the whole cluster is bootstrapped, so the HOCON form was retained but made +safe. Fix: the HOCON construction was extracted into a testable static +`AkkaHostedService.BuildHocon`, and every interpolated *string* value is now routed +through a `QuoteHocon` helper that wraps the value in a double-quoted HOCON literal +and escapes embedded backslashes and quotes — `NodeHostname`, every seed node, every +role, and `SplitBrainResolverStrategy` (the last was previously unquoted entirely, so +whitespace alone would break it). Regression tests in new `HoconBuilderTests`: +`BuildHocon_HostnameWithQuote_DoesNotCorruptDocument` (a hostname with a `"` still +parses and following keys stay intact), `BuildHocon_StrategyWithWhitespace_RemainsQuoted`, +and `BuildHocon_PlainValues_ParsesAndPreservesValues`. With the old raw interpolation +a quote-containing hostname produced `hostname = "evil"host"`, closing the literal +early and corrupting following keys; the escaped form parses correctly. Full Host +suite green (175 passed). ### Host-007 — REQ-HOST-4 rule "GrpcPort ≠ RemotingPort" is not enforced @@ -310,7 +341,7 @@ _Unresolved._ |--|--| | Severity | Low | | Category | Design-document adherence | -| Status | Open | +| Status | Resolved | | Location | `src/ScadaLink.Host/StartupValidator.cs:43-47` | **Description** @@ -332,7 +363,18 @@ Add a check in the `role == "Site"` block: if `GrpcPort` (resolved, including th **Resolution** -_Unresolved._ +Resolved 2026-05-16 (commit `pending`). Root cause confirmed against +`StartupValidator.cs`: the `role == "Site"` block validated the GrpcPort range but +never compared it to `RemotingPort`, so a site config setting both ports equal +passed validation and then failed opaquely when Kestrel and Akka.Remote both bound +the port. Fix: added `if (port == grpcPort) errors.Add("ScadaLink:Node:GrpcPort must +differ from RemotingPort")` in the Site block, using the resolved GrpcPort (including +the 8083 `NodeOptions` default when the key is absent) against the parsed +RemotingPort. Regression tests in `StartupValidatorTests`: +`Site_GrpcPortEqualsRemotingPort_FailsValidation`, +`Site_DefaultGrpcPortEqualsRemotingPort_FailsValidation` (default-8083 path), and +`Site_GrpcPortDiffersFromRemotingPort_PassesValidation` — all failed appropriately +before the fix and pass after. Full Host suite green (175 passed). ### Host-008 — `MachineDataDb` is validated and declared but never consumed @@ -340,7 +382,7 @@ _Unresolved._ |--|--| | Severity | Low | | Category | Code organization & conventions | -| Status | Open | +| Status | Resolved | | Location | `src/ScadaLink.Host/StartupValidator.cs:33-34`, `src/ScadaLink.Host/DatabaseOptions.cs:6` | **Description** @@ -361,7 +403,22 @@ rule, the `DatabaseOptions` property, and the key from `appsettings.Central.json **Resolution** -_Unresolved._ +Resolved 2026-05-16 (commit `pending`). Root cause confirmed: a repo-wide search +shows `MachineDataDb` is read nowhere outside the Host module — only `ConfigurationDb` +is wired into `AddConfigurationDatabase` — yet `StartupValidator` failed Central +startup when it was absent. No machine-data store exists or is registered, so this is +dead configuration. Fix (the "if no" branch of the recommendation): removed the +`MachineDataDb` validation rule from `StartupValidator`, the `MachineDataDb` property +from `DatabaseOptions`, the `MachineDataDb` key from `appsettings.Central.json`, and +the now-pointless `MachineDataDb` reference from that file's `_secrets` note and from +the `CentralDbTestEnvironment` test helper. Regression test: the prior +`StartupValidatorTests.Central_MissingMachineDataDb_FailsValidation` was inverted to +`Central_MissingMachineDataDb_PassesValidation` (absence must now NOT fail startup), +and `MachineDataDb` was removed from the shared `ValidCentralConfig` fixture so all +Central tests prove the key is no longer required; it failed under the old code and +passes after. Full Host suite green (175 passed). Note: `appsettings.Central.json` +under `docker/central-node-*` is outside this task's edit scope and still carries the +key harmlessly — it will simply be ignored; a follow-up may remove it for tidiness. ### Host-009 — `StartAsync` reports success before role actors are confirmed running @@ -369,7 +426,7 @@ _Unresolved._ |--|--| | Severity | Low | | Category | Akka.NET conventions | -| Status | Open | +| Status | Resolved | | Location | `src/ScadaLink.Host/Actors/AkkaHostedService.cs:127-141` | **Description** @@ -395,7 +452,24 @@ actor system exists, not that the cluster handshake has completed. **Resolution** -_Unresolved._ +Resolved 2026-05-16 (commit `pending`) — re-triaged, documentation fix. Root cause +confirmed: `SetReady` is called optimistically after `ActorOf` registrations whose +`PreStart`/initial-contact handshakes run asynchronously. Re-triaged the first +recommended option (gating `SetReady` on a `SiteCommunicationActor` `Ask` +round-trip): that option is **incorrect** because the meaningful asynchronous step is +the `ClusterClient` initial-contact handshake with central — gating site gRPC +readiness on it would prevent a site from coming up and streaming locally while +central is briefly unreachable, contradicting the design's site-autonomy model. +Confirmed `SiteStreamGrpcServer` already rejects any stream opened before `SetReady` +with `StatusCode.Unavailable`, so the window is benign. Fix (the second, correct +option): the `SetReady` call site now carries an explicit comment documenting the +narrow readiness contract — by `SetReady` the actor system exists, +`SiteStreamManager.Initialize` has run, and every role actor has been created with +ordered synchronous `ActorOf` + registration `Tell`s; what is deliberately NOT +guaranteed is `PreStart` completion or the central handshake. No regression test is +meaningful for a documentation-only clarification of an intentional design contract; +the existing `HostStartupTests.SiteRole_*` cover the surrounding startup path. Full +Host suite green (175 passed). ### Host-010 — No retry/backoff around startup preconditions (DB migration, readiness) @@ -403,7 +477,7 @@ _Unresolved._ |--|--| | Severity | Low | | Category | Error handling & resilience | -| Status | Open | +| Status | Resolved | | Location | `src/ScadaLink.Host/Program.cs:112-125` | **Description** @@ -425,7 +499,22 @@ reports `503` until the database becomes reachable. **Resolution** -_Unresolved._ +Resolved 2026-05-16 (commit `pending`). Root cause confirmed: Central startup called +`MigrationHelper.ApplyOrValidateMigrationsAsync` directly with no tolerance for a +database that is briefly unreachable at boot — a first connection failure threw, the +top-level `catch` logged `Fatal`, and the process exited. Fix: added a new +`StartupRetry.ExecuteWithRetryAsync` helper (bounded retry with capped exponential +backoff — no Polly dependency, keeping the Host's package set minimal) and wrapped +the migration step in it (`maxAttempts: 8`, `initialDelay: 2s`, doubling, capped at +30s). Each retry opens a fresh DI scope so a transient connection fault does not +leave a poisoned `DbContext`; once attempts are exhausted the last exception +propagates and the existing fatal-log path still fires. Regression tests in new +`StartupRetryTests`: `ExecuteWithRetry_SucceedsFirstTry_RunsOnce`, +`ExecuteWithRetry_TransientFailures_RetriesUntilSuccess` (fails twice then succeeds), +and `ExecuteWithRetry_ExhaustsAttempts_RethrowsLastException` (verifies the original +exception type/message surfaces after exhaustion) — they fail to compile/pass against +the pre-fix code (the helper did not exist) and pass after. Full Host suite green +(175 passed). ### Host-011 — `LoggingOptions.MinimumLevel` is dead configuration @@ -433,7 +522,7 @@ _Unresolved._ |--|--| | Severity | Low | | Category | Code organization & conventions | -| Status | Open | +| Status | Resolved | | Location | `src/ScadaLink.Host/LoggingOptions.cs:5`, `src/ScadaLink.Host/Program.cs:42-50` | **Description** @@ -456,4 +545,21 @@ configuration section. Keep one mechanism, not two. **Resolution** -_Unresolved._ +Resolved 2026-05-16 (commit `pending`). Root cause confirmed: `LoggingOptions.MinimumLevel` +was bound from `ScadaLink:Logging` but Serilog was configured purely via +`ReadFrom.Configuration` (standard `Serilog` section), so editing +`ScadaLink:Logging:MinimumLevel` had no effect. Re-triaged the two options: the +"remove the option class" branch was rejected because `Component-Host.md` REQ-HOST-3 +explicitly lists `ScadaLink:Logging` → `LoggingOptions` and that design doc is outside +this task's edit scope — removing the class would create a fresh code-vs-doc drift. +Fix (the "consume it" branch): added a `LoggerConfigurationFactory.Build` that binds +`LoggingOptions`, parses `MinimumLevel` into a Serilog `LogEventLevel` (falling back +to `Information` for null/blank/unrecognised values), and pins it via +`MinimumLevel.Is(...)` on top of `ReadFrom.Configuration`; `Program.cs` now builds the +logger through this factory. `ScadaLink:Logging:MinimumLevel` is now live. +Regression tests in new `LoggerConfigurationTests`: +`MinimumLevel_Warning_SuppressesInformationLogs`, +`MinimumLevel_Debug_AllowsDebugLogs`, and `MinimumLevel_Absent_DefaultsToInformation` +— they assert the configured level actually filters log events; they pass against the +new factory (and would fail against the old inline configuration, which ignored the +key). Full Host suite green (175 passed). diff --git a/src/ScadaLink.Host/Actors/AkkaHostedService.cs b/src/ScadaLink.Host/Actors/AkkaHostedService.cs index cb9865d..9f313e3 100644 --- a/src/ScadaLink.Host/Actors/AkkaHostedService.cs +++ b/src/ScadaLink.Host/Actors/AkkaHostedService.cs @@ -54,58 +54,20 @@ public class AkkaHostedService : IHostedService /// public ActorSystem? ActorSystem => _actorSystem; - public Task StartAsync(CancellationToken cancellationToken) + public async Task StartAsync(CancellationToken cancellationToken) { - var seedNodesStr = string.Join(",", - _clusterOptions.SeedNodes.Select(s => $"\"{s}\"")); - // For site nodes, include a site-specific role (e.g., "site-SiteA") alongside the base role var roles = BuildRoles(); - var rolesStr = string.Join(",", roles.Select(r => $"\"{r}\"")); // WP-3: Transport heartbeat explicitly configured from CommunicationOptions (not framework defaults) var transportHeartbeatSec = _communicationOptions.TransportHeartbeatInterval.TotalSeconds; var transportFailureSec = _communicationOptions.TransportFailureThreshold.TotalSeconds; - var hocon = $@" -akka {{ - extensions = [ - ""Akka.Cluster.Tools.PublishSubscribe.DistributedPubSubExtensionProvider, Akka.Cluster.Tools"" - ] - actor {{ - provider = cluster - }} - remote {{ - dot-netty.tcp {{ - hostname = ""{_nodeOptions.NodeHostname}"" - port = {_nodeOptions.RemotingPort} - }} - transport-failure-detector {{ - heartbeat-interval = {transportHeartbeatSec:F0}s - acceptable-heartbeat-pause = {transportFailureSec:F0}s - }} - }} - cluster {{ - seed-nodes = [{seedNodesStr}] - roles = [{rolesStr}] - min-nr-of-members = {_clusterOptions.MinNrOfMembers} - split-brain-resolver {{ - active-strategy = {_clusterOptions.SplitBrainResolverStrategy} - stable-after = {_clusterOptions.StableAfter.TotalSeconds:F0}s - keep-oldest {{ - down-if-alone = on - }} - }} - failure-detector {{ - heartbeat-interval = {_clusterOptions.HeartbeatInterval.TotalSeconds:F0}s - acceptable-heartbeat-pause = {_clusterOptions.FailureDetectionThreshold.TotalSeconds:F0}s - }} - run-coordinated-shutdown-when-down = on - }} - coordinated-shutdown {{ - run-by-clr-shutdown-hook = on - }} -}}"; + // Host-006: HOCON is assembled in a dedicated builder that quotes/escapes every + // interpolated value, so a hostname, seed node or strategy containing a quote, + // backslash or whitespace cannot corrupt the configuration document. + var hocon = BuildHocon(_nodeOptions, _clusterOptions, roles, + transportHeartbeatSec, transportFailureSec); var config = ConfigurationFactory.ParseString(hocon); _actorSystem = ActorSystem.Create("scadalink", config); @@ -135,10 +97,78 @@ akka {{ } else if (_nodeOptions.Role.Equals("Site", StringComparison.OrdinalIgnoreCase)) { - RegisterSiteActors(); + await RegisterSiteActorsAsync(cancellationToken); } + } - return Task.CompletedTask; + /// + /// Builds the Akka HOCON configuration document. Every interpolated value is + /// routed through (string values) so a hostname, + /// seed-node URI, role or split-brain strategy containing a quote, backslash or + /// whitespace cannot corrupt the document or be silently misparsed (Host-006). + /// + public static string BuildHocon( + NodeOptions nodeOptions, + ClusterOptions clusterOptions, + IEnumerable roles, + double transportHeartbeatSec, + double transportFailureSec) + { + var seedNodesStr = string.Join(",", + clusterOptions.SeedNodes.Select(QuoteHocon)); + var rolesStr = string.Join(",", roles.Select(QuoteHocon)); + + return $@" +akka {{ + extensions = [ + ""Akka.Cluster.Tools.PublishSubscribe.DistributedPubSubExtensionProvider, Akka.Cluster.Tools"" + ] + actor {{ + provider = cluster + }} + remote {{ + dot-netty.tcp {{ + hostname = {QuoteHocon(nodeOptions.NodeHostname)} + port = {nodeOptions.RemotingPort} + }} + transport-failure-detector {{ + heartbeat-interval = {transportHeartbeatSec:F0}s + acceptable-heartbeat-pause = {transportFailureSec:F0}s + }} + }} + cluster {{ + seed-nodes = [{seedNodesStr}] + roles = [{rolesStr}] + min-nr-of-members = {clusterOptions.MinNrOfMembers} + split-brain-resolver {{ + active-strategy = {QuoteHocon(clusterOptions.SplitBrainResolverStrategy)} + stable-after = {clusterOptions.StableAfter.TotalSeconds:F0}s + keep-oldest {{ + down-if-alone = on + }} + }} + failure-detector {{ + heartbeat-interval = {clusterOptions.HeartbeatInterval.TotalSeconds:F0}s + acceptable-heartbeat-pause = {clusterOptions.FailureDetectionThreshold.TotalSeconds:F0}s + }} + run-coordinated-shutdown-when-down = on + }} + coordinated-shutdown {{ + run-by-clr-shutdown-hook = on + }} +}}"; + } + + /// + /// Renders a value as a HOCON double-quoted string, escaping backslashes and + /// double quotes so the resulting token cannot break out of its string literal. + /// + private static string QuoteHocon(string? value) + { + var escaped = (value ?? string.Empty) + .Replace("\\", "\\\\") + .Replace("\"", "\\\""); + return $"\"{escaped}\""; } public async Task StopAsync(CancellationToken cancellationToken) @@ -218,7 +248,7 @@ akka {{ /// The singleton is scoped to the site-specific cluster role so it runs on exactly /// one node within this site's cluster. /// - private void RegisterSiteActors() + private async Task RegisterSiteActorsAsync(CancellationToken cancellationToken) { var siteRole = $"site-{_nodeOptions.SiteId}"; var storage = _serviceProvider.GetRequiredService(); @@ -341,8 +371,11 @@ akka {{ if (storeAndForwardService != null) { // Initialize SQLite schema and start the retry timer. Must complete before - // any actor or HTTP handler touches the service. - storeAndForwardService.StartAsync().GetAwaiter().GetResult(); + // any actor or HTTP handler touches the service. Host-005: awaited rather + // than blocked via GetAwaiter().GetResult() — no thread-pool starvation / + // sync-context deadlock risk, and exceptions surface as their original type. + cancellationToken.ThrowIfCancellationRequested(); + await storeAndForwardService.StartAsync(); // Register the store-and-forward delivery handlers so buffered // ExternalSystem calls, cached DB writes and notifications are actually @@ -413,7 +446,22 @@ akka {{ contacts.Count, _nodeOptions.SiteId); } - // Gate gRPC subscriptions until the actor system and SiteStreamManager are initialized + // Gate gRPC subscriptions until the actor system and SiteStreamManager are + // initialized (REQ-HOST-7). + // + // Host-009: SetReady asserts a deliberately narrow contract. By this point the + // actor system exists, SiteStreamManager.Initialize has run, and every + // role actor (SiteCommunicationActor, deployment-manager singleton, + // SiteReplicationActor, the ClusterClient) has been created with ActorOf — + // creation and the registration Tells are synchronous and strictly ordered. + // What is NOT guaranteed is completion of each actor's PreStart or the + // ClusterClient's initial-contact handshake with central: those are + // intentionally asynchronous. Gating readiness on the central handshake would + // be wrong — a site must come up and stream locally even while central is + // briefly unreachable. gRPC readiness therefore guarantees "the site actor + // graph exists and can accept subscription streams", not "the cluster + // handshake has completed". Streams opened before SetReady are already + // rejected by SiteStreamGrpcServer with StatusCode.Unavailable. var grpcServer = _serviceProvider.GetService(); grpcServer?.SetReady(_actorSystem!); } diff --git a/src/ScadaLink.Host/DatabaseOptions.cs b/src/ScadaLink.Host/DatabaseOptions.cs index 12c3048..51818d6 100644 --- a/src/ScadaLink.Host/DatabaseOptions.cs +++ b/src/ScadaLink.Host/DatabaseOptions.cs @@ -3,6 +3,5 @@ namespace ScadaLink.Host; public class DatabaseOptions { public string? ConfigurationDb { get; set; } - public string? MachineDataDb { get; set; } public string? SiteDbPath { get; set; } } diff --git a/src/ScadaLink.Host/LoggerConfigurationFactory.cs b/src/ScadaLink.Host/LoggerConfigurationFactory.cs new file mode 100644 index 0000000..ee87825 --- /dev/null +++ b/src/ScadaLink.Host/LoggerConfigurationFactory.cs @@ -0,0 +1,48 @@ +using Serilog; +using Serilog.Events; + +namespace ScadaLink.Host; + +/// +/// Builds the Serilog for the Host process. +/// +/// REQ-HOST-8 / Host-011: the configured minimum level comes from +/// ScadaLink:Logging:MinimumLevel (bound to ) so an +/// operator editing that key changes the effective log level. The standard +/// Serilog configuration section is still read (via +/// ) +/// for sink/override customisation; the explicit MinimumLevel.Is below pins +/// the floor from . +/// +public static class LoggerConfigurationFactory +{ + public static LoggerConfiguration Build( + IConfiguration configuration, + string nodeRole, + string siteId, + string nodeHostname) + { + var loggingOptions = new LoggingOptions(); + configuration.GetSection("ScadaLink:Logging").Bind(loggingOptions); + + var minimumLevel = ParseLevel(loggingOptions.MinimumLevel); + + return new LoggerConfiguration() + .ReadFrom.Configuration(configuration) + .MinimumLevel.Is(minimumLevel) + .Enrich.WithProperty("SiteId", siteId) + .Enrich.WithProperty("NodeHostname", nodeHostname) + .Enrich.WithProperty("NodeRole", nodeRole); + } + + /// + /// Parses a Serilog name, falling back to + /// for null/blank/unrecognised values. + /// + private static LogEventLevel ParseLevel(string? level) + { + return Enum.TryParse(level, ignoreCase: true, out var parsed) + ? parsed + : LogEventLevel.Information; + } +} diff --git a/src/ScadaLink.Host/Program.cs b/src/ScadaLink.Host/Program.cs index 9eca495..a54803c 100644 --- a/src/ScadaLink.Host/Program.cs +++ b/src/ScadaLink.Host/Program.cs @@ -38,12 +38,10 @@ var nodeRole = configuration["ScadaLink:Node:Role"]!; var nodeHostname = configuration["ScadaLink:Node:NodeHostname"] ?? "unknown"; var siteId = configuration["ScadaLink:Node:SiteId"] ?? "central"; -// WP-14: Serilog structured logging -Log.Logger = new LoggerConfiguration() - .ReadFrom.Configuration(configuration) - .Enrich.WithProperty("SiteId", siteId) - .Enrich.WithProperty("NodeHostname", nodeHostname) - .Enrich.WithProperty("NodeRole", nodeRole) +// WP-14: Serilog structured logging. +// Host-011: minimum level is driven by ScadaLink:Logging:MinimumLevel (LoggingOptions). +Log.Logger = ScadaLink.Host.LoggerConfigurationFactory + .Build(configuration, nodeRole, siteId, nodeHostname) .WriteTo.Console(outputTemplate: "[{Timestamp:HH:mm:ss} {Level:u3}] [{NodeRole}/{NodeHostname}] {Message:lj}{NewLine}{Exception}") .WriteTo.File("logs/scadalink-.log", rollingInterval: Serilog.RollingInterval.Day) @@ -116,14 +114,24 @@ try { var isDevelopment = app.Environment.IsDevelopment() || string.Equals(Environment.GetEnvironmentVariable("ASPNETCORE_ENVIRONMENT"), "Development", StringComparison.OrdinalIgnoreCase); - using (var scope = app.Services.CreateScope()) - { - var dbContext = scope.ServiceProvider.GetRequiredService(); - var migrationLogger = scope.ServiceProvider - .GetRequiredService() - .CreateLogger(typeof(MigrationHelper).FullName!); - await MigrationHelper.ApplyOrValidateMigrationsAsync(dbContext, isDevelopment, migrationLogger); - } + var migrationLogger = app.Services + .GetRequiredService() + .CreateLogger(typeof(MigrationHelper).FullName!); + + // Host-010: tolerate a database that is briefly unreachable at boot + // (e.g. app and DB containers starting together) with a bounded + // exponential backoff before failing fatally. + await StartupRetry.ExecuteWithRetryAsync( + "database-migration", + async () => + { + using var scope = app.Services.CreateScope(); + var dbContext = scope.ServiceProvider.GetRequiredService(); + await MigrationHelper.ApplyOrValidateMigrationsAsync(dbContext, isDevelopment, migrationLogger); + }, + maxAttempts: 8, + initialDelay: TimeSpan.FromSeconds(2), + migrationLogger); } // Middleware pipeline diff --git a/src/ScadaLink.Host/StartupRetry.cs b/src/ScadaLink.Host/StartupRetry.cs new file mode 100644 index 0000000..3e9e710 --- /dev/null +++ b/src/ScadaLink.Host/StartupRetry.cs @@ -0,0 +1,48 @@ +namespace ScadaLink.Host; + +/// +/// Bounded retry-with-backoff for startup preconditions. +/// +/// Host-010 / REQ-HOST-4a: a Central node applies/validates database migrations +/// before the host begins serving traffic. In container orchestration the database +/// and the app frequently start together, so the database may be briefly +/// unreachable. Rather than crashing the process on the first connection failure, +/// the migration step is wrapped in this bounded exponential backoff: it tolerates a +/// short outage and only fails fatally once attempts are exhausted. +/// +public static class StartupRetry +{ + public static async Task ExecuteWithRetryAsync( + string operationName, + Func operation, + int maxAttempts, + TimeSpan initialDelay, + ILogger logger, + CancellationToken cancellationToken = default) + { + var delay = initialDelay; + for (var attempt = 1; ; attempt++) + { + cancellationToken.ThrowIfCancellationRequested(); + try + { + await operation(); + if (attempt > 1) + logger.LogInformation( + "Startup operation '{Operation}' succeeded on attempt {Attempt}.", + operationName, attempt); + return; + } + catch (Exception ex) when (attempt < maxAttempts) + { + logger.LogWarning(ex, + "Startup operation '{Operation}' failed on attempt {Attempt}/{MaxAttempts}; " + + "retrying in {Delay}.", + operationName, attempt, maxAttempts, delay); + await Task.Delay(delay, cancellationToken); + // Exponential backoff, capped so the total wait stays bounded. + delay = TimeSpan.FromTicks(Math.Min(delay.Ticks * 2, TimeSpan.FromSeconds(30).Ticks)); + } + } + } +} diff --git a/src/ScadaLink.Host/StartupValidator.cs b/src/ScadaLink.Host/StartupValidator.cs index 1c3c08b..0f57880 100644 --- a/src/ScadaLink.Host/StartupValidator.cs +++ b/src/ScadaLink.Host/StartupValidator.cs @@ -30,8 +30,6 @@ public static class StartupValidator var dbSection = configuration.GetSection("ScadaLink:Database"); if (string.IsNullOrEmpty(dbSection["ConfigurationDb"])) errors.Add("ScadaLink:Database:ConfigurationDb connection string required for Central"); - if (string.IsNullOrEmpty(dbSection["MachineDataDb"])) - errors.Add("ScadaLink:Database:MachineDataDb connection string required for Central"); var secSection = configuration.GetSection("ScadaLink:Security"); if (string.IsNullOrEmpty(secSection["LdapServer"])) @@ -51,6 +49,13 @@ public static class StartupValidator if (grpcPortStr != null && (!int.TryParse(grpcPortStr, out grpcPort) || grpcPort < 1 || grpcPort > 65535)) errors.Add("ScadaLink:Node:GrpcPort must be 1-65535"); + // Host-007 / REQ-HOST-4: the gRPC (Kestrel HTTP/2) port and the Akka + // remoting port must differ. Identical values make Kestrel and + // Akka.Remote contend for the same TCP port and fail opaquely at + // runtime. Uses the resolved GrpcPort, including the 8083 default. + if (port == grpcPort) + errors.Add("ScadaLink:Node:GrpcPort must differ from RemotingPort"); + var dbSection = configuration.GetSection("ScadaLink:Database"); if (string.IsNullOrEmpty(dbSection["SiteDbPath"])) errors.Add("ScadaLink:Database:SiteDbPath required for Site nodes"); diff --git a/src/ScadaLink.Host/appsettings.Central.json b/src/ScadaLink.Host/appsettings.Central.json index a38dd24..3f05f84 100644 --- a/src/ScadaLink.Host/appsettings.Central.json +++ b/src/ScadaLink.Host/appsettings.Central.json @@ -16,10 +16,9 @@ "FailureDetectionThreshold": "00:00:10", "MinNrOfMembers": 1 }, - "_secrets": "Host-003: Secrets are NOT committed in this file. Supply them via environment variables, which the Host's configuration builder (AddEnvironmentVariables) overlays over this file. Required: ScadaLink__Database__ConfigurationDb, ScadaLink__Database__MachineDataDb, ScadaLink__Security__LdapServiceAccountPassword, ScadaLink__Security__JwtSigningKey. The ${...} placeholders below are intentionally non-functional and must be overridden per environment.", + "_secrets": "Host-003: Secrets are NOT committed in this file. Supply them via environment variables, which the Host's configuration builder (AddEnvironmentVariables) overlays over this file. Required: ScadaLink__Database__ConfigurationDb, ScadaLink__Security__LdapServiceAccountPassword, ScadaLink__Security__JwtSigningKey. The ${...} placeholders below are intentionally non-functional and must be overridden per environment.", "Database": { - "ConfigurationDb": "${SCADALINK_CONFIGURATIONDB_CONNECTION_STRING}", - "MachineDataDb": "${SCADALINK_MACHINEDATADB_CONNECTION_STRING}" + "ConfigurationDb": "${SCADALINK_CONFIGURATIONDB_CONNECTION_STRING}" }, "Security": { "LdapServer": "localhost", diff --git a/tests/ScadaLink.Host.Tests/CentralDbTestEnvironment.cs b/tests/ScadaLink.Host.Tests/CentralDbTestEnvironment.cs index 5077c4b..e79690c 100644 --- a/tests/ScadaLink.Host.Tests/CentralDbTestEnvironment.cs +++ b/tests/ScadaLink.Host.Tests/CentralDbTestEnvironment.cs @@ -4,11 +4,11 @@ namespace ScadaLink.Host.Tests; /// Host-003: appsettings.Central.json no longer commits database connection /// strings — they are externalised to environment variables. Tests that exercise the /// full Program startup pipeline against the real SQL provider must therefore -/// supply the local dev connection strings the way a deployment would: via -/// environment variables (Program's configuration builder calls +/// supply the local dev connection string the way a deployment would: via an +/// environment variable (Program's configuration builder calls /// AddEnvironmentVariables()). /// -/// Dispose restores the previous values so tests stay isolated. +/// Dispose restores the previous value so tests stay isolated. /// internal sealed class CentralDbTestEnvironment : IDisposable { @@ -16,26 +16,19 @@ internal sealed class CentralDbTestEnvironment : IDisposable // This is a test fixture value, not a committed production secret. private const string ConfigurationDb = "Server=localhost,1433;Database=ScadaLinkConfig;User Id=scadalink_app;Password=ScadaLink_Dev1#;TrustServerCertificate=true"; - private const string MachineDataDb = - "Server=localhost,1433;Database=ScadaLinkMachineData;User Id=scadalink_app;Password=ScadaLink_Dev1#;TrustServerCertificate=true"; private const string ConfigKey = "ScadaLink__Database__ConfigurationDb"; - private const string MachineKey = "ScadaLink__Database__MachineDataDb"; private readonly string? _previousConfig; - private readonly string? _previousMachine; public CentralDbTestEnvironment() { _previousConfig = Environment.GetEnvironmentVariable(ConfigKey); - _previousMachine = Environment.GetEnvironmentVariable(MachineKey); Environment.SetEnvironmentVariable(ConfigKey, ConfigurationDb); - Environment.SetEnvironmentVariable(MachineKey, MachineDataDb); } public void Dispose() { Environment.SetEnvironmentVariable(ConfigKey, _previousConfig); - Environment.SetEnvironmentVariable(MachineKey, _previousMachine); } } diff --git a/tests/ScadaLink.Host.Tests/HoconBuilderTests.cs b/tests/ScadaLink.Host.Tests/HoconBuilderTests.cs new file mode 100644 index 0000000..df9468b --- /dev/null +++ b/tests/ScadaLink.Host.Tests/HoconBuilderTests.cs @@ -0,0 +1,87 @@ +using Akka.Configuration; +using ScadaLink.ClusterInfrastructure; +using ScadaLink.Host.Actors; + +namespace ScadaLink.Host.Tests; + +/// +/// Host-006: the Akka HOCON document must be assembled with every interpolated value +/// quoted/escaped, so a hostname, seed-node URI or split-brain strategy containing a +/// quote, backslash or whitespace cannot corrupt the document or be silently +/// misparsed. +/// +public class HoconBuilderTests +{ + private static ClusterOptions DefaultCluster() => new() + { + SeedNodes = new List + { + "akka.tcp://scadalink@localhost:8081", + "akka.tcp://scadalink@localhost:8082", + }, + SplitBrainResolverStrategy = "keep-oldest", + MinNrOfMembers = 1, + StableAfter = TimeSpan.FromSeconds(15), + HeartbeatInterval = TimeSpan.FromSeconds(2), + FailureDetectionThreshold = TimeSpan.FromSeconds(10), + }; + + [Fact] + public void BuildHocon_PlainValues_ParsesAndPreservesValues() + { + var node = new NodeOptions + { + Role = "Central", + NodeHostname = "central-node1", + RemotingPort = 8081, + }; + + var hocon = AkkaHostedService.BuildHocon( + node, DefaultCluster(), new[] { "Central" }, 5, 15); + + var config = ConfigurationFactory.ParseString(hocon); + Assert.Equal("central-node1", config.GetString("akka.remote.dot-netty.tcp.hostname")); + Assert.Equal("keep-oldest", config.GetString("akka.cluster.split-brain-resolver.active-strategy")); + } + + [Fact] + public void BuildHocon_HostnameWithQuote_DoesNotCorruptDocument() + { + // A hostname containing a double quote would, with raw interpolation, close + // the HOCON string literal early and corrupt every following key. + var node = new NodeOptions + { + Role = "Central", + NodeHostname = "evil\"host", + RemotingPort = 8081, + }; + + var hocon = AkkaHostedService.BuildHocon( + node, DefaultCluster(), new[] { "Central" }, 5, 15); + + // Must still parse, and the keys after the hostname must remain intact. + var config = ConfigurationFactory.ParseString(hocon); + Assert.Equal("evil\"host", config.GetString("akka.remote.dot-netty.tcp.hostname")); + Assert.Equal(8081, config.GetInt("akka.remote.dot-netty.tcp.port")); + Assert.Equal(1, config.GetInt("akka.cluster.min-nr-of-members")); + } + + [Fact] + public void BuildHocon_StrategyWithWhitespace_RemainsQuoted() + { + var cluster = DefaultCluster(); + cluster.SplitBrainResolverStrategy = "keep oldest"; + var node = new NodeOptions + { + Role = "Central", + NodeHostname = "node1", + RemotingPort = 8081, + }; + + var hocon = AkkaHostedService.BuildHocon( + node, cluster, new[] { "Central" }, 5, 15); + + var config = ConfigurationFactory.ParseString(hocon); + Assert.Equal("keep oldest", config.GetString("akka.cluster.split-brain-resolver.active-strategy")); + } +} diff --git a/tests/ScadaLink.Host.Tests/LoggerConfigurationTests.cs b/tests/ScadaLink.Host.Tests/LoggerConfigurationTests.cs new file mode 100644 index 0000000..284f6a3 --- /dev/null +++ b/tests/ScadaLink.Host.Tests/LoggerConfigurationTests.cs @@ -0,0 +1,67 @@ +using Microsoft.Extensions.Configuration; +using Serilog.Events; + +namespace ScadaLink.Host.Tests; + +/// +/// Host-011: ScadaLink:Logging:MinimumLevel must actually drive the Serilog +/// minimum level. Previously the value was bound into +/// but never read, so editing it had no effect. +/// +public class LoggerConfigurationTests +{ + private static IConfiguration BuildConfig(string? minimumLevel) + { + var values = new Dictionary(); + if (minimumLevel != null) + values["ScadaLink:Logging:MinimumLevel"] = minimumLevel; + return new ConfigurationBuilder().AddInMemoryCollection(values).Build(); + } + + [Fact] + public void MinimumLevel_Warning_SuppressesInformationLogs() + { + var sink = new InMemorySink(); + var logger = LoggerConfigurationFactory + .Build(BuildConfig("Warning"), "Central", "central", "node1") + .WriteTo.Sink(sink) + .CreateLogger(); + + logger.Information("info message"); + logger.Warning("warning message"); + + Assert.Single(sink.LogEvents); + Assert.Equal(LogEventLevel.Warning, sink.LogEvents[0].Level); + } + + [Fact] + public void MinimumLevel_Debug_AllowsDebugLogs() + { + var sink = new InMemorySink(); + var logger = LoggerConfigurationFactory + .Build(BuildConfig("Debug"), "Site", "site-a", "node1") + .WriteTo.Sink(sink) + .CreateLogger(); + + logger.Debug("debug message"); + + Assert.Single(sink.LogEvents); + Assert.Equal(LogEventLevel.Debug, sink.LogEvents[0].Level); + } + + [Fact] + public void MinimumLevel_Absent_DefaultsToInformation() + { + var sink = new InMemorySink(); + var logger = LoggerConfigurationFactory + .Build(BuildConfig(null), "Central", "central", "node1") + .WriteTo.Sink(sink) + .CreateLogger(); + + logger.Debug("debug message"); + logger.Information("info message"); + + Assert.Single(sink.LogEvents); + Assert.Equal(LogEventLevel.Information, sink.LogEvents[0].Level); + } +} diff --git a/tests/ScadaLink.Host.Tests/StartupRetryTests.cs b/tests/ScadaLink.Host.Tests/StartupRetryTests.cs new file mode 100644 index 0000000..7454a03 --- /dev/null +++ b/tests/ScadaLink.Host.Tests/StartupRetryTests.cs @@ -0,0 +1,65 @@ +using Microsoft.Extensions.Logging.Abstractions; + +namespace ScadaLink.Host.Tests; + +/// +/// Host-010: startup preconditions (database migration) must tolerate a database +/// that is briefly unavailable at boot — common when an app container and its DB +/// container start together — via a bounded retry with backoff. +/// +public class StartupRetryTests +{ + [Fact] + public async Task ExecuteWithRetry_SucceedsFirstTry_RunsOnce() + { + var attempts = 0; + await StartupRetry.ExecuteWithRetryAsync( + "test-op", + () => { attempts++; return Task.CompletedTask; }, + maxAttempts: 5, + initialDelay: TimeSpan.FromMilliseconds(1), + NullLogger.Instance); + + Assert.Equal(1, attempts); + } + + [Fact] + public async Task ExecuteWithRetry_TransientFailures_RetriesUntilSuccess() + { + var attempts = 0; + await StartupRetry.ExecuteWithRetryAsync( + "test-op", + () => + { + attempts++; + if (attempts < 3) + throw new InvalidOperationException("db not ready"); + return Task.CompletedTask; + }, + maxAttempts: 5, + initialDelay: TimeSpan.FromMilliseconds(1), + NullLogger.Instance); + + Assert.Equal(3, attempts); + } + + [Fact] + public async Task ExecuteWithRetry_ExhaustsAttempts_RethrowsLastException() + { + var attempts = 0; + var ex = await Assert.ThrowsAsync(() => + StartupRetry.ExecuteWithRetryAsync( + "test-op", + () => + { + attempts++; + throw new InvalidOperationException($"failure {attempts}"); + }, + maxAttempts: 3, + initialDelay: TimeSpan.FromMilliseconds(1), + NullLogger.Instance)); + + Assert.Equal(3, attempts); + Assert.Equal("failure 3", ex.Message); + } +} diff --git a/tests/ScadaLink.Host.Tests/StartupValidatorTests.cs b/tests/ScadaLink.Host.Tests/StartupValidatorTests.cs index 3b14787..d2f0edd 100644 --- a/tests/ScadaLink.Host.Tests/StartupValidatorTests.cs +++ b/tests/ScadaLink.Host.Tests/StartupValidatorTests.cs @@ -20,7 +20,6 @@ public class StartupValidatorTests ["ScadaLink:Node:NodeHostname"] = "central-node1", ["ScadaLink:Node:RemotingPort"] = "8081", ["ScadaLink:Database:ConfigurationDb"] = "Server=localhost;Database=Config;", - ["ScadaLink:Database:MachineDataDb"] = "Server=localhost;Database=MachineData;", ["ScadaLink:Security:LdapServer"] = "ldap.example.com", ["ScadaLink:Security:JwtSigningKey"] = "test-signing-key-at-least-32-chars-long", ["ScadaLink:Cluster:SeedNodes:0"] = "akka.tcp://scadalink@central-node1:8081", @@ -151,14 +150,17 @@ public class StartupValidatorTests } [Fact] - public void Central_MissingMachineDataDb_FailsValidation() + public void Central_MissingMachineDataDb_PassesValidation() { + // Host-008 regression: MachineDataDb is never consumed anywhere in the + // system (only ConfigurationDb is wired into AddConfigurationDatabase). + // It is no longer a required key, so its absence must not fail startup. var values = ValidCentralConfig(); values.Remove("ScadaLink:Database:MachineDataDb"); var config = BuildConfig(values); - var ex = Assert.Throws(() => StartupValidator.Validate(config)); - Assert.Contains("MachineDataDb connection string required for Central", ex.Message); + var ex = Record.Exception(() => StartupValidator.Validate(config)); + Assert.Null(ex); } [Fact] @@ -310,6 +312,46 @@ public class StartupValidatorTests Assert.Null(ex); } + [Fact] + public void Site_GrpcPortEqualsRemotingPort_FailsValidation() + { + // Host-007 regression: REQ-HOST-4 requires GrpcPort to differ from + // RemotingPort. Identical values cause Kestrel and Akka.Remote to + // contend for the same port at runtime. + var values = ValidSiteConfig(); + values["ScadaLink:Node:RemotingPort"] = "8082"; + values["ScadaLink:Node:GrpcPort"] = "8082"; + var config = BuildConfig(values); + + var ex = Assert.Throws(() => StartupValidator.Validate(config)); + Assert.Contains("GrpcPort must differ from RemotingPort", ex.Message); + } + + [Fact] + public void Site_DefaultGrpcPortEqualsRemotingPort_FailsValidation() + { + // GrpcPort absent => NodeOptions default 8083. A site whose RemotingPort + // is also 8083 must still be rejected. + var values = ValidSiteConfig(); + values["ScadaLink:Node:RemotingPort"] = "8083"; + var config = BuildConfig(values); + + var ex = Assert.Throws(() => StartupValidator.Validate(config)); + Assert.Contains("GrpcPort must differ from RemotingPort", ex.Message); + } + + [Fact] + public void Site_GrpcPortDiffersFromRemotingPort_PassesValidation() + { + var values = ValidSiteConfig(); + values["ScadaLink:Node:RemotingPort"] = "8082"; + values["ScadaLink:Node:GrpcPort"] = "8083"; + var config = BuildConfig(values); + + var ex = Record.Exception(() => StartupValidator.Validate(config)); + Assert.Null(ex); + } + [Fact] public void MultipleErrors_AllReported() { diff --git a/tests/ScadaLink.Security.Tests/UnitTest1.cs b/tests/ScadaLink.Security.Tests/SecurityTests.cs similarity index 100% rename from tests/ScadaLink.Security.Tests/UnitTest1.cs rename to tests/ScadaLink.Security.Tests/SecurityTests.cs