diff --git a/code-reviews/Host/findings.md b/code-reviews/Host/findings.md
index 0d78d15..567e5bc 100644
--- a/code-reviews/Host/findings.md
+++ b/code-reviews/Host/findings.md
@@ -8,7 +8,7 @@
| Last reviewed | 2026-05-16 |
| Reviewer | claude-agent |
| Commit reviewed | `9c60592` |
-| Open findings | 8 |
+| Open findings | 1 |
## Summary
@@ -250,7 +250,7 @@ appropriately before the fix and pass after.
|--|--|
| Severity | Low |
| Category | Concurrency & thread safety |
-| Status | Open |
+| Status | Resolved |
| Location | `src/ScadaLink.Host/Actors/AkkaHostedService.cs:345` |
**Description**
@@ -270,7 +270,20 @@ storeAndForwardService.StartAsync(cancellationToken)`. Propagate the
**Resolution**
-_Unresolved._
+Resolved 2026-05-16 (commit `pending`). Root cause confirmed: `RegisterSiteActors`
+called `storeAndForwardService.StartAsync().GetAwaiter().GetResult()` synchronously
+inside the `IHostedService.StartAsync` path, blocking a startup thread and wrapping
+any failure in `AggregateException`. Fix: `AkkaHostedService.StartAsync` is now
+genuinely `async Task`; the site path was renamed `RegisterSiteActorsAsync` and made
+`async`; the store-and-forward init is now `await storeAndForwardService.StartAsync()`
+(the service's own signature, in another module, takes no `CancellationToken`, so the
+token is honoured via a `cancellationToken.ThrowIfCancellationRequested()` checkpoint
+immediately before the await). No blocking, no sync-context deadlock risk, and the
+original exception type now surfaces. No dedicated regression test: the only
+observable change is non-blocking/exception-unwrapping behaviour on a path that
+requires the full site DI graph plus a real SQLite store; the existing
+`HostStartupTests.SiteRole_*` and `CompositionRootTests` continue to exercise the
+async startup pipeline and remain green (175 passed).
### Host-006 — HOCON assembled by unescaped string interpolation
@@ -278,7 +291,7 @@ _Unresolved._
|--|--|
| Severity | Low |
| Category | Akka.NET conventions |
-| Status | Open |
+| Status | Resolved |
| Location | `src/ScadaLink.Host/Actors/AkkaHostedService.cs:70-108` |
**Description**
@@ -302,7 +315,25 @@ and seed nodes) before substitution.
**Resolution**
-_Unresolved._
+Resolved 2026-05-16 (commit `pending`). Root cause confirmed: the Akka HOCON was
+built by an interpolated string injecting `NodeHostname`, `SeedNodes`, roles and
+`SplitBrainResolverStrategy` directly into the document text with no escaping — a
+value containing a quote, backslash or whitespace would corrupt or misparse the
+document. Re-triaged the recommendation: switching wholesale to the typed
+`Akka.Hosting` builder is a larger refactor than this Low finding warrants and would
+touch how the whole cluster is bootstrapped, so the HOCON form was retained but made
+safe. Fix: the HOCON construction was extracted into a testable static
+`AkkaHostedService.BuildHocon`, and every interpolated *string* value is now routed
+through a `QuoteHocon` helper that wraps the value in a double-quoted HOCON literal
+and escapes embedded backslashes and quotes — `NodeHostname`, every seed node, every
+role, and `SplitBrainResolverStrategy` (the last was previously unquoted entirely, so
+whitespace alone would break it). Regression tests in new `HoconBuilderTests`:
+`BuildHocon_HostnameWithQuote_DoesNotCorruptDocument` (a hostname with a `"` still
+parses and following keys stay intact), `BuildHocon_StrategyWithWhitespace_RemainsQuoted`,
+and `BuildHocon_PlainValues_ParsesAndPreservesValues`. With the old raw interpolation
+a quote-containing hostname produced `hostname = "evil"host"`, closing the literal
+early and corrupting following keys; the escaped form parses correctly. Full Host
+suite green (175 passed).
### Host-007 — REQ-HOST-4 rule "GrpcPort ≠ RemotingPort" is not enforced
@@ -310,7 +341,7 @@ _Unresolved._
|--|--|
| Severity | Low |
| Category | Design-document adherence |
-| Status | Open |
+| Status | Resolved |
| Location | `src/ScadaLink.Host/StartupValidator.cs:43-47` |
**Description**
@@ -332,7 +363,18 @@ Add a check in the `role == "Site"` block: if `GrpcPort` (resolved, including th
**Resolution**
-_Unresolved._
+Resolved 2026-05-16 (commit `pending`). Root cause confirmed against
+`StartupValidator.cs`: the `role == "Site"` block validated the GrpcPort range but
+never compared it to `RemotingPort`, so a site config setting both ports equal
+passed validation and then failed opaquely when Kestrel and Akka.Remote both bound
+the port. Fix: added `if (port == grpcPort) errors.Add("ScadaLink:Node:GrpcPort must
+differ from RemotingPort")` in the Site block, using the resolved GrpcPort (including
+the 8083 `NodeOptions` default when the key is absent) against the parsed
+RemotingPort. Regression tests in `StartupValidatorTests`:
+`Site_GrpcPortEqualsRemotingPort_FailsValidation`,
+`Site_DefaultGrpcPortEqualsRemotingPort_FailsValidation` (default-8083 path), and
+`Site_GrpcPortDiffersFromRemotingPort_PassesValidation` — all failed appropriately
+before the fix and pass after. Full Host suite green (175 passed).
### Host-008 — `MachineDataDb` is validated and declared but never consumed
@@ -340,7 +382,7 @@ _Unresolved._
|--|--|
| Severity | Low |
| Category | Code organization & conventions |
-| Status | Open |
+| Status | Resolved |
| Location | `src/ScadaLink.Host/StartupValidator.cs:33-34`, `src/ScadaLink.Host/DatabaseOptions.cs:6` |
**Description**
@@ -361,7 +403,22 @@ rule, the `DatabaseOptions` property, and the key from `appsettings.Central.json
**Resolution**
-_Unresolved._
+Resolved 2026-05-16 (commit `pending`). Root cause confirmed: a repo-wide search
+shows `MachineDataDb` is read nowhere outside the Host module — only `ConfigurationDb`
+is wired into `AddConfigurationDatabase` — yet `StartupValidator` failed Central
+startup when it was absent. No machine-data store exists or is registered, so this is
+dead configuration. Fix (the "if no" branch of the recommendation): removed the
+`MachineDataDb` validation rule from `StartupValidator`, the `MachineDataDb` property
+from `DatabaseOptions`, the `MachineDataDb` key from `appsettings.Central.json`, and
+the now-pointless `MachineDataDb` reference from that file's `_secrets` note and from
+the `CentralDbTestEnvironment` test helper. Regression test: the prior
+`StartupValidatorTests.Central_MissingMachineDataDb_FailsValidation` was inverted to
+`Central_MissingMachineDataDb_PassesValidation` (absence must now NOT fail startup),
+and `MachineDataDb` was removed from the shared `ValidCentralConfig` fixture so all
+Central tests prove the key is no longer required; it failed under the old code and
+passes after. Full Host suite green (175 passed). Note: `appsettings.Central.json`
+under `docker/central-node-*` is outside this task's edit scope and still carries the
+key harmlessly — it will simply be ignored; a follow-up may remove it for tidiness.
### Host-009 — `StartAsync` reports success before role actors are confirmed running
@@ -369,7 +426,7 @@ _Unresolved._
|--|--|
| Severity | Low |
| Category | Akka.NET conventions |
-| Status | Open |
+| Status | Resolved |
| Location | `src/ScadaLink.Host/Actors/AkkaHostedService.cs:127-141` |
**Description**
@@ -395,7 +452,24 @@ actor system exists, not that the cluster handshake has completed.
**Resolution**
-_Unresolved._
+Resolved 2026-05-16 (commit `pending`) — re-triaged, documentation fix. Root cause
+confirmed: `SetReady` is called optimistically after `ActorOf` registrations whose
+`PreStart`/initial-contact handshakes run asynchronously. Re-triaged the first
+recommended option (gating `SetReady` on a `SiteCommunicationActor` `Ask`
+round-trip): that option is **incorrect** because the meaningful asynchronous step is
+the `ClusterClient` initial-contact handshake with central — gating site gRPC
+readiness on it would prevent a site from coming up and streaming locally while
+central is briefly unreachable, contradicting the design's site-autonomy model.
+Confirmed `SiteStreamGrpcServer` already rejects any stream opened before `SetReady`
+with `StatusCode.Unavailable`, so the window is benign. Fix (the second, correct
+option): the `SetReady` call site now carries an explicit comment documenting the
+narrow readiness contract — by `SetReady` the actor system exists,
+`SiteStreamManager.Initialize` has run, and every role actor has been created with
+ordered synchronous `ActorOf` + registration `Tell`s; what is deliberately NOT
+guaranteed is `PreStart` completion or the central handshake. No regression test is
+meaningful for a documentation-only clarification of an intentional design contract;
+the existing `HostStartupTests.SiteRole_*` cover the surrounding startup path. Full
+Host suite green (175 passed).
### Host-010 — No retry/backoff around startup preconditions (DB migration, readiness)
@@ -403,7 +477,7 @@ _Unresolved._
|--|--|
| Severity | Low |
| Category | Error handling & resilience |
-| Status | Open |
+| Status | Resolved |
| Location | `src/ScadaLink.Host/Program.cs:112-125` |
**Description**
@@ -425,7 +499,22 @@ reports `503` until the database becomes reachable.
**Resolution**
-_Unresolved._
+Resolved 2026-05-16 (commit `pending`). Root cause confirmed: Central startup called
+`MigrationHelper.ApplyOrValidateMigrationsAsync` directly with no tolerance for a
+database that is briefly unreachable at boot — a first connection failure threw, the
+top-level `catch` logged `Fatal`, and the process exited. Fix: added a new
+`StartupRetry.ExecuteWithRetryAsync` helper (bounded retry with capped exponential
+backoff — no Polly dependency, keeping the Host's package set minimal) and wrapped
+the migration step in it (`maxAttempts: 8`, `initialDelay: 2s`, doubling, capped at
+30s). Each retry opens a fresh DI scope so a transient connection fault does not
+leave a poisoned `DbContext`; once attempts are exhausted the last exception
+propagates and the existing fatal-log path still fires. Regression tests in new
+`StartupRetryTests`: `ExecuteWithRetry_SucceedsFirstTry_RunsOnce`,
+`ExecuteWithRetry_TransientFailures_RetriesUntilSuccess` (fails twice then succeeds),
+and `ExecuteWithRetry_ExhaustsAttempts_RethrowsLastException` (verifies the original
+exception type/message surfaces after exhaustion) — they fail to compile/pass against
+the pre-fix code (the helper did not exist) and pass after. Full Host suite green
+(175 passed).
### Host-011 — `LoggingOptions.MinimumLevel` is dead configuration
@@ -433,7 +522,7 @@ _Unresolved._
|--|--|
| Severity | Low |
| Category | Code organization & conventions |
-| Status | Open |
+| Status | Resolved |
| Location | `src/ScadaLink.Host/LoggingOptions.cs:5`, `src/ScadaLink.Host/Program.cs:42-50` |
**Description**
@@ -456,4 +545,21 @@ configuration section. Keep one mechanism, not two.
**Resolution**
-_Unresolved._
+Resolved 2026-05-16 (commit `pending`). Root cause confirmed: `LoggingOptions.MinimumLevel`
+was bound from `ScadaLink:Logging` but Serilog was configured purely via
+`ReadFrom.Configuration` (standard `Serilog` section), so editing
+`ScadaLink:Logging:MinimumLevel` had no effect. Re-triaged the two options: the
+"remove the option class" branch was rejected because `Component-Host.md` REQ-HOST-3
+explicitly lists `ScadaLink:Logging` → `LoggingOptions` and that design doc is outside
+this task's edit scope — removing the class would create a fresh code-vs-doc drift.
+Fix (the "consume it" branch): added a `LoggerConfigurationFactory.Build` that binds
+`LoggingOptions`, parses `MinimumLevel` into a Serilog `LogEventLevel` (falling back
+to `Information` for null/blank/unrecognised values), and pins it via
+`MinimumLevel.Is(...)` on top of `ReadFrom.Configuration`; `Program.cs` now builds the
+logger through this factory. `ScadaLink:Logging:MinimumLevel` is now live.
+Regression tests in new `LoggerConfigurationTests`:
+`MinimumLevel_Warning_SuppressesInformationLogs`,
+`MinimumLevel_Debug_AllowsDebugLogs`, and `MinimumLevel_Absent_DefaultsToInformation`
+— they assert the configured level actually filters log events; they pass against the
+new factory (and would fail against the old inline configuration, which ignored the
+key). Full Host suite green (175 passed).
diff --git a/src/ScadaLink.Host/Actors/AkkaHostedService.cs b/src/ScadaLink.Host/Actors/AkkaHostedService.cs
index cb9865d..9f313e3 100644
--- a/src/ScadaLink.Host/Actors/AkkaHostedService.cs
+++ b/src/ScadaLink.Host/Actors/AkkaHostedService.cs
@@ -54,58 +54,20 @@ public class AkkaHostedService : IHostedService
///
public ActorSystem? ActorSystem => _actorSystem;
- public Task StartAsync(CancellationToken cancellationToken)
+ public async Task StartAsync(CancellationToken cancellationToken)
{
- var seedNodesStr = string.Join(",",
- _clusterOptions.SeedNodes.Select(s => $"\"{s}\""));
-
// For site nodes, include a site-specific role (e.g., "site-SiteA") alongside the base role
var roles = BuildRoles();
- var rolesStr = string.Join(",", roles.Select(r => $"\"{r}\""));
// WP-3: Transport heartbeat explicitly configured from CommunicationOptions (not framework defaults)
var transportHeartbeatSec = _communicationOptions.TransportHeartbeatInterval.TotalSeconds;
var transportFailureSec = _communicationOptions.TransportFailureThreshold.TotalSeconds;
- var hocon = $@"
-akka {{
- extensions = [
- ""Akka.Cluster.Tools.PublishSubscribe.DistributedPubSubExtensionProvider, Akka.Cluster.Tools""
- ]
- actor {{
- provider = cluster
- }}
- remote {{
- dot-netty.tcp {{
- hostname = ""{_nodeOptions.NodeHostname}""
- port = {_nodeOptions.RemotingPort}
- }}
- transport-failure-detector {{
- heartbeat-interval = {transportHeartbeatSec:F0}s
- acceptable-heartbeat-pause = {transportFailureSec:F0}s
- }}
- }}
- cluster {{
- seed-nodes = [{seedNodesStr}]
- roles = [{rolesStr}]
- min-nr-of-members = {_clusterOptions.MinNrOfMembers}
- split-brain-resolver {{
- active-strategy = {_clusterOptions.SplitBrainResolverStrategy}
- stable-after = {_clusterOptions.StableAfter.TotalSeconds:F0}s
- keep-oldest {{
- down-if-alone = on
- }}
- }}
- failure-detector {{
- heartbeat-interval = {_clusterOptions.HeartbeatInterval.TotalSeconds:F0}s
- acceptable-heartbeat-pause = {_clusterOptions.FailureDetectionThreshold.TotalSeconds:F0}s
- }}
- run-coordinated-shutdown-when-down = on
- }}
- coordinated-shutdown {{
- run-by-clr-shutdown-hook = on
- }}
-}}";
+ // Host-006: HOCON is assembled in a dedicated builder that quotes/escapes every
+ // interpolated value, so a hostname, seed node or strategy containing a quote,
+ // backslash or whitespace cannot corrupt the configuration document.
+ var hocon = BuildHocon(_nodeOptions, _clusterOptions, roles,
+ transportHeartbeatSec, transportFailureSec);
var config = ConfigurationFactory.ParseString(hocon);
_actorSystem = ActorSystem.Create("scadalink", config);
@@ -135,10 +97,78 @@ akka {{
}
else if (_nodeOptions.Role.Equals("Site", StringComparison.OrdinalIgnoreCase))
{
- RegisterSiteActors();
+ await RegisterSiteActorsAsync(cancellationToken);
}
+ }
- return Task.CompletedTask;
+ ///
+ /// Builds the Akka HOCON configuration document. Every interpolated value is
+ /// routed through (string values) so a hostname,
+ /// seed-node URI, role or split-brain strategy containing a quote, backslash or
+ /// whitespace cannot corrupt the document or be silently misparsed (Host-006).
+ ///
+ public static string BuildHocon(
+ NodeOptions nodeOptions,
+ ClusterOptions clusterOptions,
+ IEnumerable roles,
+ double transportHeartbeatSec,
+ double transportFailureSec)
+ {
+ var seedNodesStr = string.Join(",",
+ clusterOptions.SeedNodes.Select(QuoteHocon));
+ var rolesStr = string.Join(",", roles.Select(QuoteHocon));
+
+ return $@"
+akka {{
+ extensions = [
+ ""Akka.Cluster.Tools.PublishSubscribe.DistributedPubSubExtensionProvider, Akka.Cluster.Tools""
+ ]
+ actor {{
+ provider = cluster
+ }}
+ remote {{
+ dot-netty.tcp {{
+ hostname = {QuoteHocon(nodeOptions.NodeHostname)}
+ port = {nodeOptions.RemotingPort}
+ }}
+ transport-failure-detector {{
+ heartbeat-interval = {transportHeartbeatSec:F0}s
+ acceptable-heartbeat-pause = {transportFailureSec:F0}s
+ }}
+ }}
+ cluster {{
+ seed-nodes = [{seedNodesStr}]
+ roles = [{rolesStr}]
+ min-nr-of-members = {clusterOptions.MinNrOfMembers}
+ split-brain-resolver {{
+ active-strategy = {QuoteHocon(clusterOptions.SplitBrainResolverStrategy)}
+ stable-after = {clusterOptions.StableAfter.TotalSeconds:F0}s
+ keep-oldest {{
+ down-if-alone = on
+ }}
+ }}
+ failure-detector {{
+ heartbeat-interval = {clusterOptions.HeartbeatInterval.TotalSeconds:F0}s
+ acceptable-heartbeat-pause = {clusterOptions.FailureDetectionThreshold.TotalSeconds:F0}s
+ }}
+ run-coordinated-shutdown-when-down = on
+ }}
+ coordinated-shutdown {{
+ run-by-clr-shutdown-hook = on
+ }}
+}}";
+ }
+
+ ///
+ /// Renders a value as a HOCON double-quoted string, escaping backslashes and
+ /// double quotes so the resulting token cannot break out of its string literal.
+ ///
+ private static string QuoteHocon(string? value)
+ {
+ var escaped = (value ?? string.Empty)
+ .Replace("\\", "\\\\")
+ .Replace("\"", "\\\"");
+ return $"\"{escaped}\"";
}
public async Task StopAsync(CancellationToken cancellationToken)
@@ -218,7 +248,7 @@ akka {{
/// The singleton is scoped to the site-specific cluster role so it runs on exactly
/// one node within this site's cluster.
///
- private void RegisterSiteActors()
+ private async Task RegisterSiteActorsAsync(CancellationToken cancellationToken)
{
var siteRole = $"site-{_nodeOptions.SiteId}";
var storage = _serviceProvider.GetRequiredService();
@@ -341,8 +371,11 @@ akka {{
if (storeAndForwardService != null)
{
// Initialize SQLite schema and start the retry timer. Must complete before
- // any actor or HTTP handler touches the service.
- storeAndForwardService.StartAsync().GetAwaiter().GetResult();
+ // any actor or HTTP handler touches the service. Host-005: awaited rather
+ // than blocked via GetAwaiter().GetResult() — no thread-pool starvation /
+ // sync-context deadlock risk, and exceptions surface as their original type.
+ cancellationToken.ThrowIfCancellationRequested();
+ await storeAndForwardService.StartAsync();
// Register the store-and-forward delivery handlers so buffered
// ExternalSystem calls, cached DB writes and notifications are actually
@@ -413,7 +446,22 @@ akka {{
contacts.Count, _nodeOptions.SiteId);
}
- // Gate gRPC subscriptions until the actor system and SiteStreamManager are initialized
+ // Gate gRPC subscriptions until the actor system and SiteStreamManager are
+ // initialized (REQ-HOST-7).
+ //
+ // Host-009: SetReady asserts a deliberately narrow contract. By this point the
+ // actor system exists, SiteStreamManager.Initialize has run, and every
+ // role actor (SiteCommunicationActor, deployment-manager singleton,
+ // SiteReplicationActor, the ClusterClient) has been created with ActorOf —
+ // creation and the registration Tells are synchronous and strictly ordered.
+ // What is NOT guaranteed is completion of each actor's PreStart or the
+ // ClusterClient's initial-contact handshake with central: those are
+ // intentionally asynchronous. Gating readiness on the central handshake would
+ // be wrong — a site must come up and stream locally even while central is
+ // briefly unreachable. gRPC readiness therefore guarantees "the site actor
+ // graph exists and can accept subscription streams", not "the cluster
+ // handshake has completed". Streams opened before SetReady are already
+ // rejected by SiteStreamGrpcServer with StatusCode.Unavailable.
var grpcServer = _serviceProvider.GetService();
grpcServer?.SetReady(_actorSystem!);
}
diff --git a/src/ScadaLink.Host/DatabaseOptions.cs b/src/ScadaLink.Host/DatabaseOptions.cs
index 12c3048..51818d6 100644
--- a/src/ScadaLink.Host/DatabaseOptions.cs
+++ b/src/ScadaLink.Host/DatabaseOptions.cs
@@ -3,6 +3,5 @@ namespace ScadaLink.Host;
public class DatabaseOptions
{
public string? ConfigurationDb { get; set; }
- public string? MachineDataDb { get; set; }
public string? SiteDbPath { get; set; }
}
diff --git a/src/ScadaLink.Host/LoggerConfigurationFactory.cs b/src/ScadaLink.Host/LoggerConfigurationFactory.cs
new file mode 100644
index 0000000..ee87825
--- /dev/null
+++ b/src/ScadaLink.Host/LoggerConfigurationFactory.cs
@@ -0,0 +1,48 @@
+using Serilog;
+using Serilog.Events;
+
+namespace ScadaLink.Host;
+
+///
+/// Builds the Serilog for the Host process.
+///
+/// REQ-HOST-8 / Host-011: the configured minimum level comes from
+/// ScadaLink:Logging:MinimumLevel (bound to ) so an
+/// operator editing that key changes the effective log level. The standard
+/// Serilog configuration section is still read (via
+/// )
+/// for sink/override customisation; the explicit MinimumLevel.Is below pins
+/// the floor from .
+///
+public static class LoggerConfigurationFactory
+{
+ public static LoggerConfiguration Build(
+ IConfiguration configuration,
+ string nodeRole,
+ string siteId,
+ string nodeHostname)
+ {
+ var loggingOptions = new LoggingOptions();
+ configuration.GetSection("ScadaLink:Logging").Bind(loggingOptions);
+
+ var minimumLevel = ParseLevel(loggingOptions.MinimumLevel);
+
+ return new LoggerConfiguration()
+ .ReadFrom.Configuration(configuration)
+ .MinimumLevel.Is(minimumLevel)
+ .Enrich.WithProperty("SiteId", siteId)
+ .Enrich.WithProperty("NodeHostname", nodeHostname)
+ .Enrich.WithProperty("NodeRole", nodeRole);
+ }
+
+ ///
+ /// Parses a Serilog name, falling back to
+ /// for null/blank/unrecognised values.
+ ///
+ private static LogEventLevel ParseLevel(string? level)
+ {
+ return Enum.TryParse(level, ignoreCase: true, out var parsed)
+ ? parsed
+ : LogEventLevel.Information;
+ }
+}
diff --git a/src/ScadaLink.Host/Program.cs b/src/ScadaLink.Host/Program.cs
index 9eca495..a54803c 100644
--- a/src/ScadaLink.Host/Program.cs
+++ b/src/ScadaLink.Host/Program.cs
@@ -38,12 +38,10 @@ var nodeRole = configuration["ScadaLink:Node:Role"]!;
var nodeHostname = configuration["ScadaLink:Node:NodeHostname"] ?? "unknown";
var siteId = configuration["ScadaLink:Node:SiteId"] ?? "central";
-// WP-14: Serilog structured logging
-Log.Logger = new LoggerConfiguration()
- .ReadFrom.Configuration(configuration)
- .Enrich.WithProperty("SiteId", siteId)
- .Enrich.WithProperty("NodeHostname", nodeHostname)
- .Enrich.WithProperty("NodeRole", nodeRole)
+// WP-14: Serilog structured logging.
+// Host-011: minimum level is driven by ScadaLink:Logging:MinimumLevel (LoggingOptions).
+Log.Logger = ScadaLink.Host.LoggerConfigurationFactory
+ .Build(configuration, nodeRole, siteId, nodeHostname)
.WriteTo.Console(outputTemplate:
"[{Timestamp:HH:mm:ss} {Level:u3}] [{NodeRole}/{NodeHostname}] {Message:lj}{NewLine}{Exception}")
.WriteTo.File("logs/scadalink-.log", rollingInterval: Serilog.RollingInterval.Day)
@@ -116,14 +114,24 @@ try
{
var isDevelopment = app.Environment.IsDevelopment()
|| string.Equals(Environment.GetEnvironmentVariable("ASPNETCORE_ENVIRONMENT"), "Development", StringComparison.OrdinalIgnoreCase);
- using (var scope = app.Services.CreateScope())
- {
- var dbContext = scope.ServiceProvider.GetRequiredService();
- var migrationLogger = scope.ServiceProvider
- .GetRequiredService()
- .CreateLogger(typeof(MigrationHelper).FullName!);
- await MigrationHelper.ApplyOrValidateMigrationsAsync(dbContext, isDevelopment, migrationLogger);
- }
+ var migrationLogger = app.Services
+ .GetRequiredService()
+ .CreateLogger(typeof(MigrationHelper).FullName!);
+
+ // Host-010: tolerate a database that is briefly unreachable at boot
+ // (e.g. app and DB containers starting together) with a bounded
+ // exponential backoff before failing fatally.
+ await StartupRetry.ExecuteWithRetryAsync(
+ "database-migration",
+ async () =>
+ {
+ using var scope = app.Services.CreateScope();
+ var dbContext = scope.ServiceProvider.GetRequiredService();
+ await MigrationHelper.ApplyOrValidateMigrationsAsync(dbContext, isDevelopment, migrationLogger);
+ },
+ maxAttempts: 8,
+ initialDelay: TimeSpan.FromSeconds(2),
+ migrationLogger);
}
// Middleware pipeline
diff --git a/src/ScadaLink.Host/StartupRetry.cs b/src/ScadaLink.Host/StartupRetry.cs
new file mode 100644
index 0000000..3e9e710
--- /dev/null
+++ b/src/ScadaLink.Host/StartupRetry.cs
@@ -0,0 +1,48 @@
+namespace ScadaLink.Host;
+
+///
+/// Bounded retry-with-backoff for startup preconditions.
+///
+/// Host-010 / REQ-HOST-4a: a Central node applies/validates database migrations
+/// before the host begins serving traffic. In container orchestration the database
+/// and the app frequently start together, so the database may be briefly
+/// unreachable. Rather than crashing the process on the first connection failure,
+/// the migration step is wrapped in this bounded exponential backoff: it tolerates a
+/// short outage and only fails fatally once attempts are exhausted.
+///
+public static class StartupRetry
+{
+ public static async Task ExecuteWithRetryAsync(
+ string operationName,
+ Func operation,
+ int maxAttempts,
+ TimeSpan initialDelay,
+ ILogger logger,
+ CancellationToken cancellationToken = default)
+ {
+ var delay = initialDelay;
+ for (var attempt = 1; ; attempt++)
+ {
+ cancellationToken.ThrowIfCancellationRequested();
+ try
+ {
+ await operation();
+ if (attempt > 1)
+ logger.LogInformation(
+ "Startup operation '{Operation}' succeeded on attempt {Attempt}.",
+ operationName, attempt);
+ return;
+ }
+ catch (Exception ex) when (attempt < maxAttempts)
+ {
+ logger.LogWarning(ex,
+ "Startup operation '{Operation}' failed on attempt {Attempt}/{MaxAttempts}; " +
+ "retrying in {Delay}.",
+ operationName, attempt, maxAttempts, delay);
+ await Task.Delay(delay, cancellationToken);
+ // Exponential backoff, capped so the total wait stays bounded.
+ delay = TimeSpan.FromTicks(Math.Min(delay.Ticks * 2, TimeSpan.FromSeconds(30).Ticks));
+ }
+ }
+ }
+}
diff --git a/src/ScadaLink.Host/StartupValidator.cs b/src/ScadaLink.Host/StartupValidator.cs
index 1c3c08b..0f57880 100644
--- a/src/ScadaLink.Host/StartupValidator.cs
+++ b/src/ScadaLink.Host/StartupValidator.cs
@@ -30,8 +30,6 @@ public static class StartupValidator
var dbSection = configuration.GetSection("ScadaLink:Database");
if (string.IsNullOrEmpty(dbSection["ConfigurationDb"]))
errors.Add("ScadaLink:Database:ConfigurationDb connection string required for Central");
- if (string.IsNullOrEmpty(dbSection["MachineDataDb"]))
- errors.Add("ScadaLink:Database:MachineDataDb connection string required for Central");
var secSection = configuration.GetSection("ScadaLink:Security");
if (string.IsNullOrEmpty(secSection["LdapServer"]))
@@ -51,6 +49,13 @@ public static class StartupValidator
if (grpcPortStr != null && (!int.TryParse(grpcPortStr, out grpcPort) || grpcPort < 1 || grpcPort > 65535))
errors.Add("ScadaLink:Node:GrpcPort must be 1-65535");
+ // Host-007 / REQ-HOST-4: the gRPC (Kestrel HTTP/2) port and the Akka
+ // remoting port must differ. Identical values make Kestrel and
+ // Akka.Remote contend for the same TCP port and fail opaquely at
+ // runtime. Uses the resolved GrpcPort, including the 8083 default.
+ if (port == grpcPort)
+ errors.Add("ScadaLink:Node:GrpcPort must differ from RemotingPort");
+
var dbSection = configuration.GetSection("ScadaLink:Database");
if (string.IsNullOrEmpty(dbSection["SiteDbPath"]))
errors.Add("ScadaLink:Database:SiteDbPath required for Site nodes");
diff --git a/src/ScadaLink.Host/appsettings.Central.json b/src/ScadaLink.Host/appsettings.Central.json
index a38dd24..3f05f84 100644
--- a/src/ScadaLink.Host/appsettings.Central.json
+++ b/src/ScadaLink.Host/appsettings.Central.json
@@ -16,10 +16,9 @@
"FailureDetectionThreshold": "00:00:10",
"MinNrOfMembers": 1
},
- "_secrets": "Host-003: Secrets are NOT committed in this file. Supply them via environment variables, which the Host's configuration builder (AddEnvironmentVariables) overlays over this file. Required: ScadaLink__Database__ConfigurationDb, ScadaLink__Database__MachineDataDb, ScadaLink__Security__LdapServiceAccountPassword, ScadaLink__Security__JwtSigningKey. The ${...} placeholders below are intentionally non-functional and must be overridden per environment.",
+ "_secrets": "Host-003: Secrets are NOT committed in this file. Supply them via environment variables, which the Host's configuration builder (AddEnvironmentVariables) overlays over this file. Required: ScadaLink__Database__ConfigurationDb, ScadaLink__Security__LdapServiceAccountPassword, ScadaLink__Security__JwtSigningKey. The ${...} placeholders below are intentionally non-functional and must be overridden per environment.",
"Database": {
- "ConfigurationDb": "${SCADALINK_CONFIGURATIONDB_CONNECTION_STRING}",
- "MachineDataDb": "${SCADALINK_MACHINEDATADB_CONNECTION_STRING}"
+ "ConfigurationDb": "${SCADALINK_CONFIGURATIONDB_CONNECTION_STRING}"
},
"Security": {
"LdapServer": "localhost",
diff --git a/tests/ScadaLink.Host.Tests/CentralDbTestEnvironment.cs b/tests/ScadaLink.Host.Tests/CentralDbTestEnvironment.cs
index 5077c4b..e79690c 100644
--- a/tests/ScadaLink.Host.Tests/CentralDbTestEnvironment.cs
+++ b/tests/ScadaLink.Host.Tests/CentralDbTestEnvironment.cs
@@ -4,11 +4,11 @@ namespace ScadaLink.Host.Tests;
/// Host-003: appsettings.Central.json no longer commits database connection
/// strings — they are externalised to environment variables. Tests that exercise the
/// full Program startup pipeline against the real SQL provider must therefore
-/// supply the local dev connection strings the way a deployment would: via
-/// environment variables (Program's configuration builder calls
+/// supply the local dev connection string the way a deployment would: via an
+/// environment variable (Program's configuration builder calls
/// AddEnvironmentVariables()).
///
-/// Dispose restores the previous values so tests stay isolated.
+/// Dispose restores the previous value so tests stay isolated.
///
internal sealed class CentralDbTestEnvironment : IDisposable
{
@@ -16,26 +16,19 @@ internal sealed class CentralDbTestEnvironment : IDisposable
// This is a test fixture value, not a committed production secret.
private const string ConfigurationDb =
"Server=localhost,1433;Database=ScadaLinkConfig;User Id=scadalink_app;Password=ScadaLink_Dev1#;TrustServerCertificate=true";
- private const string MachineDataDb =
- "Server=localhost,1433;Database=ScadaLinkMachineData;User Id=scadalink_app;Password=ScadaLink_Dev1#;TrustServerCertificate=true";
private const string ConfigKey = "ScadaLink__Database__ConfigurationDb";
- private const string MachineKey = "ScadaLink__Database__MachineDataDb";
private readonly string? _previousConfig;
- private readonly string? _previousMachine;
public CentralDbTestEnvironment()
{
_previousConfig = Environment.GetEnvironmentVariable(ConfigKey);
- _previousMachine = Environment.GetEnvironmentVariable(MachineKey);
Environment.SetEnvironmentVariable(ConfigKey, ConfigurationDb);
- Environment.SetEnvironmentVariable(MachineKey, MachineDataDb);
}
public void Dispose()
{
Environment.SetEnvironmentVariable(ConfigKey, _previousConfig);
- Environment.SetEnvironmentVariable(MachineKey, _previousMachine);
}
}
diff --git a/tests/ScadaLink.Host.Tests/HoconBuilderTests.cs b/tests/ScadaLink.Host.Tests/HoconBuilderTests.cs
new file mode 100644
index 0000000..df9468b
--- /dev/null
+++ b/tests/ScadaLink.Host.Tests/HoconBuilderTests.cs
@@ -0,0 +1,87 @@
+using Akka.Configuration;
+using ScadaLink.ClusterInfrastructure;
+using ScadaLink.Host.Actors;
+
+namespace ScadaLink.Host.Tests;
+
+///
+/// Host-006: the Akka HOCON document must be assembled with every interpolated value
+/// quoted/escaped, so a hostname, seed-node URI or split-brain strategy containing a
+/// quote, backslash or whitespace cannot corrupt the document or be silently
+/// misparsed.
+///
+public class HoconBuilderTests
+{
+ private static ClusterOptions DefaultCluster() => new()
+ {
+ SeedNodes = new List
+ {
+ "akka.tcp://scadalink@localhost:8081",
+ "akka.tcp://scadalink@localhost:8082",
+ },
+ SplitBrainResolverStrategy = "keep-oldest",
+ MinNrOfMembers = 1,
+ StableAfter = TimeSpan.FromSeconds(15),
+ HeartbeatInterval = TimeSpan.FromSeconds(2),
+ FailureDetectionThreshold = TimeSpan.FromSeconds(10),
+ };
+
+ [Fact]
+ public void BuildHocon_PlainValues_ParsesAndPreservesValues()
+ {
+ var node = new NodeOptions
+ {
+ Role = "Central",
+ NodeHostname = "central-node1",
+ RemotingPort = 8081,
+ };
+
+ var hocon = AkkaHostedService.BuildHocon(
+ node, DefaultCluster(), new[] { "Central" }, 5, 15);
+
+ var config = ConfigurationFactory.ParseString(hocon);
+ Assert.Equal("central-node1", config.GetString("akka.remote.dot-netty.tcp.hostname"));
+ Assert.Equal("keep-oldest", config.GetString("akka.cluster.split-brain-resolver.active-strategy"));
+ }
+
+ [Fact]
+ public void BuildHocon_HostnameWithQuote_DoesNotCorruptDocument()
+ {
+ // A hostname containing a double quote would, with raw interpolation, close
+ // the HOCON string literal early and corrupt every following key.
+ var node = new NodeOptions
+ {
+ Role = "Central",
+ NodeHostname = "evil\"host",
+ RemotingPort = 8081,
+ };
+
+ var hocon = AkkaHostedService.BuildHocon(
+ node, DefaultCluster(), new[] { "Central" }, 5, 15);
+
+ // Must still parse, and the keys after the hostname must remain intact.
+ var config = ConfigurationFactory.ParseString(hocon);
+ Assert.Equal("evil\"host", config.GetString("akka.remote.dot-netty.tcp.hostname"));
+ Assert.Equal(8081, config.GetInt("akka.remote.dot-netty.tcp.port"));
+ Assert.Equal(1, config.GetInt("akka.cluster.min-nr-of-members"));
+ }
+
+ [Fact]
+ public void BuildHocon_StrategyWithWhitespace_RemainsQuoted()
+ {
+ var cluster = DefaultCluster();
+ cluster.SplitBrainResolverStrategy = "keep oldest";
+ var node = new NodeOptions
+ {
+ Role = "Central",
+ NodeHostname = "node1",
+ RemotingPort = 8081,
+ };
+
+ var hocon = AkkaHostedService.BuildHocon(
+ node, cluster, new[] { "Central" }, 5, 15);
+
+ var config = ConfigurationFactory.ParseString(hocon);
+ Assert.Equal("keep oldest", config.GetString("akka.cluster.split-brain-resolver.active-strategy"));
+ }
+}
diff --git a/tests/ScadaLink.Host.Tests/LoggerConfigurationTests.cs b/tests/ScadaLink.Host.Tests/LoggerConfigurationTests.cs
new file mode 100644
index 0000000..284f6a3
--- /dev/null
+++ b/tests/ScadaLink.Host.Tests/LoggerConfigurationTests.cs
@@ -0,0 +1,67 @@
+using Microsoft.Extensions.Configuration;
+using Serilog.Events;
+
+namespace ScadaLink.Host.Tests;
+
+///
+/// Host-011: ScadaLink:Logging:MinimumLevel must actually drive the Serilog
+/// minimum level. Previously the value was bound into
+/// but never read, so editing it had no effect.
+///
+public class LoggerConfigurationTests
+{
+ private static IConfiguration BuildConfig(string? minimumLevel)
+ {
+ var values = new Dictionary();
+ if (minimumLevel != null)
+ values["ScadaLink:Logging:MinimumLevel"] = minimumLevel;
+ return new ConfigurationBuilder().AddInMemoryCollection(values).Build();
+ }
+
+ [Fact]
+ public void MinimumLevel_Warning_SuppressesInformationLogs()
+ {
+ var sink = new InMemorySink();
+ var logger = LoggerConfigurationFactory
+ .Build(BuildConfig("Warning"), "Central", "central", "node1")
+ .WriteTo.Sink(sink)
+ .CreateLogger();
+
+ logger.Information("info message");
+ logger.Warning("warning message");
+
+ Assert.Single(sink.LogEvents);
+ Assert.Equal(LogEventLevel.Warning, sink.LogEvents[0].Level);
+ }
+
+ [Fact]
+ public void MinimumLevel_Debug_AllowsDebugLogs()
+ {
+ var sink = new InMemorySink();
+ var logger = LoggerConfigurationFactory
+ .Build(BuildConfig("Debug"), "Site", "site-a", "node1")
+ .WriteTo.Sink(sink)
+ .CreateLogger();
+
+ logger.Debug("debug message");
+
+ Assert.Single(sink.LogEvents);
+ Assert.Equal(LogEventLevel.Debug, sink.LogEvents[0].Level);
+ }
+
+ [Fact]
+ public void MinimumLevel_Absent_DefaultsToInformation()
+ {
+ var sink = new InMemorySink();
+ var logger = LoggerConfigurationFactory
+ .Build(BuildConfig(null), "Central", "central", "node1")
+ .WriteTo.Sink(sink)
+ .CreateLogger();
+
+ logger.Debug("debug message");
+ logger.Information("info message");
+
+ Assert.Single(sink.LogEvents);
+ Assert.Equal(LogEventLevel.Information, sink.LogEvents[0].Level);
+ }
+}
diff --git a/tests/ScadaLink.Host.Tests/StartupRetryTests.cs b/tests/ScadaLink.Host.Tests/StartupRetryTests.cs
new file mode 100644
index 0000000..7454a03
--- /dev/null
+++ b/tests/ScadaLink.Host.Tests/StartupRetryTests.cs
@@ -0,0 +1,65 @@
+using Microsoft.Extensions.Logging.Abstractions;
+
+namespace ScadaLink.Host.Tests;
+
+///
+/// Host-010: startup preconditions (database migration) must tolerate a database
+/// that is briefly unavailable at boot — common when an app container and its DB
+/// container start together — via a bounded retry with backoff.
+///
+public class StartupRetryTests
+{
+ [Fact]
+ public async Task ExecuteWithRetry_SucceedsFirstTry_RunsOnce()
+ {
+ var attempts = 0;
+ await StartupRetry.ExecuteWithRetryAsync(
+ "test-op",
+ () => { attempts++; return Task.CompletedTask; },
+ maxAttempts: 5,
+ initialDelay: TimeSpan.FromMilliseconds(1),
+ NullLogger.Instance);
+
+ Assert.Equal(1, attempts);
+ }
+
+ [Fact]
+ public async Task ExecuteWithRetry_TransientFailures_RetriesUntilSuccess()
+ {
+ var attempts = 0;
+ await StartupRetry.ExecuteWithRetryAsync(
+ "test-op",
+ () =>
+ {
+ attempts++;
+ if (attempts < 3)
+ throw new InvalidOperationException("db not ready");
+ return Task.CompletedTask;
+ },
+ maxAttempts: 5,
+ initialDelay: TimeSpan.FromMilliseconds(1),
+ NullLogger.Instance);
+
+ Assert.Equal(3, attempts);
+ }
+
+ [Fact]
+ public async Task ExecuteWithRetry_ExhaustsAttempts_RethrowsLastException()
+ {
+ var attempts = 0;
+ var ex = await Assert.ThrowsAsync(() =>
+ StartupRetry.ExecuteWithRetryAsync(
+ "test-op",
+ () =>
+ {
+ attempts++;
+ throw new InvalidOperationException($"failure {attempts}");
+ },
+ maxAttempts: 3,
+ initialDelay: TimeSpan.FromMilliseconds(1),
+ NullLogger.Instance));
+
+ Assert.Equal(3, attempts);
+ Assert.Equal("failure 3", ex.Message);
+ }
+}
diff --git a/tests/ScadaLink.Host.Tests/StartupValidatorTests.cs b/tests/ScadaLink.Host.Tests/StartupValidatorTests.cs
index 3b14787..d2f0edd 100644
--- a/tests/ScadaLink.Host.Tests/StartupValidatorTests.cs
+++ b/tests/ScadaLink.Host.Tests/StartupValidatorTests.cs
@@ -20,7 +20,6 @@ public class StartupValidatorTests
["ScadaLink:Node:NodeHostname"] = "central-node1",
["ScadaLink:Node:RemotingPort"] = "8081",
["ScadaLink:Database:ConfigurationDb"] = "Server=localhost;Database=Config;",
- ["ScadaLink:Database:MachineDataDb"] = "Server=localhost;Database=MachineData;",
["ScadaLink:Security:LdapServer"] = "ldap.example.com",
["ScadaLink:Security:JwtSigningKey"] = "test-signing-key-at-least-32-chars-long",
["ScadaLink:Cluster:SeedNodes:0"] = "akka.tcp://scadalink@central-node1:8081",
@@ -151,14 +150,17 @@ public class StartupValidatorTests
}
[Fact]
- public void Central_MissingMachineDataDb_FailsValidation()
+ public void Central_MissingMachineDataDb_PassesValidation()
{
+ // Host-008 regression: MachineDataDb is never consumed anywhere in the
+ // system (only ConfigurationDb is wired into AddConfigurationDatabase).
+ // It is no longer a required key, so its absence must not fail startup.
var values = ValidCentralConfig();
values.Remove("ScadaLink:Database:MachineDataDb");
var config = BuildConfig(values);
- var ex = Assert.Throws(() => StartupValidator.Validate(config));
- Assert.Contains("MachineDataDb connection string required for Central", ex.Message);
+ var ex = Record.Exception(() => StartupValidator.Validate(config));
+ Assert.Null(ex);
}
[Fact]
@@ -310,6 +312,46 @@ public class StartupValidatorTests
Assert.Null(ex);
}
+ [Fact]
+ public void Site_GrpcPortEqualsRemotingPort_FailsValidation()
+ {
+ // Host-007 regression: REQ-HOST-4 requires GrpcPort to differ from
+ // RemotingPort. Identical values cause Kestrel and Akka.Remote to
+ // contend for the same port at runtime.
+ var values = ValidSiteConfig();
+ values["ScadaLink:Node:RemotingPort"] = "8082";
+ values["ScadaLink:Node:GrpcPort"] = "8082";
+ var config = BuildConfig(values);
+
+ var ex = Assert.Throws(() => StartupValidator.Validate(config));
+ Assert.Contains("GrpcPort must differ from RemotingPort", ex.Message);
+ }
+
+ [Fact]
+ public void Site_DefaultGrpcPortEqualsRemotingPort_FailsValidation()
+ {
+ // GrpcPort absent => NodeOptions default 8083. A site whose RemotingPort
+ // is also 8083 must still be rejected.
+ var values = ValidSiteConfig();
+ values["ScadaLink:Node:RemotingPort"] = "8083";
+ var config = BuildConfig(values);
+
+ var ex = Assert.Throws(() => StartupValidator.Validate(config));
+ Assert.Contains("GrpcPort must differ from RemotingPort", ex.Message);
+ }
+
+ [Fact]
+ public void Site_GrpcPortDiffersFromRemotingPort_PassesValidation()
+ {
+ var values = ValidSiteConfig();
+ values["ScadaLink:Node:RemotingPort"] = "8082";
+ values["ScadaLink:Node:GrpcPort"] = "8083";
+ var config = BuildConfig(values);
+
+ var ex = Record.Exception(() => StartupValidator.Validate(config));
+ Assert.Null(ex);
+ }
+
[Fact]
public void MultipleErrors_AllReported()
{
diff --git a/tests/ScadaLink.Security.Tests/UnitTest1.cs b/tests/ScadaLink.Security.Tests/SecurityTests.cs
similarity index 100%
rename from tests/ScadaLink.Security.Tests/UnitTest1.cs
rename to tests/ScadaLink.Security.Tests/SecurityTests.cs