fix(host): resolve Host-005..011 — async startup, HOCON escaping, port-conflict check, dead-config cleanup, migration retry, log-level wiring; Host-002 flagged

This commit is contained in:
Joseph Doherty
2026-05-16 22:24:03 -04:00
parent 3f19371017
commit 8664cdf940
14 changed files with 614 additions and 99 deletions

View File

@@ -54,58 +54,20 @@ public class AkkaHostedService : IHostedService
/// </summary>
public ActorSystem? ActorSystem => _actorSystem;
public Task StartAsync(CancellationToken cancellationToken)
public async Task StartAsync(CancellationToken cancellationToken)
{
var seedNodesStr = string.Join(",",
_clusterOptions.SeedNodes.Select(s => $"\"{s}\""));
// For site nodes, include a site-specific role (e.g., "site-SiteA") alongside the base role
var roles = BuildRoles();
var rolesStr = string.Join(",", roles.Select(r => $"\"{r}\""));
// WP-3: Transport heartbeat explicitly configured from CommunicationOptions (not framework defaults)
var transportHeartbeatSec = _communicationOptions.TransportHeartbeatInterval.TotalSeconds;
var transportFailureSec = _communicationOptions.TransportFailureThreshold.TotalSeconds;
var hocon = $@"
akka {{
extensions = [
""Akka.Cluster.Tools.PublishSubscribe.DistributedPubSubExtensionProvider, Akka.Cluster.Tools""
]
actor {{
provider = cluster
}}
remote {{
dot-netty.tcp {{
hostname = ""{_nodeOptions.NodeHostname}""
port = {_nodeOptions.RemotingPort}
}}
transport-failure-detector {{
heartbeat-interval = {transportHeartbeatSec:F0}s
acceptable-heartbeat-pause = {transportFailureSec:F0}s
}}
}}
cluster {{
seed-nodes = [{seedNodesStr}]
roles = [{rolesStr}]
min-nr-of-members = {_clusterOptions.MinNrOfMembers}
split-brain-resolver {{
active-strategy = {_clusterOptions.SplitBrainResolverStrategy}
stable-after = {_clusterOptions.StableAfter.TotalSeconds:F0}s
keep-oldest {{
down-if-alone = on
}}
}}
failure-detector {{
heartbeat-interval = {_clusterOptions.HeartbeatInterval.TotalSeconds:F0}s
acceptable-heartbeat-pause = {_clusterOptions.FailureDetectionThreshold.TotalSeconds:F0}s
}}
run-coordinated-shutdown-when-down = on
}}
coordinated-shutdown {{
run-by-clr-shutdown-hook = on
}}
}}";
// Host-006: HOCON is assembled in a dedicated builder that quotes/escapes every
// interpolated value, so a hostname, seed node or strategy containing a quote,
// backslash or whitespace cannot corrupt the configuration document.
var hocon = BuildHocon(_nodeOptions, _clusterOptions, roles,
transportHeartbeatSec, transportFailureSec);
var config = ConfigurationFactory.ParseString(hocon);
_actorSystem = ActorSystem.Create("scadalink", config);
@@ -135,10 +97,78 @@ akka {{
}
else if (_nodeOptions.Role.Equals("Site", StringComparison.OrdinalIgnoreCase))
{
RegisterSiteActors();
await RegisterSiteActorsAsync(cancellationToken);
}
}
return Task.CompletedTask;
/// <summary>
/// Builds the Akka HOCON configuration document. Every interpolated string value is
/// routed through <see cref="QuoteHocon"/> so a hostname, seed-node URI, role or
/// split-brain strategy containing a quote, backslash or whitespace cannot corrupt
/// the document or be silently misparsed (Host-006). Numeric values are formatted
/// with the invariant culture so the document does not depend on the culture of
/// the thread that builds it.
/// </summary>
/// <param name="nodeOptions">Supplies the remoting hostname and port.</param>
/// <param name="clusterOptions">
/// Supplies the seed nodes, minimum member count, split-brain resolver strategy and
/// stable-after period, and the cluster failure-detector timings.
/// </param>
/// <param name="roles">Cluster roles emitted into <c>akka.cluster.roles</c>.</param>
/// <param name="transportHeartbeatSec">
/// Transport failure-detector heartbeat interval in seconds; rendered as a whole
/// number of seconds.
/// </param>
/// <param name="transportFailureSec">
/// Transport failure-detector acceptable heartbeat pause in seconds; rendered as a
/// whole number of seconds.
/// </param>
/// <returns>The HOCON document text.</returns>
/// <exception cref="ArgumentNullException">
/// <paramref name="nodeOptions"/>, <paramref name="clusterOptions"/> or
/// <paramref name="roles"/> is <see langword="null"/>.
/// </exception>
public static string BuildHocon(
    NodeOptions nodeOptions,
    ClusterOptions clusterOptions,
    IEnumerable<string> roles,
    double transportHeartbeatSec,
    double transportFailureSec)
{
    // Public entry point: fail with a named argument instead of a NullReferenceException
    // from somewhere inside the interpolation.
    ArgumentNullException.ThrowIfNull(nodeOptions);
    ArgumentNullException.ThrowIfNull(clusterOptions);
    ArgumentNullException.ThrowIfNull(roles);

    var seedNodesStr = string.Join(",",
        clusterOptions.SeedNodes.Select(QuoteHocon));
    var rolesStr = string.Join(",", roles.Select(QuoteHocon));

    // FormattableString.Invariant: HOCON is machine-readable, so numbers must not be
    // rendered with the current culture's number format.
    return FormattableString.Invariant($@"
akka {{
extensions = [
""Akka.Cluster.Tools.PublishSubscribe.DistributedPubSubExtensionProvider, Akka.Cluster.Tools""
]
actor {{
provider = cluster
}}
remote {{
dot-netty.tcp {{
hostname = {QuoteHocon(nodeOptions.NodeHostname)}
port = {nodeOptions.RemotingPort}
}}
transport-failure-detector {{
heartbeat-interval = {transportHeartbeatSec:F0}s
acceptable-heartbeat-pause = {transportFailureSec:F0}s
}}
}}
cluster {{
seed-nodes = [{seedNodesStr}]
roles = [{rolesStr}]
min-nr-of-members = {clusterOptions.MinNrOfMembers}
split-brain-resolver {{
active-strategy = {QuoteHocon(clusterOptions.SplitBrainResolverStrategy)}
stable-after = {clusterOptions.StableAfter.TotalSeconds:F0}s
keep-oldest {{
down-if-alone = on
}}
}}
failure-detector {{
heartbeat-interval = {clusterOptions.HeartbeatInterval.TotalSeconds:F0}s
acceptable-heartbeat-pause = {clusterOptions.FailureDetectionThreshold.TotalSeconds:F0}s
}}
run-coordinated-shutdown-when-down = on
}}
coordinated-shutdown {{
run-by-clr-shutdown-hook = on
}}
}}");
}
/// <summary>
/// Renders a value as a HOCON double-quoted string. Backslashes and double quotes
/// are escaped so the resulting token cannot break out of its string literal, and
/// control characters (newline, carriage return, tab, backspace, form feed, and any
/// other control character as a <c>\uXXXX</c> escape) are escaped so they are never
/// emitted raw inside the quoted string.
/// </summary>
/// <param name="value">The raw value; <see langword="null"/> is rendered as an empty string.</param>
/// <returns>The value wrapped in double quotes with all required escapes applied.</returns>
private static string QuoteHocon(string? value)
{
    var text = value ?? string.Empty;
    var builder = new System.Text.StringBuilder(text.Length + 2);
    builder.Append('"');

    foreach (var ch in text)
    {
        switch (ch)
        {
            case '\\':
                builder.Append("\\\\");
                break;
            case '"':
                builder.Append("\\\"");
                break;
            case '\n':
                builder.Append("\\n");
                break;
            case '\r':
                builder.Append("\\r");
                break;
            case '\t':
                builder.Append("\\t");
                break;
            case '\b':
                builder.Append("\\b");
                break;
            case '\f':
                builder.Append("\\f");
                break;
            default:
                if (char.IsControl(ch))
                {
                    // Remaining control characters have no short escape; use \uXXXX.
                    builder.Append("\\u");
                    builder.Append(((int)ch).ToString(
                        "x4", System.Globalization.CultureInfo.InvariantCulture));
                }
                else
                {
                    builder.Append(ch);
                }

                break;
        }
    }

    builder.Append('"');
    return builder.ToString();
}
public async Task StopAsync(CancellationToken cancellationToken)
@@ -218,7 +248,7 @@ akka {{
/// The singleton is scoped to the site-specific cluster role so it runs on exactly
/// one node within this site's cluster.
/// </summary>
private void RegisterSiteActors()
private async Task RegisterSiteActorsAsync(CancellationToken cancellationToken)
{
var siteRole = $"site-{_nodeOptions.SiteId}";
var storage = _serviceProvider.GetRequiredService<SiteStorageService>();
@@ -341,8 +371,11 @@ akka {{
if (storeAndForwardService != null)
{
// Initialize SQLite schema and start the retry timer. Must complete before
// any actor or HTTP handler touches the service.
storeAndForwardService.StartAsync().GetAwaiter().GetResult();
// any actor or HTTP handler touches the service. Host-005: awaited rather
// than blocked via GetAwaiter().GetResult() — no thread-pool starvation /
// sync-context deadlock risk, and exceptions surface as their original type.
cancellationToken.ThrowIfCancellationRequested();
await storeAndForwardService.StartAsync();
// Register the store-and-forward delivery handlers so buffered
// ExternalSystem calls, cached DB writes and notifications are actually
@@ -413,7 +446,22 @@ akka {{
contacts.Count, _nodeOptions.SiteId);
}
// Gate gRPC subscriptions until the actor system and SiteStreamManager are initialized
// Gate gRPC subscriptions until the actor system and SiteStreamManager are
// initialized (REQ-HOST-7).
//
// Host-009: SetReady asserts a deliberately narrow contract. By this point the
// actor system exists, SiteStreamManager.Initialize has run, and every
// role actor (SiteCommunicationActor, deployment-manager singleton,
// SiteReplicationActor, the ClusterClient) has been created with ActorOf —
// creation and the registration Tells are synchronous and strictly ordered.
// What is NOT guaranteed is completion of each actor's PreStart or the
// ClusterClient's initial-contact handshake with central: those are
// intentionally asynchronous. Gating readiness on the central handshake would
// be wrong — a site must come up and stream locally even while central is
// briefly unreachable. gRPC readiness therefore guarantees "the site actor
// graph exists and can accept subscription streams", not "the cluster
// handshake has completed". Streams opened before SetReady are already
// rejected by SiteStreamGrpcServer with StatusCode.Unavailable.
var grpcServer = _serviceProvider.GetService<ScadaLink.Communication.Grpc.SiteStreamGrpcServer>();
grpcServer?.SetReady(_actorSystem!);
}

View File

@@ -3,6 +3,5 @@ namespace ScadaLink.Host;
/// <summary>
/// Database-related settings for the Host.
/// NOTE(review): the property names match the keys the startup validator reads from
/// the <c>ScadaLink:Database</c> configuration section — presumably this class is
/// bound from that section; confirm against the registration code.
/// </summary>
public class DatabaseOptions
{
/// <summary>Configuration database connection string; the startup validator requires it for Central nodes.</summary>
public string? ConfigurationDb { get; set; }
// NOTE(review): the startup validation and the appsettings placeholder for this key
// appear to have been removed — confirm whether this property is still needed.
public string? MachineDataDb { get; set; }
/// <summary>Site database path; the startup validator requires it for Site nodes.</summary>
public string? SiteDbPath { get; set; }
}

View File

@@ -0,0 +1,48 @@
using Serilog;
using Serilog.Events;
namespace ScadaLink.Host;
/// <summary>
/// Builds the Serilog <see cref="LoggerConfiguration"/> for the Host process.
///
/// REQ-HOST-8 / Host-011: the configured minimum level comes from
/// <c>ScadaLink:Logging:MinimumLevel</c> (bound to <see cref="LoggingOptions"/>) so an
/// operator editing that key changes the effective log level. The standard
/// <c>Serilog</c> configuration section is still read (via
/// <see cref="ConfigurationLoggerConfigurationExtensions"/>)
/// for sink/override customisation; the explicit <c>MinimumLevel.Is</c> below pins
/// the floor from <see cref="LoggingOptions"/>.
/// </summary>
public static class LoggerConfigurationFactory
{
    /// <summary>
    /// Creates the logger configuration: reads the Serilog settings from
    /// <paramref name="configuration"/>, then applies the minimum level from
    /// <c>ScadaLink:Logging</c> and attaches the node identity as log-event properties.
    /// No sinks are added here; the caller chains them onto the returned configuration.
    /// </summary>
    /// <param name="configuration">Application configuration root.</param>
    /// <param name="nodeRole">Value of the <c>NodeRole</c> log-event property.</param>
    /// <param name="siteId">Value of the <c>SiteId</c> log-event property.</param>
    /// <param name="nodeHostname">Value of the <c>NodeHostname</c> log-event property.</param>
    /// <returns>A <see cref="LoggerConfiguration"/> ready for sinks to be added.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="configuration"/> is <see langword="null"/>.</exception>
    public static LoggerConfiguration Build(
        IConfiguration configuration,
        string nodeRole,
        string siteId,
        string nodeHostname)
    {
        ArgumentNullException.ThrowIfNull(configuration);

        var loggingOptions = new LoggingOptions();
        configuration.GetSection("ScadaLink:Logging").Bind(loggingOptions);
        var minimumLevel = ParseLevel(loggingOptions.MinimumLevel);

        // Order matters: MinimumLevel.Is is applied after ReadFrom.Configuration so the
        // ScadaLink:Logging value wins over any level in the Serilog section.
        return new LoggerConfiguration()
            .ReadFrom.Configuration(configuration)
            .MinimumLevel.Is(minimumLevel)
            .Enrich.WithProperty("SiteId", siteId)
            .Enrich.WithProperty("NodeHostname", nodeHostname)
            .Enrich.WithProperty("NodeRole", nodeRole);
    }

    /// <summary>
    /// Parses a Serilog <see cref="LogEventLevel"/> name, falling back to
    /// <see cref="LogEventLevel.Information"/> for null/blank/unrecognised values.
    /// </summary>
    private static LogEventLevel ParseLevel(string? level)
    {
        // Enum.TryParse also succeeds for numeric text such as "99" and returns a value
        // that is not a member of the enum; IsDefined rejects those so a typo cannot
        // silently select an out-of-range level.
        return Enum.TryParse<LogEventLevel>(level, ignoreCase: true, out var parsed)
               && Enum.IsDefined(typeof(LogEventLevel), parsed)
            ? parsed
            : LogEventLevel.Information;
    }
}

View File

@@ -38,12 +38,10 @@ var nodeRole = configuration["ScadaLink:Node:Role"]!;
var nodeHostname = configuration["ScadaLink:Node:NodeHostname"] ?? "unknown";
var siteId = configuration["ScadaLink:Node:SiteId"] ?? "central";
// WP-14: Serilog structured logging
Log.Logger = new LoggerConfiguration()
.ReadFrom.Configuration(configuration)
.Enrich.WithProperty("SiteId", siteId)
.Enrich.WithProperty("NodeHostname", nodeHostname)
.Enrich.WithProperty("NodeRole", nodeRole)
// WP-14: Serilog structured logging.
// Host-011: minimum level is driven by ScadaLink:Logging:MinimumLevel (LoggingOptions).
Log.Logger = ScadaLink.Host.LoggerConfigurationFactory
.Build(configuration, nodeRole, siteId, nodeHostname)
.WriteTo.Console(outputTemplate:
"[{Timestamp:HH:mm:ss} {Level:u3}] [{NodeRole}/{NodeHostname}] {Message:lj}{NewLine}{Exception}")
.WriteTo.File("logs/scadalink-.log", rollingInterval: Serilog.RollingInterval.Day)
@@ -116,14 +114,24 @@ try
{
var isDevelopment = app.Environment.IsDevelopment()
|| string.Equals(Environment.GetEnvironmentVariable("ASPNETCORE_ENVIRONMENT"), "Development", StringComparison.OrdinalIgnoreCase);
using (var scope = app.Services.CreateScope())
{
var dbContext = scope.ServiceProvider.GetRequiredService<ScadaLinkDbContext>();
var migrationLogger = scope.ServiceProvider
.GetRequiredService<ILoggerFactory>()
.CreateLogger(typeof(MigrationHelper).FullName!);
await MigrationHelper.ApplyOrValidateMigrationsAsync(dbContext, isDevelopment, migrationLogger);
}
var migrationLogger = app.Services
.GetRequiredService<ILoggerFactory>()
.CreateLogger(typeof(MigrationHelper).FullName!);
// Host-010: tolerate a database that is briefly unreachable at boot
// (e.g. app and DB containers starting together) with a bounded
// exponential backoff before failing fatally.
await StartupRetry.ExecuteWithRetryAsync(
"database-migration",
async () =>
{
using var scope = app.Services.CreateScope();
var dbContext = scope.ServiceProvider.GetRequiredService<ScadaLinkDbContext>();
await MigrationHelper.ApplyOrValidateMigrationsAsync(dbContext, isDevelopment, migrationLogger);
},
maxAttempts: 8,
initialDelay: TimeSpan.FromSeconds(2),
migrationLogger);
}
// Middleware pipeline

View File

@@ -0,0 +1,48 @@
namespace ScadaLink.Host;
/// <summary>
/// Bounded retry-with-backoff for startup preconditions.
///
/// Host-010 / REQ-HOST-4a: a Central node applies/validates database migrations
/// before the host begins serving traffic. In container orchestration the database
/// and the app frequently start together, so the database may be briefly
/// unreachable. Rather than crashing the process on the first connection failure,
/// the migration step is wrapped in this bounded exponential backoff: it tolerates a
/// short outage and only fails fatally once attempts are exhausted.
/// </summary>
public static class StartupRetry
{
    /// <summary>Upper bound applied to the delay each time it is doubled.</summary>
    private static readonly TimeSpan MaxDelay = TimeSpan.FromSeconds(30);

    /// <summary>
    /// Runs <paramref name="operation"/> until it succeeds or
    /// <paramref name="maxAttempts"/> attempts have failed. Failed attempts are logged
    /// as warnings and followed by a delay that starts at
    /// <paramref name="initialDelay"/> and doubles after each failure, capped at 30
    /// seconds. The exception from the final attempt propagates to the caller.
    /// </summary>
    /// <param name="operationName">Name used in log messages.</param>
    /// <param name="operation">The operation to run.</param>
    /// <param name="maxAttempts">Total number of attempts; must be at least 1.</param>
    /// <param name="initialDelay">Delay before the second attempt; must not be negative.</param>
    /// <param name="logger">Receives retry warnings and the eventual success message.</param>
    /// <param name="cancellationToken">Cancels the wait between attempts and stops further attempts.</param>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="operation"/> or <paramref name="logger"/> is <see langword="null"/>.
    /// </exception>
    /// <exception cref="ArgumentOutOfRangeException">
    /// <paramref name="maxAttempts"/> is less than 1 or <paramref name="initialDelay"/> is negative.
    /// </exception>
    /// <exception cref="OperationCanceledException">
    /// <paramref name="cancellationToken"/> was cancelled.
    /// </exception>
    public static async Task ExecuteWithRetryAsync(
        string operationName,
        Func<Task> operation,
        int maxAttempts,
        TimeSpan initialDelay,
        ILogger logger,
        CancellationToken cancellationToken = default)
    {
        ArgumentNullException.ThrowIfNull(operation);
        ArgumentNullException.ThrowIfNull(logger);
        if (maxAttempts < 1)
        {
            throw new ArgumentOutOfRangeException(
                nameof(maxAttempts), maxAttempts, "At least one attempt is required.");
        }

        // A negative delay would make Task.Delay throw (or wait forever for -1 ms)
        // inside the catch block and mask the operation's real failure.
        if (initialDelay < TimeSpan.Zero)
        {
            throw new ArgumentOutOfRangeException(
                nameof(initialDelay), initialDelay, "The initial delay must not be negative.");
        }

        var delay = initialDelay;
        for (var attempt = 1; ; attempt++)
        {
            cancellationToken.ThrowIfCancellationRequested();
            try
            {
                await operation();
                if (attempt > 1)
                {
                    logger.LogInformation(
                        "Startup operation '{Operation}' succeeded on attempt {Attempt}.",
                        operationName, attempt);
                }

                return;
            }
            catch (Exception ex) when (attempt < maxAttempts && !IsCancellation(ex, cancellationToken))
            {
                logger.LogWarning(ex,
                    "Startup operation '{Operation}' failed on attempt {Attempt}/{MaxAttempts}; " +
                    "retrying in {Delay}.",
                    operationName, attempt, maxAttempts, delay);
                await Task.Delay(delay, cancellationToken);
                // Exponential backoff, capped so the total wait stays bounded.
                delay = NextDelay(delay);
            }
        }
    }

    /// <summary>
    /// True when the exception represents the caller's cancellation request; such a
    /// failure is propagated as-is instead of being logged as a retryable error.
    /// </summary>
    private static bool IsCancellation(Exception exception, CancellationToken cancellationToken)
        => exception is OperationCanceledException && cancellationToken.IsCancellationRequested;

    /// <summary>
    /// Doubles the delay and caps it at <see cref="MaxDelay"/>, without overflowing
    /// for very large inputs.
    /// </summary>
    private static TimeSpan NextDelay(TimeSpan current)
        => current.Ticks > MaxDelay.Ticks / 2
            ? MaxDelay
            : TimeSpan.FromTicks(current.Ticks * 2);
}

View File

@@ -30,8 +30,6 @@ public static class StartupValidator
var dbSection = configuration.GetSection("ScadaLink:Database");
if (string.IsNullOrEmpty(dbSection["ConfigurationDb"]))
errors.Add("ScadaLink:Database:ConfigurationDb connection string required for Central");
if (string.IsNullOrEmpty(dbSection["MachineDataDb"]))
errors.Add("ScadaLink:Database:MachineDataDb connection string required for Central");
var secSection = configuration.GetSection("ScadaLink:Security");
if (string.IsNullOrEmpty(secSection["LdapServer"]))
@@ -51,6 +49,13 @@ public static class StartupValidator
if (grpcPortStr != null && (!int.TryParse(grpcPortStr, out grpcPort) || grpcPort < 1 || grpcPort > 65535))
errors.Add("ScadaLink:Node:GrpcPort must be 1-65535");
// Host-007 / REQ-HOST-4: the gRPC (Kestrel HTTP/2) port and the Akka
// remoting port must differ. Identical values make Kestrel and
// Akka.Remote contend for the same TCP port and fail opaquely at
// runtime. Uses the resolved GrpcPort, including the 8083 default.
if (port == grpcPort)
errors.Add("ScadaLink:Node:GrpcPort must differ from RemotingPort");
var dbSection = configuration.GetSection("ScadaLink:Database");
if (string.IsNullOrEmpty(dbSection["SiteDbPath"]))
errors.Add("ScadaLink:Database:SiteDbPath required for Site nodes");

View File

@@ -16,10 +16,9 @@
"FailureDetectionThreshold": "00:00:10",
"MinNrOfMembers": 1
},
"_secrets": "Host-003: Secrets are NOT committed in this file. Supply them via environment variables, which the Host's configuration builder (AddEnvironmentVariables) overlays over this file. Required: ScadaLink__Database__ConfigurationDb, ScadaLink__Database__MachineDataDb, ScadaLink__Security__LdapServiceAccountPassword, ScadaLink__Security__JwtSigningKey. The ${...} placeholders below are intentionally non-functional and must be overridden per environment.",
"_secrets": "Host-003: Secrets are NOT committed in this file. Supply them via environment variables, which the Host's configuration builder (AddEnvironmentVariables) overlays over this file. Required: ScadaLink__Database__ConfigurationDb, ScadaLink__Security__LdapServiceAccountPassword, ScadaLink__Security__JwtSigningKey. The ${...} placeholders below are intentionally non-functional and must be overridden per environment.",
"Database": {
"ConfigurationDb": "${SCADALINK_CONFIGURATIONDB_CONNECTION_STRING}",
"MachineDataDb": "${SCADALINK_MACHINEDATADB_CONNECTION_STRING}"
"ConfigurationDb": "${SCADALINK_CONFIGURATIONDB_CONNECTION_STRING}"
},
"Security": {
"LdapServer": "localhost",