fix(host): resolve Host-005..011 — async startup, HOCON escaping, port-conflict check, dead-config cleanup, migration retry, log-level wiring; Host-002 flagged
This commit is contained in:
@@ -54,58 +54,20 @@ public class AkkaHostedService : IHostedService
|
||||
/// </summary>
|
||||
public ActorSystem? ActorSystem => _actorSystem;
|
||||
|
||||
public Task StartAsync(CancellationToken cancellationToken)
|
||||
public async Task StartAsync(CancellationToken cancellationToken)
|
||||
{
|
||||
var seedNodesStr = string.Join(",",
|
||||
_clusterOptions.SeedNodes.Select(s => $"\"{s}\""));
|
||||
|
||||
// For site nodes, include a site-specific role (e.g., "site-SiteA") alongside the base role
|
||||
var roles = BuildRoles();
|
||||
var rolesStr = string.Join(",", roles.Select(r => $"\"{r}\""));
|
||||
|
||||
// WP-3: Transport heartbeat explicitly configured from CommunicationOptions (not framework defaults)
|
||||
var transportHeartbeatSec = _communicationOptions.TransportHeartbeatInterval.TotalSeconds;
|
||||
var transportFailureSec = _communicationOptions.TransportFailureThreshold.TotalSeconds;
|
||||
|
||||
var hocon = $@"
|
||||
akka {{
|
||||
extensions = [
|
||||
""Akka.Cluster.Tools.PublishSubscribe.DistributedPubSubExtensionProvider, Akka.Cluster.Tools""
|
||||
]
|
||||
actor {{
|
||||
provider = cluster
|
||||
}}
|
||||
remote {{
|
||||
dot-netty.tcp {{
|
||||
hostname = ""{_nodeOptions.NodeHostname}""
|
||||
port = {_nodeOptions.RemotingPort}
|
||||
}}
|
||||
transport-failure-detector {{
|
||||
heartbeat-interval = {transportHeartbeatSec:F0}s
|
||||
acceptable-heartbeat-pause = {transportFailureSec:F0}s
|
||||
}}
|
||||
}}
|
||||
cluster {{
|
||||
seed-nodes = [{seedNodesStr}]
|
||||
roles = [{rolesStr}]
|
||||
min-nr-of-members = {_clusterOptions.MinNrOfMembers}
|
||||
split-brain-resolver {{
|
||||
active-strategy = {_clusterOptions.SplitBrainResolverStrategy}
|
||||
stable-after = {_clusterOptions.StableAfter.TotalSeconds:F0}s
|
||||
keep-oldest {{
|
||||
down-if-alone = on
|
||||
}}
|
||||
}}
|
||||
failure-detector {{
|
||||
heartbeat-interval = {_clusterOptions.HeartbeatInterval.TotalSeconds:F0}s
|
||||
acceptable-heartbeat-pause = {_clusterOptions.FailureDetectionThreshold.TotalSeconds:F0}s
|
||||
}}
|
||||
run-coordinated-shutdown-when-down = on
|
||||
}}
|
||||
coordinated-shutdown {{
|
||||
run-by-clr-shutdown-hook = on
|
||||
}}
|
||||
}}";
|
||||
// Host-006: HOCON is assembled in a dedicated builder that quotes/escapes every
|
||||
// interpolated value, so a hostname, seed node or strategy containing a quote,
|
||||
// backslash or whitespace cannot corrupt the configuration document.
|
||||
var hocon = BuildHocon(_nodeOptions, _clusterOptions, roles,
|
||||
transportHeartbeatSec, transportFailureSec);
|
||||
|
||||
var config = ConfigurationFactory.ParseString(hocon);
|
||||
_actorSystem = ActorSystem.Create("scadalink", config);
|
||||
@@ -135,10 +97,78 @@ akka {{
|
||||
}
|
||||
else if (_nodeOptions.Role.Equals("Site", StringComparison.OrdinalIgnoreCase))
|
||||
{
|
||||
RegisterSiteActors();
|
||||
await RegisterSiteActorsAsync(cancellationToken);
|
||||
}
|
||||
}
|
||||
|
||||
return Task.CompletedTask;
|
||||
/// <summary>
/// Builds the Akka HOCON configuration document. Every interpolated string value is
/// routed through <see cref="QuoteHocon"/> so a hostname, seed-node URI, role or
/// split-brain strategy containing a quote, backslash or whitespace cannot corrupt
/// the document or be silently misparsed (Host-006).
/// </summary>
/// <param name="nodeOptions">Supplies the remoting hostname and port.</param>
/// <param name="clusterOptions">
/// Supplies seed nodes, minimum member count, split-brain strategy and the cluster
/// failure-detector timings.
/// </param>
/// <param name="roles">Cluster roles advertised by this node.</param>
/// <param name="transportHeartbeatSec">Transport failure-detector heartbeat interval, in seconds.</param>
/// <param name="transportFailureSec">Transport failure-detector acceptable heartbeat pause, in seconds.</param>
/// <returns>The HOCON text to hand to the configuration parser.</returns>
public static string BuildHocon(
    NodeOptions nodeOptions,
    ClusterOptions clusterOptions,
    IEnumerable<string> roles,
    double transportHeartbeatSec,
    double transportFailureSec)
{
    var seedNodesStr = string.Join(",",
        clusterOptions.SeedNodes.Select(QuoteHocon));
    var rolesStr = string.Join(",", roles.Select(QuoteHocon));

    // Durations go through FormatHoconDuration rather than an ":F0" + "s" format so
    // that sub-second settings (e.g. a 500 ms heartbeat) are not rounded to a whole
    // number of seconds - or to "0s" - on their way into the document.
    var transportHeartbeat = FormatHoconDuration(transportHeartbeatSec);
    var transportFailure = FormatHoconDuration(transportFailureSec);
    var stableAfter = FormatHoconDuration(clusterOptions.StableAfter.TotalSeconds);
    var clusterHeartbeat = FormatHoconDuration(clusterOptions.HeartbeatInterval.TotalSeconds);
    var clusterFailure = FormatHoconDuration(clusterOptions.FailureDetectionThreshold.TotalSeconds);

    // Indentation inside the template is for readability only; HOCON structure is
    // defined by the braces and line breaks.
    return $@"
akka {{
  extensions = [
    ""Akka.Cluster.Tools.PublishSubscribe.DistributedPubSubExtensionProvider, Akka.Cluster.Tools""
  ]
  actor {{
    provider = cluster
  }}
  remote {{
    dot-netty.tcp {{
      hostname = {QuoteHocon(nodeOptions.NodeHostname)}
      port = {nodeOptions.RemotingPort}
    }}
    transport-failure-detector {{
      heartbeat-interval = {transportHeartbeat}
      acceptable-heartbeat-pause = {transportFailure}
    }}
  }}
  cluster {{
    seed-nodes = [{seedNodesStr}]
    roles = [{rolesStr}]
    min-nr-of-members = {clusterOptions.MinNrOfMembers}
    split-brain-resolver {{
      active-strategy = {QuoteHocon(clusterOptions.SplitBrainResolverStrategy)}
      stable-after = {stableAfter}
      keep-oldest {{
        down-if-alone = on
      }}
    }}
    failure-detector {{
      heartbeat-interval = {clusterHeartbeat}
      acceptable-heartbeat-pause = {clusterFailure}
    }}
    run-coordinated-shutdown-when-down = on
  }}
  coordinated-shutdown {{
    run-by-clr-shutdown-hook = on
  }}
}}";
}

/// <summary>
/// Renders a duration given in seconds as a HOCON duration token. Whole-second
/// values are emitted as <c>Ns</c>; anything else is emitted in milliseconds
/// (<c>Nms</c>) so the fractional part is preserved.
/// </summary>
private static string FormatHoconDuration(double seconds)
{
    var milliseconds = (long)Math.Round(seconds * 1000d, MidpointRounding.AwayFromZero);
    var invariant = System.Globalization.CultureInfo.InvariantCulture;

    return milliseconds % 1000 == 0
        ? (milliseconds / 1000).ToString(invariant) + "s"
        : milliseconds.ToString(invariant) + "ms";
}
|
||||
|
||||
/// <summary>
/// Renders a value as a HOCON double-quoted string. Backslashes, double quotes and
/// control characters (newline, carriage return, tab, backspace, form feed and any
/// other character below U+0020) are escaped JSON-style, so the resulting token
/// cannot break out of its string literal or span lines.
/// </summary>
/// <param name="value">The raw value; <c>null</c> is rendered as an empty string.</param>
/// <returns>The quoted, escaped token including its surrounding double quotes.</returns>
private static string QuoteHocon(string? value)
{
    var text = value ?? string.Empty;
    var builder = new System.Text.StringBuilder(text.Length + 2);

    builder.Append('"');
    foreach (var ch in text)
    {
        switch (ch)
        {
            case '\\':
                builder.Append("\\\\");
                break;
            case '"':
                builder.Append("\\\"");
                break;
            case '\n':
                builder.Append("\\n");
                break;
            case '\r':
                builder.Append("\\r");
                break;
            case '\t':
                builder.Append("\\t");
                break;
            case '\b':
                builder.Append("\\b");
                break;
            case '\f':
                builder.Append("\\f");
                break;
            default:
                if (ch < ' ')
                {
                    // Remaining control characters have no short escape; use \uXXXX.
                    builder.Append("\\u");
                    builder.Append(((int)ch).ToString("x4", System.Globalization.CultureInfo.InvariantCulture));
                }
                else
                {
                    builder.Append(ch);
                }

                break;
        }
    }

    builder.Append('"');
    return builder.ToString();
}
|
||||
|
||||
public async Task StopAsync(CancellationToken cancellationToken)
|
||||
@@ -218,7 +248,7 @@ akka {{
|
||||
/// The singleton is scoped to the site-specific cluster role so it runs on exactly
|
||||
/// one node within this site's cluster.
|
||||
/// </summary>
|
||||
private void RegisterSiteActors()
|
||||
private async Task RegisterSiteActorsAsync(CancellationToken cancellationToken)
|
||||
{
|
||||
var siteRole = $"site-{_nodeOptions.SiteId}";
|
||||
var storage = _serviceProvider.GetRequiredService<SiteStorageService>();
|
||||
@@ -341,8 +371,11 @@ akka {{
|
||||
if (storeAndForwardService != null)
|
||||
{
|
||||
// Initialize SQLite schema and start the retry timer. Must complete before
|
||||
// any actor or HTTP handler touches the service.
|
||||
storeAndForwardService.StartAsync().GetAwaiter().GetResult();
|
||||
// any actor or HTTP handler touches the service. Host-005: awaited rather
|
||||
// than blocked via GetAwaiter().GetResult() — no thread-pool starvation /
|
||||
// sync-context deadlock risk, and exceptions surface as their original type.
|
||||
cancellationToken.ThrowIfCancellationRequested();
|
||||
await storeAndForwardService.StartAsync();
|
||||
|
||||
// Register the store-and-forward delivery handlers so buffered
|
||||
// ExternalSystem calls, cached DB writes and notifications are actually
|
||||
@@ -413,7 +446,22 @@ akka {{
|
||||
contacts.Count, _nodeOptions.SiteId);
|
||||
}
|
||||
|
||||
// Gate gRPC subscriptions until the actor system and SiteStreamManager are initialized
|
||||
// Gate gRPC subscriptions until the actor system and SiteStreamManager are
|
||||
// initialized (REQ-HOST-7).
|
||||
//
|
||||
// Host-009: SetReady asserts a deliberately narrow contract. By this point the
|
||||
// actor system exists, SiteStreamManager.Initialize has run, and every
|
||||
// role actor (SiteCommunicationActor, deployment-manager singleton,
|
||||
// SiteReplicationActor, the ClusterClient) has been created with ActorOf —
|
||||
// creation and the registration Tells are synchronous and strictly ordered.
|
||||
// What is NOT guaranteed is completion of each actor's PreStart or the
|
||||
// ClusterClient's initial-contact handshake with central: those are
|
||||
// intentionally asynchronous. Gating readiness on the central handshake would
|
||||
// be wrong — a site must come up and stream locally even while central is
|
||||
// briefly unreachable. gRPC readiness therefore guarantees "the site actor
|
||||
// graph exists and can accept subscription streams", not "the cluster
|
||||
// handshake has completed". Streams opened before SetReady are already
|
||||
// rejected by SiteStreamGrpcServer with StatusCode.Unavailable.
|
||||
var grpcServer = _serviceProvider.GetService<ScadaLink.Communication.Grpc.SiteStreamGrpcServer>();
|
||||
grpcServer?.SetReady(_actorSystem!);
|
||||
}
|
||||
|
||||
@@ -3,6 +3,5 @@ namespace ScadaLink.Host;
|
||||
public class DatabaseOptions
|
||||
{
|
||||
public string? ConfigurationDb { get; set; }
|
||||
public string? MachineDataDb { get; set; }
|
||||
public string? SiteDbPath { get; set; }
|
||||
}
|
||||
|
||||
48
src/ScadaLink.Host/LoggerConfigurationFactory.cs
Normal file
48
src/ScadaLink.Host/LoggerConfigurationFactory.cs
Normal file
@@ -0,0 +1,48 @@
|
||||
using Serilog;
|
||||
using Serilog.Events;
|
||||
|
||||
namespace ScadaLink.Host;
|
||||
|
||||
/// <summary>
|
||||
/// Builds the Serilog <see cref="LoggerConfiguration"/> for the Host process.
|
||||
///
|
||||
/// REQ-HOST-8 / Host-011: the configured minimum level comes from
|
||||
/// <c>ScadaLink:Logging:MinimumLevel</c> (bound to <see cref="LoggingOptions"/>) so an
|
||||
/// operator editing that key changes the effective log level. The standard
|
||||
/// <c>Serilog</c> configuration section is still read (via
|
||||
/// <see cref="Serilog.Configuration.ConfigurationLoggerConfigurationExtensions"/>)
|
||||
/// for sink/override customisation; the explicit <c>MinimumLevel.Is</c> below pins
|
||||
/// the floor from <see cref="LoggingOptions"/>.
|
||||
/// </summary>
|
||||
public static class LoggerConfigurationFactory
{
    /// <summary>
    /// Creates the base <see cref="LoggerConfiguration"/>: reads the configuration,
    /// sets the minimum level from the <c>ScadaLink:Logging</c> section, and enriches
    /// every event with the <c>SiteId</c>, <c>NodeHostname</c> and <c>NodeRole</c>
    /// properties. Sinks are left to the caller.
    /// </summary>
    /// <param name="configuration">Application configuration to read from.</param>
    /// <param name="nodeRole">Value for the <c>NodeRole</c> property.</param>
    /// <param name="siteId">Value for the <c>SiteId</c> property.</param>
    /// <param name="nodeHostname">Value for the <c>NodeHostname</c> property.</param>
    /// <returns>The configured, not yet created, logger configuration.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="configuration"/> is null.</exception>
    public static LoggerConfiguration Build(
        IConfiguration configuration,
        string nodeRole,
        string siteId,
        string nodeHostname)
    {
        ArgumentNullException.ThrowIfNull(configuration);

        var loggingOptions = new LoggingOptions();
        configuration.GetSection("ScadaLink:Logging").Bind(loggingOptions);

        var minimumLevel = ParseLevel(loggingOptions.MinimumLevel);

        // MinimumLevel.Is is applied after ReadFrom.Configuration so the level from
        // LoggingOptions is the one that takes effect.
        return new LoggerConfiguration()
            .ReadFrom.Configuration(configuration)
            .MinimumLevel.Is(minimumLevel)
            .Enrich.WithProperty("SiteId", siteId)
            .Enrich.WithProperty("NodeHostname", nodeHostname)
            .Enrich.WithProperty("NodeRole", nodeRole);
    }

    /// <summary>
    /// Parses a Serilog <see cref="LogEventLevel"/> name (case-insensitive), falling
    /// back to <see cref="LogEventLevel.Information"/> for null/blank/unrecognised
    /// values.
    /// </summary>
    private static LogEventLevel ParseLevel(string? level)
    {
        // Enum.TryParse also succeeds for arbitrary numeric text such as "42", which
        // would produce a value that is not a real level. Enum.IsDefined rejects
        // those so they take the documented Information fallback.
        var recognised =
            Enum.TryParse<LogEventLevel>(level, ignoreCase: true, out var parsed)
            && Enum.IsDefined(parsed);

        return recognised ? parsed : LogEventLevel.Information;
    }
}
|
||||
@@ -38,12 +38,10 @@ var nodeRole = configuration["ScadaLink:Node:Role"]!;
|
||||
var nodeHostname = configuration["ScadaLink:Node:NodeHostname"] ?? "unknown";
|
||||
var siteId = configuration["ScadaLink:Node:SiteId"] ?? "central";
|
||||
|
||||
// WP-14: Serilog structured logging
|
||||
Log.Logger = new LoggerConfiguration()
|
||||
.ReadFrom.Configuration(configuration)
|
||||
.Enrich.WithProperty("SiteId", siteId)
|
||||
.Enrich.WithProperty("NodeHostname", nodeHostname)
|
||||
.Enrich.WithProperty("NodeRole", nodeRole)
|
||||
// WP-14: Serilog structured logging.
|
||||
// Host-011: minimum level is driven by ScadaLink:Logging:MinimumLevel (LoggingOptions).
|
||||
Log.Logger = ScadaLink.Host.LoggerConfigurationFactory
|
||||
.Build(configuration, nodeRole, siteId, nodeHostname)
|
||||
.WriteTo.Console(outputTemplate:
|
||||
"[{Timestamp:HH:mm:ss} {Level:u3}] [{NodeRole}/{NodeHostname}] {Message:lj}{NewLine}{Exception}")
|
||||
.WriteTo.File("logs/scadalink-.log", rollingInterval: Serilog.RollingInterval.Day)
|
||||
@@ -116,14 +114,24 @@ try
|
||||
{
|
||||
var isDevelopment = app.Environment.IsDevelopment()
|
||||
|| string.Equals(Environment.GetEnvironmentVariable("ASPNETCORE_ENVIRONMENT"), "Development", StringComparison.OrdinalIgnoreCase);
|
||||
using (var scope = app.Services.CreateScope())
|
||||
{
|
||||
var dbContext = scope.ServiceProvider.GetRequiredService<ScadaLinkDbContext>();
|
||||
var migrationLogger = scope.ServiceProvider
|
||||
.GetRequiredService<ILoggerFactory>()
|
||||
.CreateLogger(typeof(MigrationHelper).FullName!);
|
||||
await MigrationHelper.ApplyOrValidateMigrationsAsync(dbContext, isDevelopment, migrationLogger);
|
||||
}
|
||||
var migrationLogger = app.Services
|
||||
.GetRequiredService<ILoggerFactory>()
|
||||
.CreateLogger(typeof(MigrationHelper).FullName!);
|
||||
|
||||
// Host-010: tolerate a database that is briefly unreachable at boot
|
||||
// (e.g. app and DB containers starting together) with a bounded
|
||||
// exponential backoff before failing fatally.
|
||||
await StartupRetry.ExecuteWithRetryAsync(
|
||||
"database-migration",
|
||||
async () =>
|
||||
{
|
||||
using var scope = app.Services.CreateScope();
|
||||
var dbContext = scope.ServiceProvider.GetRequiredService<ScadaLinkDbContext>();
|
||||
await MigrationHelper.ApplyOrValidateMigrationsAsync(dbContext, isDevelopment, migrationLogger);
|
||||
},
|
||||
maxAttempts: 8,
|
||||
initialDelay: TimeSpan.FromSeconds(2),
|
||||
migrationLogger);
|
||||
}
|
||||
|
||||
// Middleware pipeline
|
||||
|
||||
48
src/ScadaLink.Host/StartupRetry.cs
Normal file
48
src/ScadaLink.Host/StartupRetry.cs
Normal file
@@ -0,0 +1,48 @@
|
||||
namespace ScadaLink.Host;
|
||||
|
||||
/// <summary>
|
||||
/// Bounded retry-with-backoff for startup preconditions.
|
||||
///
|
||||
/// Host-010 / REQ-HOST-4a: a Central node applies/validates database migrations
|
||||
/// before the host begins serving traffic. In container orchestration the database
|
||||
/// and the app frequently start together, so the database may be briefly
|
||||
/// unreachable. Rather than crashing the process on the first connection failure,
|
||||
/// the migration step is wrapped in this bounded exponential backoff: it tolerates a
|
||||
/// short outage and only fails fatally once attempts are exhausted.
|
||||
/// </summary>
|
||||
public static class StartupRetry
{
    /// <summary>Upper bound for the delay between two attempts.</summary>
    private static readonly TimeSpan MaxDelay = TimeSpan.FromSeconds(30);

    /// <summary>
    /// Runs <paramref name="operation"/> until it completes without throwing or
    /// <paramref name="maxAttempts"/> attempts have failed. After each failed attempt
    /// (other than the last) a warning is logged and the method waits; the wait starts
    /// at <paramref name="initialDelay"/> and doubles each time, capped at 30 seconds.
    /// The exception from the final attempt propagates to the caller unchanged.
    /// </summary>
    /// <param name="operationName">Name used in log messages.</param>
    /// <param name="operation">The startup step to execute.</param>
    /// <param name="maxAttempts">Total number of attempts; must be at least 1.</param>
    /// <param name="initialDelay">Delay before the second attempt; must not be negative.</param>
    /// <param name="logger">Receives the retry warnings and the late-success message.</param>
    /// <param name="cancellationToken">Checked before every attempt and during every delay.</param>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="operation"/> or <paramref name="logger"/> is null.
    /// </exception>
    /// <exception cref="ArgumentOutOfRangeException">
    /// <paramref name="maxAttempts"/> is less than 1 or <paramref name="initialDelay"/> is negative.
    /// </exception>
    /// <exception cref="OperationCanceledException">
    /// <paramref name="cancellationToken"/> was cancelled.
    /// </exception>
    public static async Task ExecuteWithRetryAsync(
        string operationName,
        Func<Task> operation,
        int maxAttempts,
        TimeSpan initialDelay,
        ILogger logger,
        CancellationToken cancellationToken = default)
    {
        ArgumentNullException.ThrowIfNull(operation);
        ArgumentNullException.ThrowIfNull(logger);
        if (maxAttempts < 1)
        {
            throw new ArgumentOutOfRangeException(
                nameof(maxAttempts), maxAttempts, "At least one attempt is required.");
        }

        if (initialDelay < TimeSpan.Zero)
        {
            throw new ArgumentOutOfRangeException(
                nameof(initialDelay), initialDelay, "The initial delay must not be negative.");
        }

        var delay = initialDelay;
        for (var attempt = 1; ; attempt++)
        {
            cancellationToken.ThrowIfCancellationRequested();
            try
            {
                await operation();
                if (attempt > 1)
                {
                    logger.LogInformation(
                        "Startup operation '{Operation}' succeeded on attempt {Attempt}.",
                        operationName, attempt);
                }

                return;
            }
            catch (Exception ex) when (attempt < maxAttempts)
            {
                // If shutdown was requested while the attempt was running there will
                // be no retry; surface the cancellation instead of logging a
                // misleading "retrying" warning.
                cancellationToken.ThrowIfCancellationRequested();

                logger.LogWarning(ex,
                    "Startup operation '{Operation}' failed on attempt {Attempt}/{MaxAttempts}; " +
                    "retrying in {Delay}.",
                    operationName, attempt, maxAttempts, delay);
                await Task.Delay(delay, cancellationToken);

                // Exponential backoff, capped so the total wait stays bounded. The
                // comparison happens before doubling so a very large delay cannot
                // overflow the tick count.
                delay = delay.Ticks >= MaxDelay.Ticks / 2
                    ? MaxDelay
                    : TimeSpan.FromTicks(delay.Ticks * 2);
            }
        }
    }
}
|
||||
@@ -30,8 +30,6 @@ public static class StartupValidator
|
||||
var dbSection = configuration.GetSection("ScadaLink:Database");
|
||||
if (string.IsNullOrEmpty(dbSection["ConfigurationDb"]))
|
||||
errors.Add("ScadaLink:Database:ConfigurationDb connection string required for Central");
|
||||
if (string.IsNullOrEmpty(dbSection["MachineDataDb"]))
|
||||
errors.Add("ScadaLink:Database:MachineDataDb connection string required for Central");
|
||||
|
||||
var secSection = configuration.GetSection("ScadaLink:Security");
|
||||
if (string.IsNullOrEmpty(secSection["LdapServer"]))
|
||||
@@ -51,6 +49,13 @@ public static class StartupValidator
|
||||
if (grpcPortStr != null && (!int.TryParse(grpcPortStr, out grpcPort) || grpcPort < 1 || grpcPort > 65535))
|
||||
errors.Add("ScadaLink:Node:GrpcPort must be 1-65535");
|
||||
|
||||
// Host-007 / REQ-HOST-4: the gRPC (Kestrel HTTP/2) port and the Akka
|
||||
// remoting port must differ. Identical values make Kestrel and
|
||||
// Akka.Remote contend for the same TCP port and fail opaquely at
|
||||
// runtime. Uses the resolved GrpcPort, including the 8083 default.
|
||||
if (port == grpcPort)
|
||||
errors.Add("ScadaLink:Node:GrpcPort must differ from RemotingPort");
|
||||
|
||||
var dbSection = configuration.GetSection("ScadaLink:Database");
|
||||
if (string.IsNullOrEmpty(dbSection["SiteDbPath"]))
|
||||
errors.Add("ScadaLink:Database:SiteDbPath required for Site nodes");
|
||||
|
||||
@@ -16,10 +16,9 @@
|
||||
"FailureDetectionThreshold": "00:00:10",
|
||||
"MinNrOfMembers": 1
|
||||
},
|
||||
"_secrets": "Host-003: Secrets are NOT committed in this file. Supply them via environment variables, which the Host's configuration builder (AddEnvironmentVariables) overlays over this file. Required: ScadaLink__Database__ConfigurationDb, ScadaLink__Database__MachineDataDb, ScadaLink__Security__LdapServiceAccountPassword, ScadaLink__Security__JwtSigningKey. The ${...} placeholders below are intentionally non-functional and must be overridden per environment.",
|
||||
"_secrets": "Host-003: Secrets are NOT committed in this file. Supply them via environment variables, which the Host's configuration builder (AddEnvironmentVariables) overlays over this file. Required: ScadaLink__Database__ConfigurationDb, ScadaLink__Security__LdapServiceAccountPassword, ScadaLink__Security__JwtSigningKey. The ${...} placeholders below are intentionally non-functional and must be overridden per environment.",
|
||||
"Database": {
|
||||
"ConfigurationDb": "${SCADALINK_CONFIGURATIONDB_CONNECTION_STRING}",
|
||||
"MachineDataDb": "${SCADALINK_MACHINEDATADB_CONNECTION_STRING}"
|
||||
"ConfigurationDb": "${SCADALINK_CONFIGURATIONDB_CONNECTION_STRING}"
|
||||
},
|
||||
"Security": {
|
||||
"LdapServer": "localhost",
|
||||
|
||||
Reference in New Issue
Block a user