6ae0fea558
Async cancellation hygiene, fire-and-forget observability, retry/shutdown semantics, and audit-row coverage across 9 modules. Highlights: Cancellation & lifecycle: - AuditLog-006: SqliteAuditWriter.Dispose hops to thread pool, escaping the captured SyncContext that risked sync-over-async deadlock. - AuditLog-010: SiteAuditTelemetryActor owns a private lifecycle CTS, threaded through drain paths instead of CancellationToken.None. - Comm-019: CentralCommunicationActor adds lifecycle CTS for repo calls. - Host-019: Migration StartupRetry forwards ApplicationStopping so SIGTERM during the bounded-retry window aborts cleanly. Cursor / retry / counter correctness: - AuditLog-004: SiteAuditReconciliationActor's cursor now holds at `since` when any row's idempotent insert is still being retried (per-EventId retry counter, MaxPermanentInsertAttempts=5 escape valve with LogCritical abandon). No more silent abandonment of permanently-failing rows. - ConfigDB-019: Dropped the catch-and-continue on EnsureLookaheadAsync's SPLIT loop — by class-doc construction the catch could only mask real failures and let the next iteration create permanent partition holes. - HM-017/018: HealthReportSender + CentralHealthReportLoop snapshot per-interval counters before sending, restore via new ISiteHealthCollector.AddIntervalCounters on transport failure so counts aren't silently lost. Fire-and-forget / shutdown waits: - InboundAPI-018: AuditWriteMiddleware observes faulted audit-write tasks via OnlyOnFaulted continuation (Warning log; response unchanged). - SnF-024: StoreAndForwardService.StopAsync awaits in-flight retry sweep with a bounded SweepShutdownWaitTimeout (10s). Leak / refactor: - Comm-021: SiteStreamGrpcServer.SubscribeInstance wraps Subscribe in its own try/catch so a throw doesn't leak the relay actor or _activeStreams entry. - Comm-022: VERIFIED already-closed by Comm-016's dead-code purge. 
- CLI-017: BundleCommands' three subcommands delegate to ExecuteCommandAsync (auth-failure exit-code contract unified). Defensive / validation: - CLI-021: CliConfig.Load wraps file-read/JSON parse so malformed config prints a warning and returns defaults instead of crashing the CLI. - Host-022: ParseLevel emits stderr one-shot warning for unrecognised MinimumLevel instead of silently coercing to Information. - ESG-019: ExternalSystemClient sets HttpClient.Timeout=Infinite so the per-call CTS is the sole timeout source (was clipped to 100s by .NET). - Security-020: New SecurityOptionsValidator (IValidateOptions) rejects empty LdapServer/LdapSearchBase with ValidateOnStart. - DM-019: Lifecycle command timeouts now emit DisableTimedOut/EnableTimedOut/ DeleteTimedOut audit entries (mirrors DeployFailed pattern). Plus reconciled stale per-module Open-findings counters that had drifted from prior sessions. 20+ new regression tests across 11 test projects; build clean; affected suites all green. README regenerated: 75 open (was 93).
312 lines
15 KiB
C#
312 lines
15 KiB
C#
using HealthChecks.UI.Client;
|
|
using Microsoft.AspNetCore.Diagnostics.HealthChecks;
|
|
using ScadaLink.AuditLog;
|
|
using ScadaLink.CentralUI;
|
|
using ScadaLink.ClusterInfrastructure;
|
|
using ScadaLink.Communication;
|
|
using ScadaLink.ConfigurationDatabase;
|
|
using ScadaLink.DeploymentManager;
|
|
using ScadaLink.ExternalSystemGateway;
|
|
using ScadaLink.HealthMonitoring;
|
|
using ScadaLink.Host;
|
|
using ScadaLink.Host.Actors;
|
|
using ScadaLink.Host.Health;
|
|
using ScadaLink.InboundAPI;
|
|
using ScadaLink.InboundAPI.Middleware;
|
|
using ScadaLink.ManagementService;
|
|
using ScadaLink.NotificationOutbox;
|
|
using ScadaLink.NotificationService;
|
|
using ScadaLink.Security;
|
|
using ScadaLink.SiteCallAudit;
|
|
using ScadaLink.TemplateEngine;
|
|
using ScadaLink.Transport;
|
|
using Serilog;
|
|
|
|
// The role-specific settings file (Central or Site) is selected by SCADALINK_CONFIG.
// DOTNET_ENVIRONMENT/ASPNETCORE_ENVIRONMENT remain "Development" for dev tooling
// (static assets, EF migrations, etc.), so SCADALINK_CONFIG takes precedence here and
// DOTNET_ENVIRONMENT is only the fallback before the "Production" default.
var scadalinkConfig = Environment.GetEnvironmentVariable("SCADALINK_CONFIG");
scadalinkConfig ??= Environment.GetEnvironmentVariable("DOTNET_ENVIRONMENT");
scadalinkConfig ??= "Production";

// Sources are added lowest-priority first: base settings file, optional role-specific
// file, environment variables, then command-line arguments.
var configurationBuilder = new ConfigurationBuilder();
configurationBuilder.AddJsonFile("appsettings.json", optional: false);
configurationBuilder.AddJsonFile($"appsettings.{scadalinkConfig}.json", optional: true);
configurationBuilder.AddEnvironmentVariables();
configurationBuilder.AddCommandLine(args);

var configuration = configurationBuilder.Build();
|
|
|
|
// WP-11: run the full startup validation up front so a bad configuration fails fast,
// before any DI container or actor system is set up.
StartupValidator.Validate(configuration);

// Node identity values, read here so they can be used for Serilog enrichment.
var nodeSection = configuration.GetSection("ScadaLink:Node");
var nodeRole = nodeSection["Role"]!;
var nodeHostname = nodeSection["NodeHostname"] ?? "unknown";
var siteId = nodeSection["SiteId"] ?? "central";
|
|
|
|
// WP-14: Serilog structured logging.
// Host-011: the minimum level comes from ScadaLink:Logging:MinimumLevel (LoggingOptions).
// Host-014: the console and file sinks live in the `Serilog` configuration section
// (appsettings.json) and are applied via ReadFrom.Configuration inside the factory.
// Per REQ-HOST-8 the sink set, output template, file path and rolling interval are
// all configuration-driven rather than hard-coded here.
var loggerConfiguration = ScadaLink.Host.LoggerConfigurationFactory.Build(
    configuration,
    nodeRole,
    siteId,
    nodeHostname);

Log.Logger = loggerConfiguration.CreateLogger();
|
|
|
|
// Host entry point: builds and runs either the Central or the Site web host depending on
// the configured node role. Any exception is logged as fatal and rethrown; the Serilog
// pipeline is always flushed on the way out.
try
{
    Log.Information("Starting ScadaLink host as {Role} on {Hostname}", nodeRole, nodeHostname);

    if (nodeRole.Equals("Central", StringComparison.OrdinalIgnoreCase))
    {
        var centralBuilder = WebApplication.CreateBuilder(args);
        var centralConfiguration = centralBuilder.Configuration;
        var services = centralBuilder.Services;

        centralConfiguration.AddConfiguration(configuration);

        // WP-14: Serilog
        centralBuilder.Host.UseSerilog();

        // WP-17: Windows Service support (no-op when not running as a Windows Service)
        centralBuilder.Host.UseWindowsService();

        // ----- Shared components -----
        services.AddClusterInfrastructure();
        services.AddCommunication();
        services.AddHealthMonitoring();
        services.AddCentralHealthAggregation();
        services.AddExternalSystemGateway();
        services.AddNotificationService();

        // ----- Central-only components -----

        // Notification Outbox: SMTP delivery is owned by central; the Email adapter reuses
        // the SMTP machinery registered by AddNotificationService() above.
        // AddNotificationOutbox binds NotificationOutboxOptions via BindConfiguration, so
        // no explicit Configure call is needed.
        services.AddNotificationOutbox();

        // Transport (#24): central-only bundle export/import pipeline. TransportOptions is
        // bound from ScadaLink:Transport via BindConfiguration; no explicit Configure needed.
        services.AddTransport();

        // Audit Log (#23): the central node owns the AuditLogIngestActor singleton and
        // IAuditLogRepository. The site writer chain is still registered (lazy singletons)
        // but is never resolved on a central node.
        services.AddAuditLog(centralConfiguration);

        // #23 M6-T5 Bundle D: central-only hosted service that rolls pf_AuditLog_Month
        // forward monthly. It depends on IPartitionMaintenance, which is registered
        // further down by AddConfigurationDatabase.
        services.AddAuditLogCentralMaintenance(centralConfiguration);

        // Site Call Audit (#22): the central node owns the SiteCallAuditActor singleton
        // (M3 Bundle F). The extension currently registers nothing (actor Props are
        // constructed inline in AkkaHostedService); the call is kept for symmetry with the
        // other audit composition roots so future per-actor DI lands without touching
        // Program.cs.
        services.AddSiteCallAudit();
        services.AddTemplateEngine();
        services.AddDeploymentManager();
        services.AddSecurity();
        services.AddCentralUI();
        services.AddInboundAPI();
        services.AddManagementService();

        var configDbConnectionString = configuration["ScadaLink:Database:ConfigurationDb"];
        if (configDbConnectionString is null)
        {
            throw new InvalidOperationException("ScadaLink:Database:ConfigurationDb connection string is required for Central role.");
        }

        services.AddConfigurationDatabase(configDbConnectionString);

        // WP-12: health checks used for readiness gating
        var healthChecks = services.AddHealthChecks();
        healthChecks.AddCheck<DatabaseHealthCheck>("database");
        healthChecks.AddCheck<AkkaClusterHealthCheck>("akka-cluster");
        healthChecks.AddCheck<ActiveNodeHealthCheck>("active-node");

        // WP-13: Akka.NET bootstrap via hosted service. The service is registered as a
        // singleton and the hosted-service registration resolves that same instance.
        services.AddSingleton<AkkaHostedService>();
        services.AddHostedService(provider => provider.GetRequiredService<AkkaHostedService>());

        // InboundAPI-022: register the production IActiveNodeGate so standby-node gating
        // is actually enforced. The InboundApiEndpointFilter consults IActiveNodeGate and
        // defaults to "allow" when none is registered, which would leave the design's
        // "central cluster only (active node)" guarantee unenforced in deployed binaries.
        // The gate is backed by the same Akka cluster-leadership check as
        // ActiveNodeHealthCheck above, so the inbound API and the /health/active endpoint
        // Traefik routes against agree on which node is active.
        services.AddSingleton<ScadaLink.InboundAPI.IActiveNodeGate, ActiveNodeGate>();

        // Cluster node status provider scoped to the Central role; it feeds the
        // CentralHealthReportLoop so the central cluster appears on /monitoring/health.
        services.AddSingleton<IClusterNodeProvider>(provider =>
            new AkkaClusterNodeProvider(provider.GetRequiredService<AkkaHostedService>(), "Central"));

        // ----- Options binding -----
        SiteServiceRegistration.BindSharedOptions(services, centralConfiguration);
        services.Configure<SecurityOptions>(centralConfiguration.GetSection("ScadaLink:Security"));
        services.Configure<InboundApiOptions>(centralConfiguration.GetSection("ScadaLink:InboundApi"));
        services.Configure<DeploymentManagerOptions>(
            centralConfiguration.GetSection(ScadaLink.DeploymentManager.ServiceCollectionExtensions.OptionsSection));

        var centralApp = centralBuilder.Build();

        // Apply or validate database migrations (skipped when running in the test harness)
        var skipMigrations = string.Equals(
            configuration["ScadaLink:Database:SkipMigrations"],
            "true",
            StringComparison.OrdinalIgnoreCase);

        if (!skipMigrations)
        {
            var isDevelopment = centralApp.Environment.IsDevelopment()
                || string.Equals(Environment.GetEnvironmentVariable("ASPNETCORE_ENVIRONMENT"), "Development", StringComparison.OrdinalIgnoreCase);

            var loggerFactory = centralApp.Services.GetRequiredService<ILoggerFactory>();
            var migrationLogger = loggerFactory.CreateLogger(typeof(MigrationHelper).FullName!);

            // Host-010: tolerate a database that is briefly unreachable at boot (e.g. app
            // and DB containers starting together) using a bounded exponential backoff
            // before failing fatally.
            // Host-015: only connection-class (transient) faults are retried; a
            // schema-version mismatch is permanent and must fail fast on attempt 1.
            // Host-019: the host's ApplicationStopping token is threaded into both the
            // migration call and the inter-attempt Task.Delay so a SIGTERM during the
            // bounded-retry window (~2 min worst-case) tears down cleanly instead of being
            // ignored until the loop exhausts.
            await StartupRetry.ExecuteWithRetryAsync(
                "database-migration",
                async ct =>
                {
                    // A fresh DI scope (and therefore DbContext) is created per attempt.
                    using (var migrationScope = centralApp.Services.CreateScope())
                    {
                        var dbContext = migrationScope.ServiceProvider.GetRequiredService<ScadaLinkDbContext>();
                        await MigrationHelper.ApplyOrValidateMigrationsAsync(dbContext, isDevelopment, migrationLogger, ct);
                    }
                },
                maxAttempts: 8,
                initialDelay: TimeSpan.FromSeconds(2),
                migrationLogger,
                isTransient: StartupRetry.IsTransientDatabaseFault,
                cancellationToken: centralApp.Lifetime.ApplicationStopping);
        }

        // ----- Middleware pipeline (order matters) -----
        centralApp.UseWebSockets();
        centralApp.UseRouting();
        centralApp.UseAuthentication();
        centralApp.UseAuthorization();
        centralApp.UseAntiforgery();

        // Audit Log #23 (M4 Bundle D, T8): emit one InboundRequest/InboundAuthFailure audit
        // row per call into the inbound API. Placed AFTER UseAuthentication/UseAuthorization
        // so any HttpContext.User the framework populates is in place, and scoped to the
        // /api/ prefix so it never observes the Central UI, Management API, SignalR hubs,
        // or health endpoints. The endpoint handler is responsible for stashing the
        // resolved API key name on HttpContext.Items (see
        // AuditWriteMiddleware.AuditActorItemKey) AFTER its in-handler API key validation
        // succeeds.
        centralApp.UseWhen(
            httpContext => httpContext.Request.Path.StartsWithSegments("/api"),
            apiBranch => { apiBranch.UseAuditWriteMiddleware(); });

        // WP-12: readiness endpoint; returns 503 until ready, 200 when ready.
        // REQ-HOST-4a defines readiness as cluster membership + DB connectivity, explicitly
        // NOT cluster leadership. The leader-only "active-node" check is excluded here so a
        // fully operational standby central node reports ready; leadership is reported
        // separately on /health/active.
        var readinessOptions = new HealthCheckOptions
        {
            Predicate = registration => registration.Name is not "active-node",
            ResponseWriter = UIResponseWriter.WriteHealthCheckUIResponse
        };
        centralApp.MapHealthChecks("/health/ready", readinessOptions);

        // Active node endpoint: returns 200 only on the cluster leader; used by Traefik
        // for routing.
        var activeNodeOptions = new HealthCheckOptions
        {
            Predicate = registration => registration.Name is "active-node",
            ResponseWriter = UIResponseWriter.WriteHealthCheckUIResponse
        };
        centralApp.MapHealthChecks("/health/active", activeNodeOptions);

        centralApp.MapStaticAssets();
        centralApp.MapCentralUI<ScadaLink.Host.Components.App>();
        centralApp.MapInboundAPI();
        centralApp.MapManagementAPI();

        // Audit Log #23 (M8): CLI-facing /api/audit/{query,export} routes. Same Basic-Auth +
        // LDAP mechanism as /management; gated on the OperationalAudit / AuditExport role
        // sets.
        centralApp.MapAuditAPI();
        centralApp.MapHub<ScadaLink.ManagementService.DebugStreamHub>("/hubs/debug-stream");

        // Compile and register all Inbound API method scripts at startup. The repository
        // is resolved from a temporary scope; the executor comes from the root provider.
        {
            using var startupScope = centralApp.Services.CreateScope();

            var apiRepo = startupScope.ServiceProvider
                .GetRequiredService<ScadaLink.Commons.Interfaces.Repositories.IInboundApiRepository>();
            var executor = centralApp.Services.GetRequiredService<ScadaLink.InboundAPI.InboundScriptExecutor>();

            var apiMethods = await apiRepo.GetAllApiMethodsAsync();
            foreach (var apiMethod in apiMethods)
            {
                executor.CompileAndRegister(apiMethod);
            }
        }

        await centralApp.RunAsync();
    }
    else if (nodeRole.Equals("Site", StringComparison.OrdinalIgnoreCase))
    {
        var siteBuilder = WebApplication.CreateBuilder(args);
        var services = siteBuilder.Services;

        siteBuilder.Configuration.AddConfiguration(configuration);

        // WP-14: Serilog
        siteBuilder.Host.UseSerilog();

        // WP-17: Windows Service support (no-op when not running as a Windows Service)
        siteBuilder.Host.UseWindowsService();

        // Read GrpcPort from config (NodeOptions already has default 8083)
        var grpcPort = configuration.GetValue<int>("ScadaLink:Node:GrpcPort", 8083);

        // Configure Kestrel for HTTP/2 only on the gRPC port
        siteBuilder.WebHost.ConfigureKestrel(kestrel =>
            kestrel.ListenAnyIP(
                grpcPort,
                listener => listener.Protocols = Microsoft.AspNetCore.Server.Kestrel.Core.HttpProtocols.Http2));

        // gRPC server registration
        services.AddGrpc();
        services.AddSingleton<ScadaLink.Communication.Grpc.SiteStreamGrpcServer>();

        // Existing site service registrations
        SiteServiceRegistration.Configure(services, siteBuilder.Configuration);

        var siteApp = siteBuilder.Build();

        // Map gRPC service; this resolves the singleton SiteStreamGrpcServer from DI
        siteApp.MapGrpcService<ScadaLink.Communication.Grpc.SiteStreamGrpcServer>();

        // Host-017 / REQ-HOST-7: site-shutdown ordering. ApplicationStopping fires BEFORE
        // IHostedService.StopAsync runs, so the gRPC server refuses new streams
        // (Unavailable) and cancels every active stream here (clients observe a clean
        // Cancelled and reconnect), and only THEN does AkkaHostedService run
        // CoordinatedShutdown and tear down actors. Without this hand-off, in-flight
        // streams go silent and only time out via gRPC keepalive (~25 s), violating the
        // documented four-step sequence.
        var siteLifetime = siteApp.Services.GetRequiredService<Microsoft.Extensions.Hosting.IHostApplicationLifetime>();
        var siteGrpcServer = siteApp.Services.GetRequiredService<ScadaLink.Communication.Grpc.SiteStreamGrpcServer>();
        siteLifetime.ApplicationStopping.Register(() => { siteGrpcServer.CancelAllStreams(); });

        await siteApp.RunAsync();
    }
    else
    {
        throw new InvalidOperationException($"Unknown role: {nodeRole}. Must be 'Central' or 'Site'.");
    }
}
catch (Exception ex)
{
    // Record the failure, then rethrow so the original exception still propagates.
    Log.Fatal(ex, "ScadaLink host terminated unexpectedly");
    throw;
}
finally
{
    // Flush buffered log events on every exit path.
    await Log.CloseAndFlushAsync();
}
|
|
|
|
/// <summary>
/// Public partial declaration of the compiler-generated <c>Program</c> class so that test
/// infrastructure (e.g. WebApplicationFactory) can reference it as a type.
/// </summary>
public partial class Program
{
}
|