using HealthChecks.UI.Client;
using Microsoft.AspNetCore.Diagnostics.HealthChecks;
using ZB.MOM.WW.ScadaBridge.AuditLog;
using ZB.MOM.WW.ScadaBridge.CentralUI;
using ZB.MOM.WW.ScadaBridge.ClusterInfrastructure;
using ZB.MOM.WW.ScadaBridge.Communication;
using ZB.MOM.WW.ScadaBridge.ConfigurationDatabase;
using ZB.MOM.WW.ScadaBridge.DeploymentManager;
using ZB.MOM.WW.ScadaBridge.ExternalSystemGateway;
using ZB.MOM.WW.ScadaBridge.HealthMonitoring;
using ZB.MOM.WW.ScadaBridge.Host;
using ZB.MOM.WW.ScadaBridge.Host.Actors;
using ZB.MOM.WW.ScadaBridge.Host.Health;
using ZB.MOM.WW.ScadaBridge.InboundAPI;
using ZB.MOM.WW.ScadaBridge.InboundAPI.Middleware;
using ZB.MOM.WW.ScadaBridge.ManagementService;
using ZB.MOM.WW.ScadaBridge.NotificationOutbox;
using ZB.MOM.WW.ScadaBridge.NotificationService;
using ZB.MOM.WW.ScadaBridge.Security;
using ZB.MOM.WW.ScadaBridge.SiteCallAudit;
using ZB.MOM.WW.ScadaBridge.TemplateEngine;
using ZB.MOM.WW.ScadaBridge.Transport;
using Serilog;

// Host entry point. Loads layered configuration, validates it, configures Serilog,
// then builds and runs either the Central web host or the Site gRPC host depending
// on ScadaBridge:Node:Role.
//
// NOTE(review): several generic calls below appear without a type argument in this
// copy of the file (e.g. AddCheck("database"), AddSingleton(), GetRequiredService(),
// Configure(...), MapHub(...), MapGrpcService(), GetValue(...)). They are reproduced
// verbatim rather than guessed; confirm the type arguments against source control
// before building.

// SCADABRIDGE_CONFIG determines which role-specific config to load (Central or Site).
// DOTNET_ENVIRONMENT/ASPNETCORE_ENVIRONMENT stay as "Development" for dev tooling
// (static assets, EF migrations, etc.).
var scadabridgeConfig = Environment.GetEnvironmentVariable("SCADABRIDGE_CONFIG")
    ?? Environment.GetEnvironmentVariable("DOTNET_ENVIRONMENT")
    ?? "Production";

// Later sources override earlier ones: base JSON, role-specific JSON, environment
// variables, then command-line arguments.
var configuration = new ConfigurationBuilder()
    .AddJsonFile("appsettings.json", optional: false)
    .AddJsonFile($"appsettings.{scadabridgeConfig}.json", optional: true)
    .AddEnvironmentVariables()
    .AddCommandLine(args)
    .Build();

// WP-11: Full startup validation — fail fast before any DI or actor system setup
StartupValidator.Validate(configuration);

// Read node options for Serilog enrichment
var nodeRole = configuration["ScadaBridge:Node:Role"]!;
var nodeHostname = configuration["ScadaBridge:Node:NodeHostname"] ?? "unknown";
var siteId = configuration["ScadaBridge:Node:SiteId"] ?? "central";

// WP-14: Serilog structured logging.
// Host-011: minimum level is driven by ScadaBridge:Logging:MinimumLevel (LoggingOptions).
// Host-014: console and file sinks are defined in the `Serilog` configuration
// section (appsettings.json) and applied via ReadFrom.Configuration inside the
// factory — the sink set, output template, file path and rolling interval are all
// configuration-driven per REQ-HOST-8, not hard-coded here.
Log.Logger = ZB.MOM.WW.ScadaBridge.Host.LoggerConfigurationFactory
    .Build(configuration, nodeRole, siteId, nodeHostname)
    .CreateLogger();

try
{
    Log.Information("Starting ScadaBridge host as {Role} on {Hostname}", nodeRole, nodeHostname);

    if (nodeRole.Equals("Central", StringComparison.OrdinalIgnoreCase))
    {
        var builder = WebApplication.CreateBuilder(args);
        builder.Configuration.AddConfiguration(configuration);

        // WP-14: Serilog
        builder.Host.UseSerilog();

        // WP-17: Windows Service support (no-op when not running as a Windows Service)
        builder.Host.UseWindowsService();

        // Shared components
        builder.Services.AddClusterInfrastructure();
        builder.Services.AddCommunication();
        builder.Services.AddHealthMonitoring();
        builder.Services.AddCentralHealthAggregation();
        builder.Services.AddExternalSystemGateway();
        builder.Services.AddNotificationService();

        // Central-only components
        // Notification Outbox: central owns SMTP delivery; the Email adapter reuses the
        // AddNotificationService() SMTP machinery above. AddNotificationOutbox binds
        // NotificationOutboxOptions via BindConfiguration, so no explicit Configure is needed.
        builder.Services.AddNotificationOutbox();

        // Transport (#24) — central-only bundle export/import pipeline. Binds
        // TransportOptions from ScadaBridge:Transport via BindConfiguration; no
        // explicit Configure needed.
        builder.Services.AddTransport();

        // Audit Log (#23) — central node owns the AuditLogIngestActor singleton +
        // IAuditLogRepository. The site writer chain is still registered (lazy
        // singletons) but is never resolved on a central node.
        builder.Services.AddAuditLog(builder.Configuration);

        // #23 M6-T5 Bundle D — central-only hosted service that rolls
        // pf_AuditLog_Month forward monthly. Depends on IPartitionMaintenance
        // (registered below by AddConfigurationDatabase).
        builder.Services.AddAuditLogCentralMaintenance(builder.Configuration);

        // Site Call Audit (#22) — central node owns the SiteCallAuditActor
        // singleton (M3 Bundle F). The extension itself currently registers
        // nothing — actor Props are constructed inline in AkkaHostedService —
        // but the call is here for symmetry with the other audit composition
        // roots so future per-actor DI lands without touching Program.cs.
        builder.Services.AddSiteCallAudit();

        builder.Services.AddTemplateEngine();
        builder.Services.AddDeploymentManager();
        builder.Services.AddSecurity();
        builder.Services.AddCentralUI();
        builder.Services.AddInboundAPI();
        builder.Services.AddManagementService();

        var configDbConnectionString = configuration["ScadaBridge:Database:ConfigurationDb"]
            ?? throw new InvalidOperationException("ScadaBridge:Database:ConfigurationDb connection string is required for Central role.");
        builder.Services.AddConfigurationDatabase(configDbConnectionString);

        // WP-12: Health checks for readiness gating
        builder.Services.AddHealthChecks()
            .AddCheck("database")
            .AddCheck("akka-cluster")
            .AddCheck("active-node");

        // WP-13: Akka.NET bootstrap via hosted service
        builder.Services.AddSingleton();
        builder.Services.AddHostedService(sp => sp.GetRequiredService());

        // InboundAPI-022: register the production IActiveNodeGate implementation so
        // standby-node gating is actually enforced (the InboundApiEndpointFilter
        // consults IActiveNodeGate and defaults to "allow" when none is registered,
        // which leaves the design's "central cluster only (active node)" guarantee
        // unenforced in deployed binaries). The gate is backed by the same Akka
        // cluster-leadership check as ActiveNodeHealthCheck above, so the inbound
        // API and the /health/active endpoint Traefik routes against agree on
        // which node is active.
        builder.Services.AddSingleton();

        // Cluster node status provider scoped to the Central role — feeds the
        // CentralHealthReportLoop so the central cluster appears on /monitoring/health.
        builder.Services.AddSingleton(sp =>
        {
            var akkaService = sp.GetRequiredService();
            return new AkkaClusterNodeProvider(akkaService, "Central");
        });

        // Options binding
        SiteServiceRegistration.BindSharedOptions(builder.Services, builder.Configuration);
        builder.Services.Configure(builder.Configuration.GetSection("ScadaBridge:Security"));
        builder.Services.Configure(builder.Configuration.GetSection("ScadaBridge:InboundApi"));
        builder.Services.Configure(
            builder.Configuration.GetSection(ZB.MOM.WW.ScadaBridge.DeploymentManager.ServiceCollectionExtensions.OptionsSection));

        var app = builder.Build();

        // Apply or validate database migrations (skip when running in test harness)
        if (!string.Equals(configuration["ScadaBridge:Database:SkipMigrations"], "true", StringComparison.OrdinalIgnoreCase))
        {
            var isDevelopment = app.Environment.IsDevelopment()
                || string.Equals(Environment.GetEnvironmentVariable("ASPNETCORE_ENVIRONMENT"), "Development", StringComparison.OrdinalIgnoreCase);

            var migrationLogger = app.Services
                .GetRequiredService()
                .CreateLogger(typeof(MigrationHelper).FullName!);

            // Host-010: tolerate a database that is briefly unreachable at boot
            // (e.g. app and DB containers starting together) with a bounded
            // exponential backoff before failing fatally.
            // Host-015: only connection-class (transient) faults are retried — a
            // schema-version mismatch is permanent and must fail fast on attempt 1.
            // Host-019: thread the host's ApplicationStopping token into both the
            // migration call itself and the inter-attempt Task.Delay so a SIGTERM
            // during the bounded-retry window (~2 min worst-case) tears down
            // cleanly instead of being ignored until the loop exhausts.
            await StartupRetry.ExecuteWithRetryAsync(
                "database-migration",
                async ct =>
                {
                    // A fresh DI scope per attempt, disposed when the attempt ends.
                    using var scope = app.Services.CreateScope();
                    var dbContext = scope.ServiceProvider.GetRequiredService();
                    await MigrationHelper.ApplyOrValidateMigrationsAsync(dbContext, isDevelopment, migrationLogger, ct);
                },
                maxAttempts: 8,
                initialDelay: TimeSpan.FromSeconds(2),
                migrationLogger,
                isTransient: StartupRetry.IsTransientDatabaseFault,
                cancellationToken: app.Lifetime.ApplicationStopping);
        }

        // Middleware pipeline
        app.UseWebSockets();
        app.UseRouting();
        app.UseAuthentication();
        app.UseAuthorization();
        app.UseAntiforgery();

        // Audit Log #23 (M4 Bundle D, T8): emit one InboundRequest/InboundAuthFailure
        // audit row per call into the inbound API. Placed AFTER UseAuthentication/
        // UseAuthorization so any HttpContext.User the framework populates is in
        // place, and scoped to the /api/ prefix so it never observes the Central UI,
        // Management API, SignalR hubs, or health endpoints. The endpoint handler
        // is responsible for stashing the resolved API key name on
        // HttpContext.Items (see AuditWriteMiddleware.AuditActorItemKey) AFTER its
        // in-handler API key validation succeeds.
        // InboundAPI-025: scope the audit middleware to the inbound API method
        // route (/api/{methodName}) and explicitly exclude the management/audit
        // sub-trees that share the /api prefix. Without these exclusions the
        // middleware would emit a spurious ApiInbound audit row for every
        // /api/audit/query and /api/audit/export call (and would treat audit-log
        // reads as inbound script invocations — recursive write-on-read). The
        // POST-only filter rules out the GET routes on /api/audit, /api/centralui,
        // /api/script-analysis; the explicit prefix exclusions are a belt-and-braces
        // guard in case a POST route is ever added under those prefixes.
        app.UseWhen(
            ctx => ctx.Request.Path.StartsWithSegments("/api")
                && !ctx.Request.Path.StartsWithSegments("/api/audit")
                && !ctx.Request.Path.StartsWithSegments("/api/centralui")
                && !ctx.Request.Path.StartsWithSegments("/api/script-analysis")
                && !ctx.Request.Path.StartsWithSegments("/api/management")
                && HttpMethods.IsPost(ctx.Request.Method),
            branch => branch.UseAuditWriteMiddleware());

        // WP-12: Map readiness endpoint — returns 503 until ready, 200 when ready.
        // REQ-HOST-4a defines readiness as cluster membership + DB connectivity,
        // explicitly NOT cluster leadership. The leader-only "active-node" check is
        // excluded here so a fully operational standby central node reports ready;
        // leadership is reported separately on /health/active.
        app.MapHealthChecks("/health/ready", new HealthCheckOptions
        {
            Predicate = check => check.Name != "active-node",
            ResponseWriter = UIResponseWriter.WriteHealthCheckUIResponse
        });

        // Active node endpoint — returns 200 only on the cluster leader; used by Traefik for routing
        app.MapHealthChecks("/health/active", new HealthCheckOptions
        {
            Predicate = check => check.Name == "active-node",
            ResponseWriter = UIResponseWriter.WriteHealthCheckUIResponse
        });

        app.MapStaticAssets();
        app.MapCentralUI();
        app.MapInboundAPI();
        app.MapManagementAPI();

        // Audit Log #23 (M8): CLI-facing /api/audit/{query,export} routes. Same
        // Basic-Auth + LDAP mechanism as /management; gated on the OperationalAudit
        // / AuditExport role sets.
        app.MapAuditAPI();
        app.MapHub("/hubs/debug-stream");

        // Compile and register all Inbound API method scripts at startup
        using (var scope = app.Services.CreateScope())
        {
            // The repository is resolved from the scope; the executor from the root provider.
            var apiRepo = scope.ServiceProvider.GetRequiredService();
            var executor = app.Services.GetRequiredService();
            var methods = await apiRepo.GetAllApiMethodsAsync();
            foreach (var method in methods)
            {
                executor.CompileAndRegister(method);
            }
        }

        await app.RunAsync();
    }
    else if (nodeRole.Equals("Site", StringComparison.OrdinalIgnoreCase))
    {
        var builder = WebApplication.CreateBuilder(args);
        builder.Configuration.AddConfiguration(configuration);

        // WP-14: Serilog
        builder.Host.UseSerilog();

        // WP-17: Windows Service support (no-op when not running as a Windows Service)
        builder.Host.UseWindowsService();

        // Read GrpcPort from config (NodeOptions already has default 8083)
        var grpcPort = configuration.GetValue("ScadaBridge:Node:GrpcPort", 8083);

        // Configure Kestrel for HTTP/2 only on the gRPC port
        builder.WebHost.ConfigureKestrel(options =>
        {
            options.ListenAnyIP(grpcPort, listenOptions =>
            {
                listenOptions.Protocols = Microsoft.AspNetCore.Server.Kestrel.Core.HttpProtocols.Http2;
            });
        });

        // gRPC server registration
        builder.Services.AddGrpc();
        builder.Services.AddSingleton();

        // Existing site service registrations
        SiteServiceRegistration.Configure(builder.Services, builder.Configuration);

        var app = builder.Build();

        // Map gRPC service — resolves the singleton SiteStreamGrpcServer from DI
        app.MapGrpcService();

        // Host-017 / REQ-HOST-7: site-shutdown ordering. ApplicationStopping
        // fires BEFORE IHostedService.StopAsync runs, so the gRPC server
        // refuses new streams (Unavailable) and cancels every active stream
        // here — clients observe a clean Cancelled and reconnect — and only
        // THEN does AkkaHostedService run CoordinatedShutdown and tear down
        // actors. Without this hand-off, in-flight streams go silent and only
        // time out via gRPC keepalive (~25 s), violating the documented
        // four-step sequence.
        var siteLifetime = app.Services.GetRequiredService();
        var siteGrpcServer = app.Services.GetRequiredService();
        siteLifetime.ApplicationStopping.Register(() => siteGrpcServer.CancelAllStreams());

        await app.RunAsync();
    }
    else
    {
        throw new InvalidOperationException($"Unknown role: {nodeRole}. Must be 'Central' or 'Site'.");
    }
}
// HostAbortedException is raised when design-time tooling (e.g. `dotnet ef`) stops
// the host after building it. That is an intentional abort, not a crash, so it is
// left to propagate without a misleading Fatal log entry.
catch (Exception ex) when (ex is not Microsoft.Extensions.Hosting.HostAbortedException)
{
    Log.Fatal(ex, "ScadaBridge host terminated unexpectedly");
    throw;
}
finally
{
    // Flush buffered log events on every exit path, including the rethrow above.
    await Log.CloseAndFlushAsync();
}

/// <summary>
/// Exposes the auto-generated Program class for test infrastructure (e.g. WebApplicationFactory).
/// </summary>
public partial class Program { }