Files
ScadaBridge/src/ZB.MOM.WW.ScadaBridge.Host/Program.cs
T

361 lines
19 KiB
C#

using ZB.MOM.WW.Health;
using ZB.MOM.WW.Health.Akka;
using ZB.MOM.WW.Health.EntityFrameworkCore;
using ZB.MOM.WW.ScadaBridge.AuditLog;
using ZB.MOM.WW.ScadaBridge.CentralUI;
using ZB.MOM.WW.ScadaBridge.ClusterInfrastructure;
using ZB.MOM.WW.ScadaBridge.Communication;
using ZB.MOM.WW.ScadaBridge.ConfigurationDatabase;
using ZB.MOM.WW.ScadaBridge.DeploymentManager;
using ZB.MOM.WW.ScadaBridge.ExternalSystemGateway;
using ZB.MOM.WW.ScadaBridge.HealthMonitoring;
using ZB.MOM.WW.ScadaBridge.Host;
using ZB.MOM.WW.ScadaBridge.Host.Actors;
using ZB.MOM.WW.ScadaBridge.Host.Health;
using ZB.MOM.WW.ScadaBridge.InboundAPI;
using ZB.MOM.WW.ScadaBridge.InboundAPI.Middleware;
using ZB.MOM.WW.ScadaBridge.ManagementService;
using ZB.MOM.WW.ScadaBridge.NotificationOutbox;
using ZB.MOM.WW.ScadaBridge.NotificationService;
using ZB.MOM.WW.ScadaBridge.Security;
using ZB.MOM.WW.ScadaBridge.SiteCallAudit;
using ZB.MOM.WW.ScadaBridge.TemplateEngine;
using ZB.MOM.WW.ScadaBridge.Transport;
using ZB.MOM.WW.Telemetry;
using Serilog;
// Role-specific configuration selection.
// SCADABRIDGE_CONFIG names the role-specific settings file (Central or Site) layered on top
// of appsettings.json. DOTNET_ENVIRONMENT/ASPNETCORE_ENVIRONMENT stay as "Development" for
// dev tooling (static assets, EF migrations, etc.), so DOTNET_ENVIRONMENT is consulted only
// as a fallback and "Production" is the last resort.
// A variable that is set but blank (for example `SCADABRIDGE_CONFIG=` in an env file) is
// treated as unset. Without that, the blank value would win the fallback chain and resolve
// to "appsettings..json", which is optional and would therefore be skipped silently.
var scadabridgeConfig = new[] { "SCADABRIDGE_CONFIG", "DOTNET_ENVIRONMENT" }
    .Select(variableName => Environment.GetEnvironmentVariable(variableName))
    .FirstOrDefault(candidate => !string.IsNullOrWhiteSpace(candidate))
    ?? "Production";

// Sources are layered in this order: base file (required), role file (optional),
// environment variables, then command-line arguments.
var configuration = new ConfigurationBuilder()
    .AddJsonFile("appsettings.json", optional: false)
    .AddJsonFile($"appsettings.{scadabridgeConfig}.json", optional: true)
    .AddEnvironmentVariables()
    .AddCommandLine(args)
    .Build();
// WP-11: validate the complete configuration up front so a bad setup fails before any
// DI container or actor system is created.
StartupValidator.Validate(configuration);

// Node identity, used below to enrich every Serilog event.
var nodeSection = configuration.GetSection("ScadaBridge:Node");
var nodeRole = nodeSection["Role"]!;
var nodeHostname = nodeSection["NodeHostname"] ?? "unknown";
var siteId = nodeSection["SiteId"] ?? "central";

// WP-14: Serilog structured logging.
// Host-011: the minimum level comes from ScadaBridge:Logging:MinimumLevel (LoggingOptions).
// Host-014: the console and file sinks live in the `Serilog` configuration section
// (appsettings.json) and are applied through ReadFrom.Configuration inside the factory, so
// the sink set, output template, file path and rolling interval are configuration-driven
// (REQ-HOST-8) rather than hard-coded in this file.
var loggerConfiguration = ZB.MOM.WW.ScadaBridge.Host.LoggerConfigurationFactory
    .Build(configuration, nodeRole, siteId, nodeHostname);
Log.Logger = loggerConfiguration.CreateLogger();
try
{
// Startup banner: records the node role and hostname read from configuration.
Log.Information("Starting ScadaBridge host as {Role} on {Hostname}", nodeRole, nodeHostname);
if (nodeRole.Equals("Central", StringComparison.OrdinalIgnoreCase))
{
// Central role: build a web host and layer the pre-built, already validated configuration
// on top of the builder's own sources.
var builder = WebApplication.CreateBuilder(args);
builder.Configuration.AddConfiguration(configuration);

// WP-14: route host logging through Serilog.
// WP-17: Windows Service support (a no-op when not running as a Windows Service).
builder.Host
    .UseSerilog()
    .UseWindowsService();
// Component registration for the Central role. The registration order is unchanged.
var services = builder.Services;

// Shared components.
services.AddClusterInfrastructure();
services.AddCommunication();
services.AddHealthMonitoring();
services.AddCentralHealthAggregation();
services.AddExternalSystemGateway();
services.AddNotificationService();

// Central-only components.
// Notification Outbox: SMTP delivery is owned by central, and the Email adapter reuses the
// SMTP machinery registered by AddNotificationService() above. NotificationOutboxOptions is
// bound inside AddNotificationOutbox via BindConfiguration, so no Configure call is needed.
services.AddNotificationOutbox();

// Transport (#24): central-only bundle export/import pipeline. TransportOptions is bound
// from ScadaBridge:Transport via BindConfiguration, so no Configure call is needed either.
services.AddTransport();

// Audit Log (#23): the central node owns the AuditLogIngestActor singleton and
// IAuditLogRepository. The site writer chain is still registered (lazy singletons) but is
// never resolved on a central node.
services.AddAuditLog(builder.Configuration);

// #23 M6-T5 Bundle D: central-only hosted service that rolls pf_AuditLog_Month forward
// monthly. It depends on IPartitionMaintenance, which AddConfigurationDatabase registers
// further down in this block.
services.AddAuditLogCentralMaintenance(builder.Configuration);

// Site Call Audit (#22): the central node owns the SiteCallAuditActor singleton
// (M3 Bundle F). The extension currently registers nothing, because the actor Props are
// built inline in AkkaHostedService; the call is kept for symmetry with the other audit
// composition roots so future per-actor DI can land without touching Program.cs.
services.AddSiteCallAudit();

services.AddTemplateEngine();
services.AddDeploymentManager();
services.AddSecurity();
services.AddCentralUI();
services.AddInboundAPI();
services.AddManagementService();

// The configuration database connection string is mandatory for the Central role.
var configurationDbConnection = configuration["ScadaBridge:Database:ConfigurationDb"];
if (configurationDbConnection is null)
{
    throw new InvalidOperationException("ScadaBridge:Database:ConfigurationDb connection string is required for Central role.");
}

services.AddConfigurationDatabase(configurationDbConnection);
// WP-12: readiness-gating health checks built on the shared ZB.MOM.WW.Health probes.
// Check names and the ready/active tier split are preserved:
//   database, akka-cluster -> Ready tag  (/health/ready)
//   active-node            -> Active tag (/health/active)
// The Akka checks obtain the ActorSystem from DI through the transient bridge registered
// later in this file; DatabaseHealthCheck<TContext> resolves a scoped ScadaBridgeDbContext
// (no factory).
var healthChecks = builder.Services.AddHealthChecks();

healthChecks.AddTypeActivatedCheck<DatabaseHealthCheck<ScadaBridgeDbContext>>(
    "database",
    failureStatus: null,
    tags: new[] { ZbHealthTags.Ready });

healthChecks.AddTypeActivatedCheck<AkkaClusterHealthCheck>(
    "akka-cluster",
    failureStatus: null,
    tags: new[] { ZbHealthTags.Ready },
    args: AkkaClusterStatusPolicy.Default);

healthChecks.AddTypeActivatedCheck<ActiveNodeHealthCheck>(
    "active-node",
    failureStatus: null,
    tags: new[] { ZbHealthTags.Active });
// WP-13: Akka.NET is bootstrapped by a hosted service. It is registered once as a singleton
// and the hosted-service registration resolves that same instance.
builder.Services.AddSingleton<AkkaHostedService>();
builder.Services.AddHostedService(provider => provider.GetRequiredService<AkkaHostedService>());

// The shared ZB.MOM.WW.Health Akka checks resolve ActorSystem from DI, but ScadaBridge keeps
// the ActorSystem inside AkkaHostedService rather than as a DI singleton. Bridging it as a
// TRANSIENT registration makes every resolve re-read the current value: null while the node
// is warming up (checks report Degraded), the live system afterwards.
// This factory must NOT throw: GetService<ActorSystem>() has to yield null before start.
builder.Services.AddTransient<Akka.Actor.ActorSystem>(provider =>
{
    var akkaHost = provider.GetRequiredService<AkkaHostedService>();
    return akkaHost.ActorSystem!;
});

// InboundAPI-022: register the production IActiveNodeGate so standby-node gating is really
// enforced. InboundApiEndpointFilter consults IActiveNodeGate and falls back to "allow" when
// none is registered, which would leave the design's "central cluster only (active node)"
// guarantee unenforced in deployed binaries. The gate uses the same Akka cluster-leadership
// check as ActiveNodeHealthCheck, so the inbound API and the /health/active endpoint that
// Traefik routes against agree on which node is active.
builder.Services.AddSingleton<ZB.MOM.WW.ScadaBridge.InboundAPI.IActiveNodeGate, ActiveNodeGate>();

// Cluster node status provider scoped to the Central role. It feeds CentralHealthReportLoop
// so the central cluster shows up on /monitoring/health.
builder.Services.AddSingleton<IClusterNodeProvider>(provider =>
    new AkkaClusterNodeProvider(provider.GetRequiredService<AkkaHostedService>(), "Central"));
// Options binding: the options shared with the Site role first, then the Central-only
// sections, each bound from its own configuration section.
var centralConfiguration = builder.Configuration;
SiteServiceRegistration.BindSharedOptions(builder.Services, centralConfiguration);

builder.Services.Configure<SecurityOptions>(
    centralConfiguration.GetSection("ScadaBridge:Security"));
builder.Services.Configure<InboundApiOptions>(
    centralConfiguration.GetSection("ScadaBridge:InboundApi"));

var deploymentOptionsSection = centralConfiguration.GetSection(
    ZB.MOM.WW.ScadaBridge.DeploymentManager.ServiceCollectionExtensions.OptionsSection);
builder.Services.Configure<DeploymentManagerOptions>(deploymentOptionsSection);

// Registration is complete: build the application.
var app = builder.Build();
// Apply or validate database migrations. The test harness opts out by setting
// ScadaBridge:Database:SkipMigrations to "true" (case-insensitive).
var skipMigrations = string.Equals(
    configuration["ScadaBridge:Database:SkipMigrations"],
    "true",
    StringComparison.OrdinalIgnoreCase);

if (!skipMigrations)
{
    // Development mode is detected from the host environment or, failing that, directly
    // from ASPNETCORE_ENVIRONMENT. The flag is handed to MigrationHelper.
    // NOTE(review): presumably it selects "apply" versus "validate" — confirm in MigrationHelper.
    var developmentMode = app.Environment.IsDevelopment()
        || string.Equals(
            Environment.GetEnvironmentVariable("ASPNETCORE_ENVIRONMENT"),
            "Development",
            StringComparison.OrdinalIgnoreCase);

    var startupLogger = app.Services
        .GetRequiredService<ILoggerFactory>()
        .CreateLogger(typeof(MigrationHelper).FullName!);

    // Host-010: a database that is briefly unreachable at boot (for example app and DB
    // containers starting together) is tolerated with a bounded exponential backoff before
    // the failure becomes fatal.
    // Host-015: only connection-class (transient) faults are retried; a schema-version
    // mismatch is permanent and must fail fast on the first attempt.
    // Host-019: the host's ApplicationStopping token flows into both the migration call and
    // the delay between attempts, so a shutdown request during the retry window is honoured
    // instead of being ignored until the loop is exhausted.
    await StartupRetry.ExecuteWithRetryAsync(
        "database-migration",
        async stoppingToken =>
        {
            // A fresh scope per attempt, so each retry gets its own DbContext.
            using var migrationScope = app.Services.CreateScope();
            var configDb = migrationScope.ServiceProvider.GetRequiredService<ScadaBridgeDbContext>();
            await MigrationHelper.ApplyOrValidateMigrationsAsync(configDb, developmentMode, startupLogger, stoppingToken);
        },
        maxAttempts: 8,
        initialDelay: TimeSpan.FromSeconds(2),
        startupLogger,
        isTransient: StartupRetry.IsTransientDatabaseFault,
        cancellationToken: app.Lifetime.ApplicationStopping);
}
// Middleware pipeline.
app.UseWebSockets();
app.UseRouting();
app.UseAuthentication();
app.UseAuthorization();
app.UseAntiforgery();

// Audit Log #23 (M4 Bundle D, T8): one InboundRequest/InboundAuthFailure audit row is
// emitted per call into the inbound API. The branch is added AFTER UseAuthentication /
// UseAuthorization so any HttpContext.User populated by the framework is already in place.
// The endpoint handler stashes the resolved API key name on HttpContext.Items
// (see AuditWriteMiddleware.AuditActorItemKey) AFTER its in-handler API key validation
// succeeds.
//
// InboundAPI-025: the predicate limits the middleware to the inbound API method route
// (/api/{methodName}) and excludes the management/audit sub-trees that share the /api
// prefix, so it never observes the Central UI, Management API, SignalR hubs or health
// endpoints. Without the exclusions every /api/audit/query and /api/audit/export call would
// produce a spurious ApiInbound audit row, and audit-log reads would be treated as inbound
// script invocations (a recursive write-on-read). The POST-only rule already filters out the
// GET routes under /api/audit, /api/centralui and /api/script-analysis; the explicit prefix
// exclusions additionally cover any POST route added under those prefixes later.
static bool IsAuditedInboundApiRequest(Microsoft.AspNetCore.Http.HttpContext ctx)
{
    if (!HttpMethods.IsPost(ctx.Request.Method))
    {
        return false;
    }

    var path = ctx.Request.Path;
    if (!path.StartsWithSegments("/api"))
    {
        return false;
    }

    var isExcludedSubtree =
        path.StartsWithSegments("/api/audit")
        || path.StartsWithSegments("/api/centralui")
        || path.StartsWithSegments("/api/script-analysis")
        || path.StartsWithSegments("/api/management");

    return !isExcludedSubtree;
}

app.UseWhen(
    IsAuditedInboundApiRequest,
    auditBranch => auditBranch.UseAuditWriteMiddleware());
// Endpoint mapping for the Central role. The Map* calls below are kept in their original
// order.
//
// WP-12: Map the canonical three-tier health endpoints in one call:
// /health/ready — Ready-tagged checks (database + akka-cluster). REQ-HOST-4a defines
// readiness as cluster membership + DB connectivity, explicitly NOT
// cluster leadership, so the leader-only active-node check is excluded
// (a fully operational standby central node still reports ready).
// /health/active — Active-tagged check (active-node); returns 200 only on the cluster
// leader; used by Traefik for routing.
// /healthz — bare process liveness; runs no checks (always 200 while the process
// is up). New tier added by adopting the shared library.
// All three are anonymous and use the canonical ZbHealthWriter JSON output.
app.MapZbHealth();
// Observability — mount the always-on Prometheus /metrics scrape endpoint.
// AddZbTelemetry (in SiteServiceRegistration.BindSharedOptions) wires the OTel
// Resource + standard instrumentation + Prometheus exporter; this call exposes them.
// Requires endpoint routing (app.UseRouting() above).
app.MapZbMetrics();
// Static web assets, the Central UI (rooted at the Host's App component), and the
// inbound and management API endpoints.
app.MapStaticAssets();
app.MapCentralUI<ZB.MOM.WW.ScadaBridge.Host.Components.App>();
app.MapInboundAPI();
app.MapManagementAPI();
// Audit Log #23 (M8): CLI-facing /api/audit/{query,export} routes. Same
// Basic-Auth + LDAP mechanism as /management; gated on the OperationalAudit
// / AuditExport role sets.
app.MapAuditAPI();
// SignalR hub for the debug stream, served at /hubs/debug-stream.
app.MapHub<ZB.MOM.WW.ScadaBridge.ManagementService.DebugStreamHub>("/hubs/debug-stream");
// Compile and register every Inbound API method script before the host starts serving.
// The repository is resolved from a short-lived scope that is disposed before RunAsync;
// the script executor is resolved from the root provider.
{
    using var startupScope = app.Services.CreateScope();
    var methodRepository = startupScope.ServiceProvider
        .GetRequiredService<ZB.MOM.WW.ScadaBridge.Commons.Interfaces.Repositories.IInboundApiRepository>();
    var scriptExecutor = app.Services
        .GetRequiredService<ZB.MOM.WW.ScadaBridge.InboundAPI.InboundScriptExecutor>();

    var apiMethods = await methodRepository.GetAllApiMethodsAsync();
    foreach (var apiMethod in apiMethods)
    {
        scriptExecutor.CompileAndRegister(apiMethod);
    }
}

// Run the Central host until shutdown.
await app.RunAsync();
}
else if (nodeRole.Equals("Site", StringComparison.OrdinalIgnoreCase))
{
// Site role: build a web host and layer the pre-built, already validated configuration on
// top of the builder's own sources.
var builder = WebApplication.CreateBuilder(args);
builder.Configuration.AddConfiguration(configuration);

// WP-14: route host logging through Serilog.
// WP-17: Windows Service support (a no-op when not running as a Windows Service).
builder.Host
    .UseSerilog()
    .UseWindowsService();

// gRPC port from configuration; 8083 matches the default NodeOptions already carries.
var grpcPort = configuration.GetValue<int>("ScadaBridge:Node:GrpcPort", 8083);

// Kestrel listens on every interface on the gRPC port, HTTP/2 only.
builder.WebHost.ConfigureKestrel(kestrel =>
    kestrel.ListenAnyIP(
        grpcPort,
        endpoint => endpoint.Protocols = Microsoft.AspNetCore.Server.Kestrel.Core.HttpProtocols.Http2));

// gRPC server registration; the stream server is a singleton so that the same instance can
// be mapped as the gRPC service and reached at shutdown.
var siteServices = builder.Services;
siteServices.AddGrpc();
siteServices.AddSingleton<ZB.MOM.WW.ScadaBridge.Communication.Grpc.SiteStreamGrpcServer>();

// Existing site service registrations.
SiteServiceRegistration.Configure(siteServices, builder.Configuration);

var app = builder.Build();
// Endpoint routing middleware. Both the gRPC service mapping and the /metrics scrape
// endpoint run on endpoint routing, so UseRouting() has to precede the Map* calls on the
// site role.
app.UseRouting();

// Observability: mount the always-on Prometheus /metrics scrape endpoint on the site node
// as well. AddZbTelemetry (SiteServiceRegistration.Configure -> BindSharedOptions) wires the
// OTel Resource, the standard instrumentation and the Prometheus exporter; this exposes them.
app.MapZbMetrics();

// Map the gRPC service; the singleton SiteStreamGrpcServer is resolved from DI.
app.MapGrpcService<ZB.MOM.WW.ScadaBridge.Communication.Grpc.SiteStreamGrpcServer>();

// Host-017 / REQ-HOST-7: site-shutdown ordering. ApplicationStopping fires BEFORE
// IHostedService.StopAsync runs, so this callback makes the gRPC server refuse new streams
// (Unavailable) and cancel every active stream first: clients observe a clean Cancelled and
// reconnect. Only THEN does AkkaHostedService run CoordinatedShutdown and tear down actors.
// Without this hand-off, in-flight streams go silent and only time out via gRPC keepalive
// (~25 s), which violates the documented four-step sequence.
var streamServer = app.Services
    .GetRequiredService<ZB.MOM.WW.ScadaBridge.Communication.Grpc.SiteStreamGrpcServer>();
app.Lifetime.ApplicationStopping.Register(() => streamServer.CancelAllStreams());

// Run the Site host until shutdown.
await app.RunAsync();
}
// Any role other than Central or Site (both compared case-insensitively above) is rejected.
else
{
throw new InvalidOperationException($"Unknown role: {nodeRole}. Must be 'Central' or 'Site'.");
}
}
// Last-chance handler: log the failure at Fatal and rethrow so the process still ends with
// the original exception.
// HostAbortedException is excluded by the filter. Design-time tooling that only needs the
// built host (for example the EF Core tools) throws it on purpose to stop the entry point
// after Build(); it is not a crash, so it must not be reported as "terminated unexpectedly".
// It still propagates untouched, and the finally block still runs.
catch (Exception ex) when (ex is not Microsoft.Extensions.Hosting.HostAbortedException)
{
    Log.Fatal(ex, "ScadaBridge host terminated unexpectedly");
    throw;
}
finally
{
    // Flush buffered log events on every exit path, normal or faulted.
    await Log.CloseAndFlushAsync();
}
/// <summary>
/// Public partial declaration of the compiler-generated <c>Program</c> type that hosts the
/// top-level statements, so test infrastructure (e.g. <c>WebApplicationFactory</c>) can
/// reference it as an entry-point marker.
/// </summary>
public partial class Program
{
}