Files
ScadaBridge/src/ZB.MOM.WW.ScadaBridge.Communication/CommunicationOptions.cs
T
Joseph Doherty 06f2df4f89 feat(deploy): wire periodic PendingDeployment purge + SQL Server same-id re-stage test
Notify-and-fetch follow-ups:

- PendingDeploymentPurgeActor: a central cluster singleton (not
  readiness-gated, best-effort) that sweeps expired PendingDeployment
  staging rows on CommunicationOptions.PendingDeploymentPurgeInterval
  (default 1h). Modeled on the kpi-history-recorder pattern: self-scheduling
  timer, per-tick DI scope -> IDeploymentManagerRepository, continue-on-error.
  Wired in AkkaHostedService.RegisterCentralActors (manager + proxy + drain);
  resolves the deferred TODO in DeploymentService. Correctness never depends
  on it (supersession bounds rows to <=1/instance; the fetch endpoint enforces
  the TTL), so it is deliberately absent from RequiredSingletonsHealthCheck.

- SQL Server integration test for StagePendingIfAbsentAsync re-staging an
  instance's OWN DeploymentId over an expired row against the real UNIQUE
  index on DeploymentId — confirms EF orders DELETE before INSERT in one
  SaveChanges (SQLite's constraint timing differs from SQL Server's). Plus
  a same-instance supersession variant on real SQL Server.

Tests: 2 TestKit actor tests + 2 SQL Server integration tests (both ran
green against the infra MSSQL container); 235 Communication + 15
PendingDeployment tests pass; Host builds 0 warnings.
2026-06-26 23:19:29 -04:00

84 lines
4.3 KiB
C#

namespace ZB.MOM.WW.ScadaBridge.Communication;
/// <summary>
/// Settings that govern central-site communication: per-pattern request timeouts,
/// central contact points, gRPC streaming limits, Akka.Remote transport heartbeat
/// values, and the notify-and-fetch deployment staging parameters.
/// </summary>
public class CommunicationOptions
{
    // --- Per-pattern request timeouts ---

    /// <summary>
    /// How long to wait on a deployment command. Typically the longest of the
    /// timeouts because of the apply logic involved.
    /// </summary>
    public TimeSpan DeploymentTimeout { get; set; } = TimeSpan.FromMinutes(2);

    /// <summary>How long to wait on a lifecycle command (disable, enable, delete).</summary>
    public TimeSpan LifecycleTimeout { get; set; } = TimeSpan.FromSeconds(30);

    /// <summary>How long to wait on an artifact deployment command.</summary>
    public TimeSpan ArtifactDeploymentTimeout { get; set; } = TimeSpan.FromMinutes(1);

    /// <summary>How long to wait on a remote query request (event logs, parked messages).</summary>
    public TimeSpan QueryTimeout { get; set; } = TimeSpan.FromSeconds(30);

    /// <summary>How long to wait while routing an integration call.</summary>
    public TimeSpan IntegrationTimeout { get; set; } = TimeSpan.FromSeconds(30);

    /// <summary>How long to wait on the debug view subscribe/unsubscribe handshake.</summary>
    public TimeSpan DebugViewTimeout { get; set; } = TimeSpan.FromSeconds(10);

    /// <summary>
    /// How long to wait for a health report acknowledgement. Health reports are
    /// fire-and-forget, but the wait is still bounded by this value.
    /// </summary>
    public TimeSpan HealthReportTimeout { get; set; } = TimeSpan.FromSeconds(10);

    /// <summary>
    /// Notification Outbox: how long to wait when forwarding a buffered notification
    /// to central and awaiting its <c>NotificationSubmitAck</c>. Hitting this timeout
    /// counts as a transient failure: the Store-and-Forward engine leaves the message
    /// buffered and retries the forward at the fixed retry interval.
    /// </summary>
    public TimeSpan NotificationForwardTimeout { get; set; } = TimeSpan.FromSeconds(30);

    // --- Reaching central ---

    /// <summary>
    /// Addresses of the central cluster's contact points
    /// (e.g. "akka.tcp://scadabridge@central-a:8081"). Site nodes use these to
    /// create a ClusterClient for reaching central.
    /// </summary>
    public List<string> CentralContactPoints { get; set; } = new List<string>();

    // --- gRPC streaming ---

    /// <summary>Interval between gRPC keepalive pings on streaming connections.</summary>
    public TimeSpan GrpcKeepAlivePingDelay { get; set; } = TimeSpan.FromSeconds(15);

    /// <summary>
    /// gRPC keepalive ping timeout: a stream is considered dead when no response
    /// arrives within this period.
    /// </summary>
    public TimeSpan GrpcKeepAlivePingTimeout { get; set; } = TimeSpan.FromSeconds(10);

    /// <summary>
    /// Longest a single gRPC stream may live before the server forces it to be
    /// re-established.
    /// </summary>
    public TimeSpan GrpcMaxStreamLifetime { get; set; } = TimeSpan.FromHours(4);

    /// <summary>Upper limit on concurrent gRPC streaming subscriptions per site node.</summary>
    public int GrpcMaxConcurrentStreams { get; set; } = 100;

    // --- Akka.Remote transport ---

    /// <summary>Heartbeat interval of the Akka.Remote transport.</summary>
    public TimeSpan TransportHeartbeatInterval { get; set; } = TimeSpan.FromSeconds(5);

    /// <summary>Failure detection threshold of the Akka.Remote transport.</summary>
    public TimeSpan TransportFailureThreshold { get; set; } = TimeSpan.FromSeconds(15);

    // --- Notify-and-fetch deployment staging ---

    /// <summary>
    /// Base URL (Traefik/LB) that the SITE uses to fetch deploy configs from central,
    /// e.g. "https://central.example:9000". It travels in RefreshDeploymentCommand, so
    /// sites need no new standing config. An empty value disables the notify-and-fetch
    /// fallback.
    /// </summary>
    public string CentralFetchBaseUrl { get; set; } = string.Empty;

    /// <summary>
    /// Validity period of a staged PendingDeployment and of its fetch token. It must
    /// comfortably cover the fetches of both site nodes within one deploy window.
    /// </summary>
    public TimeSpan PendingDeploymentTtl { get; set; } = TimeSpan.FromMinutes(5);

    /// <summary>
    /// Interval at which the central <c>PendingDeploymentPurgeActor</c> singleton
    /// reclaims expired (TTL-elapsed) PendingDeployment staging rows. This is
    /// best-effort hygiene only: supersession bounds pending rows to at most one per
    /// instance and the config-fetch endpoint already enforces the TTL, so the purge
    /// merely sweeps rows left behind by instances that are deployed once and never
    /// re-deployed. The default of 1 hour is far larger than
    /// <see cref="PendingDeploymentTtl"/>.
    /// </summary>
    public TimeSpan PendingDeploymentPurgeInterval { get; set; } = TimeSpan.FromHours(1);
}