using System.Collections.Concurrent;
using System.Diagnostics;
using Mbproxy.Admin;
using Mbproxy.Bcd;
using Mbproxy.Configuration;
using Mbproxy.Options;
using Mbproxy.Proxy.Cache;
using Mbproxy.Proxy.Multiplexing;
using Mbproxy.Proxy.Supervision;
using Microsoft.Extensions.Options;
namespace Mbproxy.Proxy;
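/// <summary>
/// <see cref="BackgroundService"/> that owns all <see cref="PlcListenerSupervisor"/> instances.
/// <para>
/// Startup posture (matches design doc "eager, continue on per-port failure"):
/// </para>
/// <list type="number">
/// <item>Enumerate the configured PLCs and build one supervisor per PLC.</item>
/// <item>Start all supervisors in parallel. Each supervisor attempts to bind immediately
/// and enters the Polly recovery loop if the bind fails.</item>
/// <item>After all supervisors have completed their first bind attempt (reached
/// <c>Bound</c> or <c>Recovering</c>),
/// log <c>mbproxy.startup.ready</c> with bound/configured counts.</item>
/// </list>
/// <para>
/// Phase 06: passes the supervisor dictionary to <see cref="ConfigReconciler"/> (via <c>Attach</c>)
/// once the initial supervisors are built, so hot-reload changes are applied by the reconciler.
/// </para>
/// <para>
/// Stop: stops all supervisors in parallel, bounded by the configured
/// <c>GracefulShutdownTimeoutMs</c>.
/// </para>
/// </summary>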
internal sealed partial class ProxyWorker : BackgroundService
{
// Live configuration. CurrentValue is re-read wherever a hot-reloaded value must be
// honoured (coalescing accessor, shutdown timeout).
private readonly IOptionsMonitor _options;
// Handed unchanged to every PlcListenerSupervisor built in ExecuteAsync.
private readonly IPduPipeline _pipeline;
// Logger for this worker's own startup/shutdown messages.
private readonly ILogger _logger;
// Used to create the per-PLC / per-component loggers passed to supervisors and pipelines.
private readonly ILoggerFactory _loggerFactory;
// Receives the supervisor dictionary via Attach() in ExecuteAsync.
private readonly ConfigReconciler _reconciler;
// Phase 12 (W1.5) — admin endpoint is no longer IHostedService; ProxyWorker drives its
// lifecycle directly so the design's "drain THEN stop admin" ordering is honoured.
//
// Resolved LAZILY (in ExecuteAsync) rather than in the constructor because the DI graph
// is circular: AdminEndpointHost → StatusSnapshotBuilder → ProxyWorker. A constructor
// GetService() lookup of AdminEndpointHost during ProxyWorker's own construction returns
// null silently. Lazy resolution sidesteps the cycle — by the time ExecuteAsync runs the DI
// container is fully built.
private readonly IServiceProvider _services;
// Null until ExecuteAsync resolves it; stays null when no admin endpoint is registered.
private AdminEndpointHost? _admin;
// Phase 06: supervisors are now managed jointly by ProxyWorker (initial bootstrap)
// and ConfigReconciler (subsequent hot-reload changes). The dictionary is shared
// via ConfigReconciler.Attach() once the initial supervisors have been built
// (ExecuteAsync calls Attach before it starts them).
//
// Phase 12 (W2.3) — ConcurrentDictionary because ConfigReconciler mutates this from
// parallel Task.WhenAll continuations (Add/Remove/Restart paths). The outer Apply is
// serialised by a semaphore but the inner per-PLC tasks run concurrently. Status-page
// reads via IReadOnlyDictionary still work without locking.
// Keyed by PLC name (ordinal comparison).
private readonly ConcurrentDictionary _supervisors =
new(StringComparer.Ordinal);
/// <summary>
/// Read-only view of the live supervisor dictionary. Consumed by Phase 07's
/// <c>StatusSnapshotBuilder</c> to enumerate per-PLC state.
/// The caller should read this on the status-page path only (not the hot path).
/// </summary>
internal IReadOnlyDictionary Supervisors => _supervisors;
/// <summary>
/// Captures the injected dependencies and does nothing else. The admin endpoint is
/// intentionally NOT resolved here: it is looked up lazily in ExecuteAsync (see the
/// comment on the <c>_services</c> field for the circular-DI reason).
/// </summary>
/// <param name="options">Live options monitor for the proxy configuration.</param>
/// <param name="pipeline">PDU pipeline shared by every supervisor.</param>
/// <param name="logger">Logger for this worker.</param>
/// <param name="loggerFactory">Factory for per-PLC and per-component loggers.</param>
/// <param name="reconciler">Hot-reload reconciler that co-owns the supervisor dictionary.</param>
/// <param name="services">Service provider used for the lazy admin-endpoint lookup.</param>
public ProxyWorker(
    IOptionsMonitor options,
    IPduPipeline pipeline,
    ILogger logger,
    ILoggerFactory loggerFactory,
    ConfigReconciler reconciler,
    IServiceProvider services)
{
    // Configuration and request-path collaborators.
    (_options, _pipeline, _reconciler) = (options, pipeline, reconciler);

    // Logging.
    (_logger, _loggerFactory) = (logger, loggerFactory);

    // Phase 12 (W1.5) — admin endpoint resolved lazily in ExecuteAsync (see field comment).
    _services = services;
}
/// <summary>
/// Bootstraps the proxy: builds one BCD tag map / context per configured PLC, builds a
/// supervisor for every PLC whose tag map is valid, attaches the reconciler, starts the
/// supervisors in parallel, waits (up to 30 s) for each first bind attempt, logs
/// <c>mbproxy.startup.ready</c>, starts the admin endpoint, and then idles until the
/// host signals stop.
/// </summary>
/// <param name="stoppingToken">Cancelled by the host when the service is stopping.</param>
/// <remarks>
/// If the host begins stopping while startup is still in progress the method returns
/// without logging "ready" and without starting the admin endpoint. When no supervisor
/// could be built the admin endpoint is still started, because the attached reconciler
/// can add PLCs later through hot-reload.
/// </remarks>
protected override async Task ExecuteAsync(CancellationToken stoppingToken)
{
    var opts = _options.CurrentValue;
    int plcsConfigured = opts.Plcs.Count;

    // ── 1. Build per-PLC BCD tag maps ────────────────────────────────────────────
    var plcContexts = new Dictionary(opts.Plcs.Count, StringComparer.Ordinal);
    foreach (var plc in opts.Plcs)
    {
        var result = BcdTagMapBuilder.Build(opts.BcdTags, plc.BcdTags, plc.DefaultCacheTtlMs);

        foreach (var warn in result.Warnings)
        {
            _logger.LogWarning("[{Plc}] BCD tag map warning: {Message}", plc.Name, warn.Message);
        }

        if (result.Errors.Count > 0)
        {
            foreach (var err in result.Errors)
            {
                _logger.LogError("[{Plc}] BCD tag map error ({Kind}): {Message}",
                    plc.Name, err.Kind, err.Message);
            }

            _logger.LogError("Skipping listener for PLC '{Plc}' due to BCD tag map errors.", plc.Name);
            continue;
        }

        // Phase 11 — construct a per-PLC response cache only when at least one
        // resolved tag opts in (CacheTtlMs > 0). Skipping cache construction for a
        // PLC with no cacheable tags keeps the no-cache path free of the eviction
        // timer and the per-call resolution cost, preserving "default behaviour =
        // Phase 10 unchanged" when no operator has opted any tag in.
        var cache = HasAnyCacheableTag(result.Map)
            ? new ResponseCache(opts.Cache.MaxEntriesPerPlc, opts.Cache.EvictionIntervalMs)
            : null;

        plcContexts[plc.Name] = new PerPlcContext
        {
            PlcName = plc.Name,
            TagMap = result.Map,
            Counters = new ProxyCounters(),
            Logger = _loggerFactory.CreateLogger($"Mbproxy.Proxy.BcdRewriter.{plc.Name}"),
            Cache = cache,
        };
    }

    // ── 2. Build the shared Polly pipeline once ──────────────────────────────────
    // The backend-connect pipeline is built once and reused by every supervisor; the
    // listener-recovery pipeline is built per supervisor in step 3.
    var resilienceOpts = opts.Resilience;
    var backendPipeline = PolicyFactory.BuildBackendConnect(
        resilienceOpts.BackendConnect,
        _loggerFactory.CreateLogger("Mbproxy.Proxy.BackendConnect"));

    // Phase 10 / Phase 12 (W2.1) — live accessor for ReadCoalescingOptions so a
    // hot-reload of `Mbproxy.Resilience.ReadCoalescing.Enabled` propagates to the
    // multiplexer's per-PDU coalescing decision. The accessor is stateless (it re-reads
    // CurrentValue on every call), so one instance serves every supervisor built here
    // and every supervisor the reconciler builds later (add/restart paths).
    Func coalescingAccessor =
        () => _options.CurrentValue.Resilience.ReadCoalescing;

    // ── 3. Build supervisors ──────────────────────────────────────────────────────
    foreach (var plc in opts.Plcs)
    {
        if (!plcContexts.TryGetValue(plc.Name, out var perPlcContext))
        {
            continue; // BCD map failed — skip this PLC.
        }

        // Each supervisor gets its own recovery pipeline (with its own logger scope).
        var recoveryPipeline = PolicyFactory.BuildListenerRecovery(
            resilienceOpts.ListenerRecovery,
            _loggerFactory.CreateLogger($"Mbproxy.Proxy.ListenerRecovery.{plc.Name}"));

        var supervisor = new PlcListenerSupervisor(
            plc,
            opts.Connection,
            _pipeline,
            _loggerFactory.CreateLogger(),
            _loggerFactory.CreateLogger(),
            _loggerFactory.CreateLogger($"Mbproxy.Proxy.UpstreamPipe.{plc.Name}"),
            perPlcContext,
            recoveryPipeline,
            _loggerFactory.CreateLogger(),
            backendPipeline,
            coalescingAccessor);

        _supervisors[plc.Name] = supervisor;
    }

    // ── Phase 06: wire reconciler BEFORE starting supervisors ─────────────────
    // Attach hands the reconciler the authoritative supervisor dictionary and the
    // initial options snapshot. The reconciler won't process OnChange events until
    // after this call — the brief window between Attach and first supervisor start
    // is safe because the channel signal only enqueues; apply runs asynchronously.
    _reconciler.Attach(_supervisors, opts, coalescingAccessor);

    if (_supervisors.Count > 0)
    {
        // ── 4. Start all supervisors in parallel ──────────────────────────────────
        var startTasks = _supervisors.Values
            .Select(s => s.StartAsync(stoppingToken))
            .ToArray();
        await Task.WhenAll(startTasks).ConfigureAwait(false);

        // ── 5. Wait for every supervisor to complete its first bind attempt ───────
        // "Ready" = every supervisor has transitioned out of Stopped (i.e. reached
        // Bound or Recovering from its first attempt).
        using var readyCts = new CancellationTokenSource(TimeSpan.FromSeconds(30));
        using var readyLinked = CancellationTokenSource.CreateLinkedTokenSource(
            readyCts.Token, stoppingToken);
        var waitTasks = _supervisors.Values
            .Select(s => s.WaitForInitialBindAttemptAsync(readyLinked.Token))
            .ToArray();
        try
        {
            await Task.WhenAll(waitTasks).ConfigureAwait(false);
        }
        catch (OperationCanceledException)
        {
            // Either the 30 s deadline fired or the service is stopping.
        }

        // If the host is stopping, do not announce readiness or start the admin
        // endpoint with an already-cancelled token (that would only produce a spurious
        // "failed to start" error). StopAsync takes over from here.
        if (stoppingToken.IsCancellationRequested)
        {
            return;
        }
    }

    // With zero supervisors this evaluates to 0, which matches the "nothing bound" log.
    int boundCount = _supervisors.Values.Count(s => s.Snapshot().State == SupervisorState.Bound);
    LogStartupReady(_logger, boundCount, plcsConfigured);

    // Phase 12 (W1.5) — start the admin endpoint AFTER listeners are bound so the
    // status page can never observe the service in a "no PLCs configured yet" state.
    // The admin endpoint is no longer registered as IHostedService (the host's reverse
    // stop order would tear it down BEFORE drain). ProxyWorker drives both ends.
    //
    // It is started even when no supervisor was built: the reconciler is attached and
    // may add PLCs on hot-reload, and the status page is how an operator observes that.
    //
    // Resolution happens here, not in the constructor — the DI graph is circular
    // (admin → StatusSnapshotBuilder → ProxyWorker) and a constructor-time lookup
    // returns null silently.
    _admin = _services.GetService();
    if (_admin is not null)
    {
        try
        {
            await _admin.StartAsync(stoppingToken).ConfigureAwait(false);
        }
        catch (OperationCanceledException) when (stoppingToken.IsCancellationRequested)
        {
            // Host shutdown raced the admin start — not a startup failure.
            return;
        }
        catch (Exception ex)
        {
            _logger.LogError(ex, "Admin endpoint failed to start: {Message}", ex.Message);
        }
    }

    // ── 6. Keep the worker alive until the host signals stop ─────────────────────
    // Supervisors run their own background loops; ExecuteAsync just waits.
    await Task.Delay(Timeout.Infinite, stoppingToken).ConfigureAwait(false);
}
/// <summary>
/// Phase 12 (W1.5) — graceful shutdown sequence (replaces the deleted
/// ShutdownCoordinator):
/// <list type="number">
/// <item>Cancel ExecuteAsync via <c>base.StopAsync</c>.</item>
/// <item>Stop all supervisors in parallel, bounded by the live
/// <c>GracefulShutdownTimeoutMs</c> and by the host's stop token (no new connections;
/// existing pipes are cascaded by teardown).</item>
/// <item>Wait for in-flight PDUs to drain, using the same <c>GracefulShutdownTimeoutMs</c>
/// budget (read fresh from <c>IOptionsMonitor.CurrentValue</c> so a hot-reloaded value is
/// honoured at stop time). The wait also ends early if the host's stop token fires.</item>
/// <item>Stop the admin endpoint after the drain so the status page survives the drain
/// phase and an operator polling it sees the in-flight count fall to zero.</item>
/// <item>Dispose every supervisor to release sockets, channels, and watchdog timers.
/// A failure in one supervisor does not prevent the others from being disposed.</item>
/// </list>
/// Logs <c>mbproxy.shutdown.complete</c> on the way out with the in-flight count
/// observed when the drain phase ended (zero on a clean shutdown, positive when the
/// deadline forced the cancel).
/// </summary>
/// <param name="cancellationToken">Host token signalling that shutdown should no longer be graceful.</param>
public override async Task StopAsync(CancellationToken cancellationToken)
{
    // Cancel ExecuteAsync first.
    await base.StopAsync(cancellationToken).ConfigureAwait(false);
    var sw = Stopwatch.StartNew();

    // Phase 12 (W2.20) — supervisor stop deadline read from the live config so a
    // hot-reloaded GracefulShutdownTimeoutMs is honoured. Previously hard-coded 5 s.
    // The supervisor stop budget is bounded by the same total-shutdown budget.
    //
    // CancellationTokenSource rejects delays below -1 ms with ArgumentOutOfRangeException,
    // which would abort shutdown before anything is stopped or disposed. Treat such a
    // value as "no grace period". -1 (infinite) and 0 keep their previous meaning.
    int configuredMs = _options.CurrentValue.Connection.GracefulShutdownTimeoutMs;
    int gracefulMs = configuredMs < Timeout.Infinite ? 0 : configuredMs;

    // ── 1. Stop accepting new connections ─────────────────────────────────────────
    using var stopCts = new CancellationTokenSource(TimeSpan.FromMilliseconds(gracefulMs));
    using var linked = CancellationTokenSource.CreateLinkedTokenSource(
        stopCts.Token, cancellationToken);
    try
    {
        var stopTasks = _supervisors.Values
            .Select(s => s.StopAsync(linked.Token))
            .ToArray();
        await Task.WhenAll(stopTasks).ConfigureAwait(false);
    }
    catch (Exception ex)
    {
        // Best effort — don't let individual supervisor failures block shutdown.
        _logger.LogWarning(ex, "One or more supervisors did not stop cleanly; continuing shutdown.");
    }

    // ── 2. Drain in-flight PDUs ───────────────────────────────────────────────────
    // Same `gracefulMs` budget the supervisor-stop step used. The wait is also tied to
    // the host's token so an expired host shutdown timeout ends the drain immediately.
    if (gracefulMs > 0)
    {
        using var drainCts = CancellationTokenSource.CreateLinkedTokenSource(cancellationToken);
        drainCts.CancelAfter(gracefulMs);
        try
        {
            while (CountInFlight() > 0)
            {
                await Task.Delay(10, drainCts.Token).ConfigureAwait(false);
            }
        }
        catch (OperationCanceledException)
        {
            // Drain deadline reached (or host demanded an immediate stop).
        }
    }

    // Sample once after the drain phase, however it ended, so the shutdown log never
    // reports zero while PDUs are still outstanding.
    int inFlightAtCancel = CountInFlight();

    // ── 3. Stop admin endpoint after the drain ────────────────────────────────────
    if (_admin is not null)
    {
        try
        {
            using var adminCts = new CancellationTokenSource(TimeSpan.FromSeconds(2));
            await _admin.StopAsync(adminCts.Token).ConfigureAwait(false);
        }
        catch (Exception ex)
        {
            // Best-effort.
            _logger.LogWarning(ex, "Admin endpoint did not stop cleanly; continuing shutdown.");
        }
    }

    // ── 4. Dispose supervisors (releases sockets, channels, watchdog timers) ─────
    foreach (var supervisor in _supervisors.Values)
    {
        try
        {
            await supervisor.DisposeAsync().ConfigureAwait(false);
        }
        catch (Exception ex)
        {
            // Keep going: the remaining supervisors still hold resources to release.
            _logger.LogWarning(ex, "A supervisor failed to dispose cleanly; continuing shutdown.");
        }
    }

    _supervisors.Clear();
    LogShutdownComplete(_logger, inFlightAtCancel, sw.ElapsedMilliseconds);
}
/// <summary>
/// Adds up the <c>InFlightCount</c> from a fresh counter snapshot of every supervisor
/// currently in the dictionary. Used by the shutdown drain loop.
/// </summary>
/// <returns>The summed in-flight count (each supervisor's value is narrowed to <c>int</c>).</returns>
private int CountInFlight() =>
    _supervisors.Values.Aggregate(
        0,
        (sum, current) => sum + (int)current.CurrentCounters.Snapshot().InFlightCount);
// ── Helpers & logging ─────────────────────────────────────────────────────────────────
/// <summary>
/// Phase 11 — reports whether at least one tag in the resolved map has a positive
/// <c>CacheTtlMs</c>. ExecuteAsync uses the answer to decide whether to construct a
/// <c>ResponseCache</c> for the PLC at all: a PLC with no cacheable tags gets none
/// (no eviction timer, no per-call cache resolution cost), so the default-OFF
/// deployment is byte-identical to a Phase-10 deployment.
/// </summary>
/// <param name="map">Resolved BCD tag map for one PLC.</param>
/// <returns><c>true</c> as soon as a tag with <c>CacheTtlMs &gt; 0</c> is found; otherwise <c>false</c>.</returns>
private static bool HasAnyCacheableTag(BcdTagMap map)
{
    bool anyCacheable = false;
    foreach (var tag in map.All)
    {
        if (tag.CacheTtlMs > 0)
        {
            // One opted-in tag is enough; no need to scan the rest.
            anyCacheable = true;
            break;
        }
    }

    return anyCacheable;
}
// Source-generated log methods (LoggerMessage attribute on partial methods).

// Startup summary: number of supervisors in the Bound state versus the number of PLCs
// in the configuration. Called from ExecuteAsync.
[LoggerMessage(EventId = 1, EventName = "mbproxy.startup.ready",
Level = LogLevel.Information,
Message = "mbproxy service ready — ListenersBound={ListenersBound} PlcsConfigured={PlcsConfigured}")]
private static partial void LogStartupReady(ILogger logger, int listenersBound, int plcsConfigured);
// Per-listener bind failure. NOTE(review): no call site in the visible part of this
// class — presumably used from another partial declaration or left over; confirm.
[LoggerMessage(EventId = 21, EventName = "mbproxy.startup.bind.failed",
Level = LogLevel.Error,
Message = "Failed to bind listener: Plc={Plc} Port={Port} Reason={Reason}")]
private static partial void LogBindFailed(ILogger logger, string plc, int port, string reason);
// Phase 12 (W1.5) — moved here from the deleted ShutdownCoordinator.
// Final shutdown record: in-flight count captured by StopAsync and the elapsed time
// measured from just after base.StopAsync returned.
[LoggerMessage(EventId = 80, EventName = "mbproxy.shutdown.complete",
Level = LogLevel.Information,
Message = "Graceful shutdown complete: InFlightAtCancel={InFlightAtCancel} ElapsedMs={ElapsedMs}")]
private static partial void LogShutdownComplete(ILogger logger, int inFlightAtCancel, long elapsedMs);
}