// wwtools/mbproxy/src/Mbproxy/Proxy/ProxyWorker.cs

using System.Collections.Concurrent;
using System.Diagnostics;
using Mbproxy.Admin;
using Mbproxy.Bcd;
using Mbproxy.Configuration;
using Mbproxy.Options;
using Mbproxy.Proxy.Cache;
using Mbproxy.Proxy.Multiplexing;
using Mbproxy.Proxy.Supervision;
using Microsoft.Extensions.Options;

namespace Mbproxy.Proxy;

/// <summary>
/// <see cref="BackgroundService"/> that owns all <see cref="PlcListenerSupervisor"/> instances.
///
/// Startup posture (matches docs/Architecture/Overview.md "eager, continue on per-port failure"):
/// <list type="number">
///   <item>Enumerate <see cref="MbproxyOptions.Plcs"/> and build one supervisor per PLC
///         whose BCD tag map resolves without errors.</item>
///   <item>Hand the supervisor dictionary to <see cref="ConfigReconciler.Attach"/>
///         (before any supervisor is started) so hot-reload changes are applied by the
///         reconciler.</item>
///   <item>Start all supervisors in parallel. Each supervisor attempts to bind immediately
///         and enters the Polly recovery loop if the bind fails.</item>
///   <item>After all supervisors have completed their first bind attempt (reached
///         <see cref="SupervisorState.Bound"/> or <see cref="SupervisorState.Recovering"/>),
///         or after a 30-second deadline, log <c>mbproxy.startup.ready</c> with
///         bound/configured counts.</item>
///   <item>Start the admin endpoint (when registered) and idle until the host stops.</item>
/// </list>
///
/// Stop: stops all supervisors in parallel, bounded by the live
/// <c>Connection.GracefulShutdownTimeoutMs</c>, then stops the admin endpoint and
/// disposes every supervisor. See <see cref="StopAsync"/> for the exact ordering.
/// </summary>
internal sealed partial class ProxyWorker : BackgroundService
{
    // Upper bound on how long startup waits for every supervisor's first bind attempt
    // before logging mbproxy.startup.ready anyway.
    private static readonly TimeSpan InitialBindWaitTimeout = TimeSpan.FromSeconds(30);

    // Hard deadline for the admin endpoint's own StopAsync during shutdown.
    private static readonly TimeSpan AdminStopTimeout = TimeSpan.FromSeconds(2);

    private readonly IOptionsMonitor<MbproxyOptions> _options;
    private readonly IPduPipeline _pipeline;
    private readonly ILogger<ProxyWorker> _logger;
    private readonly ILoggerFactory _loggerFactory;
    private readonly ConfigReconciler _reconciler;
    // Admin endpoint is not registered as IHostedService; ProxyWorker drives its
    // lifecycle directly so the design's "drain THEN stop admin" ordering is honoured.
    //
    // Resolved LAZILY (in ExecuteAsync) rather than in the constructor because the DI
    // graph is circular: AdminEndpointHost → StatusSnapshotBuilder → ProxyWorker. A
    // constructor GetService<AdminEndpointHost>() during ProxyWorker's own construction
    // returns null silently. Lazy resolution sidesteps the cycle — by the time
    // ExecuteAsync runs the DI container is fully built.
    private readonly IServiceProvider _services;
    private AdminEndpointHost? _admin;

    // Per-PLC tag-value captures for the connection-detail debug view. Populated as
    // each PerPlcContext is built; the admin SignalR layer arms/disarms entries.
    private readonly TagCaptureRegistry _captureRegistry;

    // Supervisors are managed jointly by ProxyWorker (initial bootstrap) and
    // ConfigReconciler (subsequent hot-reload changes). The dictionary is shared via
    // ConfigReconciler.Attach() once the initial set has been built, before any
    // supervisor is started.
    //
    // ConcurrentDictionary because ConfigReconciler mutates this from parallel
    // Task.WhenAll continuations (Add/Remove/Restart paths). The outer Apply is
    // serialised by a semaphore but the inner per-PLC tasks run concurrently.
    // Status-page reads via IReadOnlyDictionary still work without locking.
    private readonly ConcurrentDictionary<string, PlcListenerSupervisor> _supervisors =
        new(StringComparer.Ordinal);

    /// <summary>
    /// Read-only view of the live supervisor dictionary. Consumed by
    /// <see cref="Admin.StatusSnapshotBuilder"/> to enumerate per-PLC state.
    /// The caller should read this on the status-page path only (not the hot path).
    /// </summary>
    internal IReadOnlyDictionary<string, PlcListenerSupervisor> Supervisors => _supervisors;

    /// <summary>
    /// Stores the injected dependencies. No work is done here; in particular the admin
    /// endpoint is NOT resolved (see the <c>_admin</c> field comment for why).
    /// </summary>
    public ProxyWorker(
        IOptionsMonitor<MbproxyOptions> options,
        IPduPipeline pipeline,
        ILogger<ProxyWorker> logger,
        ILoggerFactory loggerFactory,
        ConfigReconciler reconciler,
        TagCaptureRegistry captureRegistry,
        IServiceProvider services)
    {
        _options         = options;
        _pipeline        = pipeline;
        _logger          = logger;
        _loggerFactory   = loggerFactory;
        _reconciler      = reconciler;
        _captureRegistry = captureRegistry;
        _services        = services;
        // Admin endpoint resolved lazily in ExecuteAsync (see field comment).
    }

    /// <summary>
    /// Builds and starts one supervisor per valid PLC, attaches the reconciler, logs
    /// <c>mbproxy.startup.ready</c>, starts the admin endpoint, then waits until
    /// <paramref name="stoppingToken"/> is cancelled.
    /// </summary>
    /// <param name="stoppingToken">Cancelled by the host when the service is stopping.</param>
    protected override async Task ExecuteAsync(CancellationToken stoppingToken)
    {
        var opts = _options.CurrentValue;
        int plcsConfigured = opts.Plcs.Count;

        // Live accessors so hot-reloaded `Resilience.ReadCoalescing` and
        // `Connection.Keepalive` values are read at use time rather than frozen at
        // startup. They only capture `this`, so one instance of each is shared by every
        // supervisor and by the reconciler (which uses them on its add/restart paths).
        Func<ReadCoalescingOptions> coalescingAccessor =
            () => _options.CurrentValue.Resilience.ReadCoalescing;
        Func<KeepaliveOptions> keepaliveAccessor =
            () => _options.CurrentValue.Connection.Keepalive;

        // ── 1. Build per-PLC BCD tag maps ────────────────────────────────────────────
        var plcContexts = BuildPlcContexts(opts);

        // ── 2 + 3. Build Polly pipelines and supervisors ─────────────────────────────
        BuildSupervisors(opts, plcContexts, coalescingAccessor, keepaliveAccessor);

        // ── Wire reconciler BEFORE starting supervisors ──────────────────────────
        // Attach hands the reconciler the authoritative supervisor dictionary and the
        // initial options snapshot. The reconciler won't process OnChange events until
        // after this call — the brief window between Attach and first supervisor start
        // is safe because the channel signal only enqueues; apply runs asynchronously.
        _reconciler.Attach(_supervisors, opts, coalescingAccessor, keepaliveAccessor);

        // ── 4 + 5. Start supervisors and wait for their first bind attempt ───────────
        // With nothing to start (no PLCs configured, or every PLC failed its BCD map)
        // the bound count is simply 0; startup still proceeds to the admin endpoint so
        // the status page is reachable and hot-reloaded PLCs can be observed.
        int boundCount = _supervisors.IsEmpty
            ? 0
            : await StartSupervisorsAsync(stoppingToken).ConfigureAwait(false);
        LogStartupReady(_logger, boundCount, plcsConfigured);

        // Start the admin endpoint AFTER listeners are bound so the status page can
        // never observe the service in a "no PLCs configured yet" state. The admin
        // endpoint is not registered as IHostedService (the host's reverse stop order
        // would tear it down BEFORE drain) — ProxyWorker drives both ends.
        await StartAdminEndpointAsync(stoppingToken).ConfigureAwait(false);

        // ── 6. Keep the worker alive until the host signals stop ─────────────────────
        // Supervisors run their own background loops; ExecuteAsync just waits.
        await Task.Delay(Timeout.Infinite, stoppingToken).ConfigureAwait(false);
    }

    /// <summary>
    /// Resolves the BCD tag map for every configured PLC and wraps each successful
    /// result in a <see cref="PerPlcContext"/>. Warnings are logged; a PLC with any map
    /// error is logged and omitted from the result, so it gets no listener.
    /// </summary>
    private Dictionary<string, PerPlcContext> BuildPlcContexts(MbproxyOptions opts)
    {
        var plcContexts = new Dictionary<string, PerPlcContext>(opts.Plcs.Count, StringComparer.Ordinal);

        foreach (var plc in opts.Plcs)
        {
            var result = BcdTagMapBuilder.Build(opts.BcdTags, plc.BcdTags, plc.DefaultCacheTtlMs);

            foreach (var warn in result.Warnings)
            {
                _logger.LogWarning("[{Plc}] BCD tag map warning: {Message}", plc.Name, warn.Message);
            }

            if (result.Errors.Count > 0)
            {
                foreach (var err in result.Errors)
                {
                    _logger.LogError("[{Plc}] BCD tag map error ({Kind}): {Message}",
                        plc.Name, err.Kind, err.Message);
                }

                _logger.LogError("Skipping listener for PLC '{Plc}' due to BCD tag map errors.", plc.Name);
                continue;
            }

            // Construct a per-PLC response cache only when at least one resolved tag
            // opts in (CacheTtlMs > 0). Skipping cache construction for a PLC with no
            // cacheable tags keeps the no-cache path free of the eviction timer and the
            // per-call resolution cost, preserving the "no caching" default behaviour
            // when no operator has opted any tag in.
            var cache = HasAnyCacheableTag(result.Map)
                ? new ResponseCache(opts.Cache.MaxEntriesPerPlc, opts.Cache.EvictionIntervalMs)
                : null;

            plcContexts[plc.Name] = new PerPlcContext
            {
                PlcName  = plc.Name,
                TagMap   = result.Map,
                Counters = new ProxyCounters(),
                Logger   = _loggerFactory.CreateLogger($"Mbproxy.Proxy.BcdRewriter.{plc.Name}"),
                Cache    = cache,
                Capture  = _captureRegistry.GetOrCreate(plc.Name, result.Map),
            };
        }

        return plcContexts;
    }

    /// <summary>
    /// Creates one <see cref="PlcListenerSupervisor"/> for every PLC present in
    /// <paramref name="plcContexts"/> and stores it in <c>_supervisors</c> keyed by
    /// PLC name. PLCs missing from the dictionary (BCD map failed) are skipped.
    /// </summary>
    private void BuildSupervisors(
        MbproxyOptions opts,
        Dictionary<string, PerPlcContext> plcContexts,
        Func<ReadCoalescingOptions> coalescingAccessor,
        Func<KeepaliveOptions> keepaliveAccessor)
    {
        // The backend-connect pipeline is built once from ResilienceOptions and reused
        // across all PLCs.
        var resilienceOpts  = opts.Resilience;
        var backendPipeline = PolicyFactory.BuildBackendConnect(
            resilienceOpts.BackendConnect,
            _loggerFactory.CreateLogger("Mbproxy.Proxy.BackendConnect"));

        foreach (var plc in opts.Plcs)
        {
            if (!plcContexts.TryGetValue(plc.Name, out var perPlcContext))
            {
                continue; // BCD map failed — skip this PLC.
            }

            // Each supervisor gets its own recovery pipeline (with its own logger scope).
            var recoveryPipeline = PolicyFactory.BuildListenerRecovery(
                resilienceOpts.ListenerRecovery,
                _loggerFactory.CreateLogger($"Mbproxy.Proxy.ListenerRecovery.{plc.Name}"));

            _supervisors[plc.Name] = new PlcListenerSupervisor(
                plc,
                opts.Connection,
                _pipeline,
                _loggerFactory.CreateLogger<PlcListener>(),
                _loggerFactory.CreateLogger<PlcMultiplexer>(),
                _loggerFactory.CreateLogger($"Mbproxy.Proxy.UpstreamPipe.{plc.Name}"),
                perPlcContext,
                recoveryPipeline,
                _loggerFactory.CreateLogger<PlcListenerSupervisor>(),
                backendPipeline,
                coalescingAccessor,
                keepaliveAccessor);
        }
    }

    /// <summary>
    /// Starts every supervisor in parallel, then waits (at most
    /// <see cref="InitialBindWaitTimeout"/>, or until the service is stopping) for each
    /// one to finish its first bind attempt.
    /// </summary>
    /// <returns>Number of supervisors whose snapshot reports
    /// <see cref="SupervisorState.Bound"/> once the wait ends.</returns>
    private async Task<int> StartSupervisorsAsync(CancellationToken stoppingToken)
    {
        var startTasks = _supervisors.Values
            .Select(s => s.StartAsync(stoppingToken))
            .ToArray();
        await Task.WhenAll(startTasks).ConfigureAwait(false);

        // "Ready" = every supervisor has transitioned out of Stopped (i.e. reached
        // Bound or Recovering from its first attempt).
        using var readyCts = new CancellationTokenSource(InitialBindWaitTimeout);
        using var readyLinked = CancellationTokenSource.CreateLinkedTokenSource(
            readyCts.Token, stoppingToken);

        var waitTasks = _supervisors.Values
            .Select(s => s.WaitForInitialBindAttemptAsync(readyLinked.Token))
            .ToArray();

        try
        {
            await Task.WhenAll(waitTasks).ConfigureAwait(false);
        }
        catch (OperationCanceledException)
        {
            // Either the deadline fired or the service is stopping. Fall through and
            // report whatever is bound right now.
        }

        return _supervisors.Values.Count(s => s.Snapshot().State == SupervisorState.Bound);
    }

    /// <summary>
    /// Lazily resolves and starts the admin endpoint. A start failure is logged and
    /// swallowed; a missing registration is logged as a warning. Neither blocks startup.
    /// </summary>
    private async Task StartAdminEndpointAsync(CancellationToken stoppingToken)
    {
        // Resolution happens here, not in the constructor — the DI graph is circular
        // (admin → StatusSnapshotBuilder → ProxyWorker) and a constructor-time lookup
        // returns null silently.
        _admin = _services.GetService<AdminEndpointHost>();
        if (_admin is null)
        {
            // Surface the absence. The lazy lookup returns null silently if
            // AddMbproxyAdmin() is missing from Program.cs; a single warning makes a
            // botched composition observable without blocking startup.
            _logger.LogWarning(
                "Admin endpoint not registered (AddMbproxyAdmin() missing from composition). " +
                "Status page will be unavailable; service continues without it.");
            return;
        }

        try
        {
            await _admin.StartAsync(stoppingToken).ConfigureAwait(false);
        }
        catch (Exception ex)
        {
            _logger.LogError(ex, "Admin endpoint failed to start: {Message}", ex.Message);
        }
    }

    /// <summary>
    /// Graceful shutdown sequence:
    /// <list type="number">
    ///   <item><b>Snapshot</b> per-PLC in-flight counts FIRST, before anything is
    ///         cancelled or stopped — this is the only honest reading of "how many
    ///         requests were in flight when we decided to stop." Once supervisors stop,
    ///         their multiplexers are torn down and the per-mux counter providers are
    ///         nulled, so any later read returns 0 regardless of what was actually
    ///         dropped.</item>
    ///   <item>Cancel <see cref="ExecuteAsync"/> via <c>base.StopAsync</c>.</item>
    ///   <item>Stop all supervisors with the configured graceful timeout. Supervisor
    ///         stop is the actual drain — it cancels the listener, which exits its
    ///         accept loop, which disposes the multiplexer, which cascades all attached
    ///         pipes. There is no separate "drain in-flight" phase because there is
    ///         nothing to drain that wouldn't be killed by the supervisor stop itself.</item>
    ///   <item>Stop the admin endpoint after the supervisors so the status page survives
    ///         the supervisor stop phase and operators can observe the live state right
    ///         up to shutdown.</item>
    ///   <item>Dispose every supervisor to release sockets, channels, and watchdog timers.</item>
    /// </list>
    /// Every phase after the snapshot is best-effort: a failure is logged as a warning
    /// and the sequence continues, so one misbehaving supervisor cannot leak the others
    /// or suppress the completion event.
    /// Logs <c>mbproxy.shutdown.complete</c> with <c>InFlightAtCancel</c> equal to the
    /// snapshot count from step 1 (= the number of in-flight requests dropped by the
    /// stop) and <c>ElapsedMs</c> measured from the moment <c>base.StopAsync</c> returned.
    /// </summary>
    /// <param name="cancellationToken">Host-supplied token signalling that shutdown
    /// should no longer be graceful; linked into the supervisor stop deadline.</param>
    public override async Task StopAsync(CancellationToken cancellationToken)
    {
        // Snapshot in-flight BEFORE base.StopAsync so the field matches its name: "the
        // count at the moment the host signalled stop", not "the count at the moment we
        // got around to computing it." `base.StopAsync` cancels the ExecuteAsync
        // stoppingToken; in the milliseconds before it returns, in-flight requests
        // whose responses arrive will be removed from _correlation and the watchdog can
        // clear stale entries — the count would otherwise drift downward.
        //
        // Must run BEFORE supervisor stop too: after supervisor.StopAsync, multiplexers
        // are disposed and CountInFlight returns 0 unconditionally.
        int inFlightAtCancel = CountInFlight();

        // Cancel ExecuteAsync before touching the supervisors.
        await base.StopAsync(cancellationToken).ConfigureAwait(false);

        var sw = Stopwatch.StartNew();

        // Supervisor stop deadline read from the live config so a hot-reloaded
        // GracefulShutdownTimeoutMs is honoured. Supervisor stop is the drain:
        // cancelling the supervisor cancels the listener, which exits accept, which
        // disposes the multiplexer, which cascades all attached pipes.
        int gracefulMs = _options.CurrentValue.Connection.GracefulShutdownTimeoutMs;

        // ── 1. Stop accepting new connections + drain (one combined phase) ────────────
        using var stopCts = new CancellationTokenSource(TimeSpan.FromMilliseconds(gracefulMs));
        using var linked  = CancellationTokenSource.CreateLinkedTokenSource(
            stopCts.Token, cancellationToken);

        try
        {
            // Projection lives inside the try so a StopAsync that throws synchronously
            // is treated like any other stop failure instead of aborting shutdown.
            var stopTasks = _supervisors.Values
                .Select(s => s.StopAsync(linked.Token))
                .ToArray();

            await Task.WhenAll(stopTasks).ConfigureAwait(false);
        }
        catch (Exception ex)
        {
            // Best effort — don't let individual supervisor failures block shutdown,
            // but leave a trace (this is also where a graceful-timeout expiry lands).
            _logger.LogWarning(ex,
                "One or more supervisors did not stop cleanly within {GracefulMs} ms; continuing shutdown.",
                gracefulMs);
        }

        // ── 2. Stop admin endpoint after the supervisors ──────────────────────────────
        if (_admin is not null)
        {
            try
            {
                using var adminCts = new CancellationTokenSource(AdminStopTimeout);
                await _admin.StopAsync(adminCts.Token).ConfigureAwait(false);
            }
            catch (Exception ex)
            {
                // Best-effort.
                _logger.LogWarning(ex, "Admin endpoint did not stop cleanly; continuing shutdown.");
            }
        }

        // ── 3. Dispose supervisors (releases sockets, channels, watchdog timers) ─────
        // Guarded per supervisor: one failing DisposeAsync must not leave the remaining
        // supervisors undisposed or prevent the shutdown-complete event below.
        foreach (var entry in _supervisors)
        {
            try
            {
                await entry.Value.DisposeAsync().ConfigureAwait(false);
            }
            catch (Exception ex)
            {
                _logger.LogWarning(ex,
                    "Supervisor for PLC '{Plc}' failed to dispose; continuing shutdown.", entry.Key);
            }
        }

        _supervisors.Clear();

        LogShutdownComplete(_logger, inFlightAtCancel, sw.ElapsedMilliseconds);
    }

    // ── Helpers ───────────────────────────────────────────────────────────────────────────

    /// <summary>
    /// Sums the <c>InFlightCount</c> of every supervisor's current counter snapshot.
    /// </summary>
    private int CountInFlight()
    {
        int total = 0;
        foreach (var supervisor in _supervisors.Values)
        {
            total += (int)supervisor.CurrentCounters.Snapshot().InFlightCount;
        }

        return total;
    }

    /// <summary>
    /// Returns <c>true</c> when at least one BcdTag in the resolved map has a positive
    /// <see cref="BcdTag.CacheTtlMs"/>. A PLC with no cacheable tags skips the
    /// <see cref="Mbproxy.Proxy.Cache.ResponseCache"/> entirely (no eviction timer, no
    /// per-call cache resolution cost), so the default-OFF deployment runs the
    /// no-cache code path.
    /// </summary>
    private static bool HasAnyCacheableTag(BcdTagMap map)
    {
        foreach (var t in map.All)
        {
            if (t.CacheTtlMs > 0)
            {
                return true;
            }
        }

        return false;
    }

    // ── Logging ───────────────────────────────────────────────────────────────────────────

    [LoggerMessage(EventId = 1, EventName = "mbproxy.startup.ready",
        Level = LogLevel.Information,
        Message = "mbproxy service ready — ListenersBound={ListenersBound} PlcsConfigured={PlcsConfigured}")]
    private static partial void LogStartupReady(ILogger logger, int listenersBound, int plcsConfigured);

    [LoggerMessage(EventId = 21, EventName = "mbproxy.startup.bind.failed",
        Level = LogLevel.Error,
        Message = "Failed to bind listener: Plc={Plc} Port={Port} Reason={Reason}")]
    private static partial void LogBindFailed(ILogger logger, string plc, int port, string reason);

    [LoggerMessage(EventId = 80, EventName = "mbproxy.shutdown.complete",
        Level = LogLevel.Information,
        Message = "Graceful shutdown complete: InFlightAtCancel={InFlightAtCancel} ElapsedMs={ElapsedMs}")]
    private static partial void LogShutdownComplete(ILogger logger, int inFlightAtCancel, long elapsedMs);
}