Files
wwtools/mbproxy/src/Mbproxy/Proxy/ProxyWorker.cs
T
Joseph Doherty 56eee3c563 mbproxy: initial commit through Phase 9 (TxId multiplexing)
Adds the mbproxy service end-to-end. Phases 00-08 implement the
production-ready single-listener / 1:1-backend transparent Modbus TCP
proxy with bidirectional BCD rewriting for the ~54-PLC DL205/DL260
fleet. Phase 9 replaces the connection layer with a single backend
socket per PLC plus MBAP TxId rewriting, lifting the H2-ECOM100's
4-concurrent-client cap as an operational ceiling.

Phase 9 additions of note:
- PlcMultiplexer + UpstreamPipe + TxIdAllocator + CorrelationMap
- InFlightRequest with IReadOnlyList<InterestedParty> (load-bearing
  for Phase 10 read coalescing — do not collapse to a single field)
- Per-request watchdog: surfaces Modbus exception 0x0B to upstream
  on BackendRequestTimeoutMs, defending against lost responses,
  dead-PLC paths, and pymodbus 3.13.0's concurrent-multiplexed-
  request bug (its ServerRequestHandler.last_pdu state race)
- Status DTO + HTML gain inFlight / maxInFlight / txIdWraps /
  disconnectCascades / queueDepth (Tier 1.6 in docs/kpi.md)

Tests: 263 unit + 38 E2E. Multiplexer correctness under truly
concurrent backend traffic is proved against a stub backend in
PlcMultiplexerTests; MultiplexerE2ETests paces requests so pymodbus
3.13's single-PDU framer stays in known-good mode.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-14 01:49:35 -04:00

219 lines
9.9 KiB
C#

using Mbproxy.Bcd;
using Mbproxy.Configuration;
using Mbproxy.Options;
using Mbproxy.Proxy.Multiplexing;
using Mbproxy.Proxy.Supervision;
using Microsoft.Extensions.Options;
using Polly;
namespace Mbproxy.Proxy;
/// <summary>
/// <see cref="BackgroundService"/> that owns all <see cref="PlcListenerSupervisor"/> instances.
///
/// Startup posture (matches design doc "eager, continue on per-port failure"):
/// <list type="number">
/// <item>Enumerate <see cref="MbproxyOptions.Plcs"/> and build one supervisor per PLC
/// (PLCs whose BCD tag map has errors are logged and skipped).</item>
/// <item>Start all supervisors in parallel. Each supervisor attempts to bind immediately
/// and enters the Polly recovery loop if the bind fails.</item>
/// <item>After all supervisors have completed their first bind attempt (reached
/// <see cref="SupervisorState.Bound"/> or <see cref="SupervisorState.Recovering"/>),
/// or a 30-second deadline has elapsed, log <c>mbproxy.startup.ready</c> with
/// bound/configured counts.</item>
/// </list>
///
/// Phase 06: passes the supervisor dictionary to <see cref="ConfigReconciler.Attach"/>
/// once the supervisors are built (before they are started) so hot-reload changes are
/// applied by the reconciler.
///
/// Stop: stops all supervisors in parallel under a 5-second deadline, then disposes
/// every supervisor. Individual stop/dispose failures are logged and never block shutdown.
/// </summary>
internal sealed partial class ProxyWorker : BackgroundService
{
    /// <summary>Upper bound on how long startup waits for every supervisor's first bind attempt.</summary>
    private static readonly TimeSpan InitialBindTimeout = TimeSpan.FromSeconds(30);

    /// <summary>Deadline handed (as a cancellation token) to every supervisor's stop call.</summary>
    private static readonly TimeSpan StopTimeout = TimeSpan.FromSeconds(5);

    private readonly IOptionsMonitor<MbproxyOptions> _options;
    private readonly IPduPipeline _pipeline;
    private readonly ILogger<ProxyWorker> _logger;
    private readonly ILoggerFactory _loggerFactory;
    private readonly ConfigReconciler _reconciler;

    // Phase 06: supervisors are now managed jointly by ProxyWorker (initial bootstrap)
    // and ConfigReconciler (subsequent hot-reload changes). The dictionary is shared
    // via ConfigReconciler.Attach() once the initial set of supervisors has been built.
    private readonly Dictionary<string, PlcListenerSupervisor> _supervisors = new(StringComparer.Ordinal);

    /// <summary>
    /// Read-only view of the live supervisor dictionary. Consumed by Phase 07's
    /// <see cref="Admin.StatusSnapshotBuilder"/> to enumerate per-PLC state.
    /// The caller should read this on the status-page path only (not the hot path).
    /// </summary>
    internal IReadOnlyDictionary<string, PlcListenerSupervisor> Supervisors => _supervisors;

    /// <summary>Creates the worker; all dependencies are stored as-is for later use.</summary>
    /// <param name="options">Source of the current <see cref="MbproxyOptions"/> snapshot read at startup.</param>
    /// <param name="pipeline">PDU pipeline handed to every supervisor.</param>
    /// <param name="logger">Logger for the worker's own startup/shutdown messages.</param>
    /// <param name="loggerFactory">Factory used to create the per-component and per-PLC loggers.</param>
    /// <param name="reconciler">Reconciler that receives the supervisor dictionary via <c>Attach</c>.</param>
    public ProxyWorker(
        IOptionsMonitor<MbproxyOptions> options,
        IPduPipeline pipeline,
        ILogger<ProxyWorker> logger,
        ILoggerFactory loggerFactory,
        ConfigReconciler reconciler)
    {
        _options = options;
        _pipeline = pipeline;
        _logger = logger;
        _loggerFactory = loggerFactory;
        _reconciler = reconciler;
    }

    /// <summary>
    /// Builds and starts one supervisor per configured PLC, logs
    /// <c>mbproxy.startup.ready</c>, then idles until <paramref name="stoppingToken"/> is cancelled.
    /// </summary>
    /// <param name="stoppingToken">Cancelled by the host when the service is stopping.</param>
    /// <exception cref="OperationCanceledException">
    /// Thrown when <paramref name="stoppingToken"/> is cancelled; this is the normal exit path.
    /// </exception>
    protected override async Task ExecuteAsync(CancellationToken stoppingToken)
    {
        var opts = _options.CurrentValue;
        int plcsConfigured = opts.Plcs.Count;

        // 1-3. Build the per-PLC contexts, then the supervisors that consume them.
        var plcContexts = BuildPlcContexts(opts);
        BuildSupervisors(opts, plcContexts);

        // Phase 06: wire the reconciler BEFORE starting supervisors.
        // Attach hands the reconciler the authoritative supervisor dictionary and the
        // initial options snapshot. The reconciler won't process OnChange events until
        // after this call — the brief window between Attach and first supervisor start
        // is safe because the channel signal only enqueues; apply runs asynchronously.
        _reconciler.Attach(_supervisors, opts);

        if (_supervisors.Count == 0)
        {
            // Nothing to start (no PLCs configured, or every PLC was skipped). Stay alive
            // so the host keeps running and the reconciler can act on later changes.
            LogStartupReady(_logger, 0, plcsConfigured);
            await Task.Delay(Timeout.Infinite, stoppingToken).ConfigureAwait(false);
            return;
        }

        // 4. Start all supervisors in parallel.
        var startTasks = _supervisors.Values
            .Select(s => s.StartAsync(stoppingToken))
            .ToArray();
        await Task.WhenAll(startTasks).ConfigureAwait(false);

        // 5. Wait (bounded) for every supervisor to complete its first bind attempt.
        await WaitForInitialBindAttemptsAsync(stoppingToken).ConfigureAwait(false);

        int boundCount = _supervisors.Values.Count(s => s.Snapshot().State == SupervisorState.Bound);
        LogStartupReady(_logger, boundCount, plcsConfigured);

        // 6. Keep the worker alive until the host signals stop.
        // Supervisors run their own background loops; ExecuteAsync just waits.
        await Task.Delay(Timeout.Infinite, stoppingToken).ConfigureAwait(false);
    }

    /// <summary>
    /// Cancels <see cref="ExecuteAsync"/>, stops every supervisor in parallel under
    /// <see cref="StopTimeout"/>, then disposes every supervisor and clears the dictionary.
    /// </summary>
    /// <param name="cancellationToken">Host shutdown token; linked with the stop deadline.</param>
    public override async Task StopAsync(CancellationToken cancellationToken)
    {
        // Cancel ExecuteAsync first.
        await base.StopAsync(cancellationToken).ConfigureAwait(false);

        // Snapshot once so stop and dispose operate on the same set of supervisors.
        var supervisors = _supervisors.ToArray();

        // Stop all supervisors in parallel; the deadline is delivered via the linked token.
        using var stopCts = new CancellationTokenSource(StopTimeout);
        using var linked = CancellationTokenSource.CreateLinkedTokenSource(
            stopCts.Token, cancellationToken);

        try
        {
            // Created inside the try so a synchronous throw from one StopAsync is also
            // treated as a best-effort failure instead of escaping before disposal.
            var stopTasks = supervisors
                .Select(entry => entry.Value.StopAsync(linked.Token))
                .ToArray();
            await Task.WhenAll(stopTasks).ConfigureAwait(false);
        }
        catch (OperationCanceledException)
        {
            // Best effort — the deadline or the host token fired before every supervisor stopped.
            _logger.LogWarning(
                "Supervisor shutdown was cancelled before all supervisors stopped; disposing anyway.");
        }
        catch (Exception ex)
        {
            // Best effort — don't let individual supervisor failures block shutdown.
            _logger.LogWarning(ex, "One or more supervisors failed to stop cleanly; disposing anyway.");
        }

        // Dispose each supervisor independently: one failing DisposeAsync must not
        // prevent the remaining supervisors from being disposed.
        foreach (var (plcName, supervisor) in supervisors)
        {
            try
            {
                await supervisor.DisposeAsync().ConfigureAwait(false);
            }
            catch (Exception ex)
            {
                _logger.LogWarning(ex, "[{Plc}] Supervisor dispose failed during shutdown.", plcName);
            }
        }

        _supervisors.Clear();
    }

    /// <summary>
    /// Builds the BCD tag map for every configured PLC and wraps it in a <see cref="PerPlcContext"/>.
    /// Warnings are logged; a PLC with any tag-map error is logged and left out of the result.
    /// </summary>
    private Dictionary<string, PerPlcContext> BuildPlcContexts(MbproxyOptions opts)
    {
        var contexts = new Dictionary<string, PerPlcContext>(opts.Plcs.Count, StringComparer.Ordinal);

        foreach (var plc in opts.Plcs)
        {
            var result = BcdTagMapBuilder.Build(opts.BcdTags, plc.BcdTags);

            foreach (var warn in result.Warnings)
            {
                _logger.LogWarning("[{Plc}] BCD tag map warning: {Message}", plc.Name, warn.Message);
            }

            if (result.Errors.Count > 0)
            {
                foreach (var err in result.Errors)
                {
                    _logger.LogError("[{Plc}] BCD tag map error ({Kind}): {Message}",
                        plc.Name, err.Kind, err.Message);
                }

                _logger.LogError("Skipping listener for PLC '{Plc}' due to BCD tag map errors.", plc.Name);
                continue;
            }

            contexts[plc.Name] = new PerPlcContext
            {
                PlcName = plc.Name,
                TagMap = result.Map,
                Counters = new ProxyCounters(),
                Logger = _loggerFactory.CreateLogger($"Mbproxy.Proxy.BcdRewriter.{plc.Name}"),
            };
        }

        return contexts;
    }

    /// <summary>
    /// Creates one <see cref="PlcListenerSupervisor"/> per PLC that has an entry in
    /// <paramref name="plcContexts"/> and registers it in the supervisor dictionary.
    /// </summary>
    private void BuildSupervisors(MbproxyOptions opts, Dictionary<string, PerPlcContext> plcContexts)
    {
        var resilienceOpts = opts.Resilience;

        // The backend-connect pipeline is built once and shared by every supervisor.
        var backendPipeline = PolicyFactory.BuildBackendConnect(
            resilienceOpts.BackendConnect,
            _loggerFactory.CreateLogger("Mbproxy.Proxy.BackendConnect"));

        foreach (var plc in opts.Plcs)
        {
            if (!plcContexts.TryGetValue(plc.Name, out var perPlcContext))
            {
                continue; // BCD map failed — skip this PLC.
            }

            // Each supervisor gets its own recovery pipeline (with its own logger scope).
            var recoveryPipeline = PolicyFactory.BuildListenerRecovery(
                resilienceOpts.ListenerRecovery,
                _loggerFactory.CreateLogger($"Mbproxy.Proxy.ListenerRecovery.{plc.Name}"));

            _supervisors[plc.Name] = new PlcListenerSupervisor(
                plc,
                opts.Connection,
                _pipeline,
                _loggerFactory.CreateLogger<PlcListener>(),
                _loggerFactory.CreateLogger<PlcMultiplexer>(),
                _loggerFactory.CreateLogger($"Mbproxy.Proxy.UpstreamPipe.{plc.Name}"),
                perPlcContext,
                recoveryPipeline,
                _loggerFactory.CreateLogger<PlcListenerSupervisor>(),
                backendPipeline);
        }
    }

    /// <summary>
    /// Waits until every supervisor has completed its first bind attempt, bounded by
    /// <see cref="InitialBindTimeout"/>. A cancellation that is not host shutdown is logged
    /// and startup continues; host shutdown propagates as <see cref="OperationCanceledException"/>
    /// so no "ready" event is emitted while stopping.
    /// </summary>
    private async Task WaitForInitialBindAttemptsAsync(CancellationToken stoppingToken)
    {
        using var readyCts = new CancellationTokenSource(InitialBindTimeout);
        using var readyLinked = CancellationTokenSource.CreateLinkedTokenSource(
            readyCts.Token, stoppingToken);

        var waitTasks = _supervisors.Values
            .Select(s => s.WaitForInitialBindAttemptAsync(readyLinked.Token))
            .ToArray();

        try
        {
            await Task.WhenAll(waitTasks).ConfigureAwait(false);
        }
        catch (OperationCanceledException) when (!stoppingToken.IsCancellationRequested)
        {
            // The service is not stopping, so the wait ended early (normally the deadline).
            // Report readiness with whatever is bound so far rather than blocking startup.
            _logger.LogWarning(
                "Initial bind wait ended before every supervisor reported a first bind attempt (deadline {DeadlineSeconds} s); continuing startup.",
                InitialBindTimeout.TotalSeconds);
        }
    }

    // ── Logging ───────────────────────────────────────────────────────────────────────────

    [LoggerMessage(EventId = 1, EventName = "mbproxy.startup.ready",
        Level = LogLevel.Information,
        Message = "mbproxy service ready — ListenersBound={ListenersBound} PlcsConfigured={PlcsConfigured}")]
    private static partial void LogStartupReady(ILogger logger, int listenersBound, int plcsConfigured);

    // Not referenced in this part of the class; retained because the class is partial
    // and the event id/name may be relied on elsewhere.
    [LoggerMessage(EventId = 21, EventName = "mbproxy.startup.bind.failed",
        Level = LogLevel.Error,
        Message = "Failed to bind listener: Plc={Plc} Port={Port} Reason={Reason}")]
    private static partial void LogBindFailed(ILogger logger, string plc, int port, string reason);
}