Files
scadalink-design/lmxproxy/src/ZB.MOM.WW.LmxProxy.Host/LmxProxyService.cs
Joseph Doherty eecd82b787 fix(lmxproxy): clean up stale session subscriptions on scavenge and add stream timeout
Grpc.Core doesn't reliably fire CancellationToken on client disconnect,
so Subscribe RPCs can hang forever and leak session subscriptions. Bridge
SessionManager scavenging to SubscriptionManager cleanup, and add a
30-second periodic session validity check in the Subscribe loop so stale
streams exit within 30s of session scavenge rather than hanging until
process restart.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-24 15:21:06 -04:00

236 lines
9.5 KiB
C#

using System;
using System.Threading;
using Grpc.Core;
using Grpc.Core.Interceptors;
using Serilog;
using ZB.MOM.WW.LmxProxy.Host.Configuration;
using ZB.MOM.WW.LmxProxy.Host.Grpc.Services;
using ZB.MOM.WW.LmxProxy.Host.MxAccess;
using ZB.MOM.WW.LmxProxy.Host.Security;
using ZB.MOM.WW.LmxProxy.Host.Health;
using ZB.MOM.WW.LmxProxy.Host.Metrics;
using ZB.MOM.WW.LmxProxy.Host.Sessions;
using ZB.MOM.WW.LmxProxy.Host.Status;
using ZB.MOM.WW.LmxProxy.Host.Subscriptions;
namespace ZB.MOM.WW.LmxProxy.Host
{
/// <summary>
/// Service lifecycle manager. Created by Topshelf, handles Start/Stop/Pause/Continue.
/// Creates the MxAccess client, subscription/session managers, status web server and
/// gRPC server in <see cref="Start"/> and tears them down again in <see cref="Stop"/>.
/// </summary>
public class LmxProxyService
{
    private static readonly ILogger Log = Serilog.Log.ForContext<LmxProxyService>();

    /// <summary>Maximum time to wait for the gRPC server shutdown task to complete.</summary>
    private static readonly TimeSpan GrpcShutdownTimeout = TimeSpan.FromSeconds(10);

    /// <summary>Maximum time to wait for the MxAccess client to finish disposing.</summary>
    private static readonly TimeSpan MxAccessDisposeTimeout = TimeSpan.FromSeconds(10);

    /// <summary>Inactivity timeout (minutes) passed to the <see cref="SessionManager"/>.</summary>
    private const int SessionInactivityTimeoutMinutes = 5;

    private readonly LmxProxyConfiguration _config;
    private MxAccessClient? _mxAccessClient;
    private SessionManager? _sessionManager;
    private SubscriptionManager? _subscriptionManager;
    private ApiKeyService? _apiKeyService;
    private PerformanceMetrics? _performanceMetrics;
    private HealthCheckService? _healthCheckService;
    private StatusReportService? _statusReportService;
    private StatusWebServer? _statusWebServer;
    private Server? _grpcServer;

    /// <summary>
    /// Creates the lifecycle manager. No components are created until <see cref="Start"/> runs.
    /// </summary>
    /// <param name="config">Service configuration used to build every component.</param>
    /// <exception cref="ArgumentNullException"><paramref name="config"/> is null.</exception>
    public LmxProxyService(LmxProxyConfiguration config)
    {
        _config = config ?? throw new ArgumentNullException(nameof(config));
    }

    /// <summary>
    /// Topshelf Start callback. Creates and starts all components.
    /// </summary>
    /// <returns>
    /// <c>true</c> when every step completed; <c>false</c> when any step threw. On failure the
    /// exception is logged and everything created so far is torn down again.
    /// </returns>
    public bool Start()
    {
        try
        {
            Log.Information("LmxProxy service starting...");

            // 1. Validate configuration
            ConfigurationValidator.ValidateAndLog(_config);

            // 2. Build the gRPC server credentials from the TLS configuration
            var credentials = TlsCertificateManager.CreateServerCredentials(_config.Tls);

            // 3. Create ApiKeyService
            var apiKeyService = new ApiKeyService(_config.ApiKeyConfigFile);
            _apiKeyService = apiKeyService;

            // 4. Create MxAccessClient
            var mxAccessClient = new MxAccessClient(
                maxConcurrentOperations: _config.Connection.MaxConcurrentOperations,
                readTimeoutSeconds: _config.Connection.ReadTimeoutSeconds,
                writeTimeoutSeconds: _config.Connection.WriteTimeoutSeconds,
                monitorIntervalSeconds: _config.Connection.MonitorIntervalSeconds,
                autoReconnect: _config.Connection.AutoReconnect,
                nodeName: _config.Connection.NodeName,
                galaxyName: _config.Connection.GalaxyName,
                probeTestTagAddress: _config.HealthCheck.TestTagAddress,
                probeStaleThresholdMs: _config.HealthCheck.ProbeStaleThresholdMs,
                clientName: _config.ClientName);
            _mxAccessClient = mxAccessClient;

            // 5. Connect to MxAccess synchronously (with timeout)
            Log.Information("Connecting to MxAccess (timeout: {Timeout}s)...",
                _config.Connection.ConnectionTimeoutSeconds);
            using (var cts = new CancellationTokenSource(
                TimeSpan.FromSeconds(_config.Connection.ConnectionTimeoutSeconds)))
            {
                mxAccessClient.ConnectAsync(cts.Token).GetAwaiter().GetResult();
            }

            // 6. Start auto-reconnect monitor
            mxAccessClient.StartMonitorLoop();

            // 7. Create SubscriptionManager
            var subscriptionManager = new SubscriptionManager(
                mxAccessClient,
                _config.Subscription.ChannelCapacity,
                ParseChannelFullMode(_config.Subscription.ChannelFullMode));
            _subscriptionManager = subscriptionManager;

            // Wire MxAccessClient data change events to SubscriptionManager
            mxAccessClient.OnTagValueChanged = subscriptionManager.OnTagValueChanged;

            // Wire MxAccessClient connection state changes to SubscriptionManager.
            // The handlers capture the local instance rather than the nullable field.
            mxAccessClient.ConnectionStateChanged += (sender, e) =>
            {
                if (e.CurrentState == Domain.ConnectionState.Disconnected ||
                    e.CurrentState == Domain.ConnectionState.Error)
                {
                    subscriptionManager.NotifyDisconnection();
                }
                else if (e.CurrentState == Domain.ConnectionState.Connected &&
                         e.PreviousState == Domain.ConnectionState.Reconnecting)
                {
                    subscriptionManager.NotifyReconnection();
                }
            };

            // 8. Create SessionManager and drop a session's subscriptions when it is scavenged
            var sessionManager = new SessionManager(
                inactivityTimeoutMinutes: SessionInactivityTimeoutMinutes);
            _sessionManager = sessionManager;
            sessionManager.OnSessionScavenged(sessionId =>
            {
                Log.Information("Cleaning up subscriptions for scavenged session {SessionId}", sessionId);
                subscriptionManager.UnsubscribeClient(sessionId);
            });

            // 9. Create performance metrics
            var performanceMetrics = new PerformanceMetrics();
            _performanceMetrics = performanceMetrics;

            // 10. Create health check service
            var healthCheckService = new HealthCheckService(
                mxAccessClient, subscriptionManager, performanceMetrics);
            _healthCheckService = healthCheckService;

            // 11. Create status report service
            var statusReportService = new StatusReportService(
                mxAccessClient, subscriptionManager, performanceMetrics, healthCheckService);
            _statusReportService = statusReportService;

            // 12. Start status web server (failure here is not fatal)
            var statusWebServer = new StatusWebServer(_config.WebServer, statusReportService);
            _statusWebServer = statusWebServer;
            if (!statusWebServer.Start())
            {
                Log.Warning("Status web server failed to start — continuing without it");
            }

            // 13. Create gRPC service
            var grpcService = new ScadaGrpcService(
                mxAccessClient, sessionManager, subscriptionManager, performanceMetrics, apiKeyService);

            // 14. Create and configure interceptor
            var interceptor = new ApiKeyInterceptor(apiKeyService);

            // 15. Build and start gRPC server
            var grpcServer = new Server
            {
                Services =
                {
                    Scada.ScadaService.BindService(grpcService)
                        .Intercept(interceptor)
                },
                Ports =
                {
                    new ServerPort("0.0.0.0", _config.GrpcPort, credentials)
                }
            };
            _grpcServer = grpcServer;
            grpcServer.Start();
            Log.Information("gRPC server started on port {Port}", _config.GrpcPort);

            Log.Information("LmxProxy service started successfully");
            return true;
        }
        catch (Exception ex)
        {
            Log.Fatal(ex, "LmxProxy service failed to start");

            // Do not leave a half-started service behind: release whatever was created
            // before the failure (connected MxAccess client, monitor loop, web server, ...).
            ShutdownComponents();
            return false;
        }
    }

    /// <summary>
    /// Topshelf Stop callback. Stops and disposes all components. Each teardown step is
    /// isolated, so a failure in one step is logged and the remaining steps still run.
    /// Calling it again after components were released is a no-op for those components.
    /// </summary>
    /// <returns>Always <c>true</c>.</returns>
    public bool Stop()
    {
        Log.Information("LmxProxy service stopping...");
        ShutdownComponents();
        Log.Information("LmxProxy service stopped");
        return true;
    }

    /// <summary>Topshelf Pause callback -- no-op.</summary>
    public bool Pause()
    {
        Log.Information("LmxProxy service paused (no-op)");
        return true;
    }

    /// <summary>Topshelf Continue callback -- no-op.</summary>
    public bool Continue()
    {
        Log.Information("LmxProxy service continued (no-op)");
        return true;
    }

    /// <summary>
    /// Maps the configured channel-full-mode text to a <c>BoundedChannelFullMode</c>.
    /// "DropNewest" and "Wait" are matched case-insensitively; anything else, including
    /// null, falls back to <c>DropOldest</c>.
    /// </summary>
    private static System.Threading.Channels.BoundedChannelFullMode ParseChannelFullMode(string? value)
    {
        // Static string.Equals is null-safe, unlike value.Equals(...).
        if (string.Equals(value, "DropNewest", StringComparison.OrdinalIgnoreCase))
        {
            return System.Threading.Channels.BoundedChannelFullMode.DropNewest;
        }

        if (string.Equals(value, "Wait", StringComparison.OrdinalIgnoreCase))
        {
            return System.Threading.Channels.BoundedChannelFullMode.Wait;
        }

        return System.Threading.Channels.BoundedChannelFullMode.DropOldest;
    }

    /// <summary>
    /// Tears down every component that currently exists. Never throws: each step runs
    /// through <see cref="RunShutdownStep"/>. Component fields are cleared as they are
    /// released so a repeated call does not dispose or shut down the same instance twice.
    /// </summary>
    private void ShutdownComponents()
    {
        // 1. Stop reconnect monitor (the client itself is disposed in the last step)
        RunShutdownStep("stop MxAccess monitor loop", () => _mxAccessClient?.StopMonitorLoop());

        // 2. Stop status web server
        RunShutdownStep("stop status web server", () =>
        {
            var webServer = _statusWebServer;
            _statusWebServer = null;
            webServer?.Stop();
        });

        // 3. Shut down the gRPC server before disposing anything its service was built with
        RunShutdownStep("shut down gRPC server", ShutdownGrpcServer);

        // 4. Dispose performance metrics
        RunShutdownStep("dispose performance metrics", () =>
        {
            var metrics = _performanceMetrics;
            _performanceMetrics = null;
            metrics?.Dispose();
        });

        // 5. Dispose the session manager before the subscription manager: its scavenge
        //    callback calls into the subscription manager.
        RunShutdownStep("dispose session manager", () =>
        {
            var sessions = _sessionManager;
            _sessionManager = null;
            sessions?.Dispose();
        });

        // 6. Dispose subscription manager
        RunShutdownStep("dispose subscription manager", () =>
        {
            var subscriptions = _subscriptionManager;
            _subscriptionManager = null;
            subscriptions?.Dispose();
        });

        // 7. Dispose API key service
        RunShutdownStep("dispose API key service", () =>
        {
            var apiKeys = _apiKeyService;
            _apiKeyService = null;
            apiKeys?.Dispose();
        });

        // 8. Disconnect MxAccess (bounded wait)
        RunShutdownStep("disconnect MxAccess", DisconnectMxAccess);
    }

    /// <summary>
    /// Requests gRPC server shutdown and waits up to <see cref="GrpcShutdownTimeout"/>.
    /// A timeout is reported as a warning instead of being logged as a clean stop.
    /// </summary>
    private void ShutdownGrpcServer()
    {
        var server = _grpcServer;
        if (server == null)
        {
            return;
        }

        // Clear first so a second Stop never requests shutdown on the same instance again.
        _grpcServer = null;

        Log.Information("Shutting down gRPC server...");
        if (server.ShutdownAsync().Wait(GrpcShutdownTimeout))
        {
            Log.Information("gRPC server stopped");
        }
        else
        {
            Log.Warning(
                "gRPC server did not shut down within {Timeout}s; continuing with teardown",
                GrpcShutdownTimeout.TotalSeconds);
        }
    }

    /// <summary>
    /// Disposes the MxAccess client and waits up to <see cref="MxAccessDisposeTimeout"/>.
    /// A timeout is reported as a warning instead of being logged as a clean disconnect.
    /// </summary>
    private void DisconnectMxAccess()
    {
        var client = _mxAccessClient;
        if (client == null)
        {
            return;
        }

        _mxAccessClient = null;

        Log.Information("Disconnecting from MxAccess...");
        if (client.DisposeAsync().AsTask().Wait(MxAccessDisposeTimeout))
        {
            Log.Information("MxAccess disconnected");
        }
        else
        {
            Log.Warning(
                "MxAccess disconnect did not complete within {Timeout}s",
                MxAccessDisposeTimeout.TotalSeconds);
        }
    }

    /// <summary>
    /// Runs one teardown action and logs (rather than propagates) any exception, so a
    /// failing step cannot prevent the following steps from running.
    /// </summary>
    /// <param name="step">Short description of the step, used in the error log.</param>
    /// <param name="action">The teardown action to run.</param>
    private static void RunShutdownStep(string step, Action action)
    {
        try
        {
            action();
        }
        catch (Exception ex)
        {
            Log.Error(ex, "Error during shutdown ({Step})", step);
        }
    }
}
}