Grpc.Core doesn't reliably fire CancellationToken on client disconnect, so Subscribe RPCs can hang forever and leak session subscriptions. Bridge SessionManager scavenging to SubscriptionManager cleanup, and add a 30-second periodic session validity check in the Subscribe loop so stale streams exit within 30s of session scavenge rather than hanging until process restart. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
236 lines
9.5 KiB
C#
236 lines
9.5 KiB
C#
using System;
|
|
using System.Threading;
|
|
using Grpc.Core;
|
|
using Grpc.Core.Interceptors;
|
|
using Serilog;
|
|
using ZB.MOM.WW.LmxProxy.Host.Configuration;
|
|
using ZB.MOM.WW.LmxProxy.Host.Grpc.Services;
|
|
using ZB.MOM.WW.LmxProxy.Host.MxAccess;
|
|
using ZB.MOM.WW.LmxProxy.Host.Security;
|
|
using ZB.MOM.WW.LmxProxy.Host.Health;
|
|
using ZB.MOM.WW.LmxProxy.Host.Metrics;
|
|
using ZB.MOM.WW.LmxProxy.Host.Sessions;
|
|
using ZB.MOM.WW.LmxProxy.Host.Status;
|
|
using ZB.MOM.WW.LmxProxy.Host.Subscriptions;
|
|
|
|
namespace ZB.MOM.WW.LmxProxy.Host
|
|
{
|
|
/// <summary>
/// Service lifecycle manager. Created by Topshelf, handles Start/Stop/Pause/Continue.
/// </summary>
/// <remarks>
/// <see cref="Start"/> builds every component in dependency order and keeps each one in a field
/// as soon as it exists, so that a failure part-way through can release what was already created.
/// <see cref="Stop"/> releases the components; every shutdown step is isolated so that one failing
/// step does not prevent the remaining ones from running.
/// </remarks>
public class LmxProxyService
{
    private static readonly ILogger Log = Serilog.Log.ForContext<LmxProxyService>();

    /// <summary>Inactivity timeout, in minutes, passed to the <see cref="SessionManager"/>.</summary>
    private const int SessionInactivityTimeoutMinutes = 5;

    /// <summary>Maximum time to wait for the gRPC server to finish shutting down.</summary>
    private static readonly TimeSpan GrpcShutdownTimeout = TimeSpan.FromSeconds(10);

    /// <summary>Maximum time to wait for the MxAccess client to finish disposing.</summary>
    private static readonly TimeSpan MxAccessDisconnectTimeout = TimeSpan.FromSeconds(10);

    private readonly LmxProxyConfiguration _config;

    private MxAccessClient? _mxAccessClient;
    private SessionManager? _sessionManager;
    private SubscriptionManager? _subscriptionManager;
    private ApiKeyService? _apiKeyService;
    private PerformanceMetrics? _performanceMetrics;
    private HealthCheckService? _healthCheckService;
    private StatusReportService? _statusReportService;
    private StatusWebServer? _statusWebServer;
    private Server? _grpcServer;

    /// <summary>
    /// Creates the lifecycle manager. No component is created until <see cref="Start"/> runs.
    /// </summary>
    /// <param name="config">Service configuration used to build every component.</param>
    /// <exception cref="ArgumentNullException"><paramref name="config"/> is null.</exception>
    public LmxProxyService(LmxProxyConfiguration config)
    {
        _config = config ?? throw new ArgumentNullException(nameof(config));
    }

    /// <summary>
    /// Topshelf Start callback. Creates and starts all components.
    /// </summary>
    /// <returns>
    /// <c>true</c> when every component started; <c>false</c> when any step threw. On failure the
    /// exception is logged and the components created so far are shut down again.
    /// </returns>
    public bool Start()
    {
        try
        {
            Log.Information("LmxProxy service starting...");

            // 1. Validate configuration
            ConfigurationValidator.ValidateAndLog(_config);

            // 2. Check/generate TLS certificates
            var credentials = TlsCertificateManager.CreateServerCredentials(_config.Tls);

            // 3. Create ApiKeyService
            var apiKeyService = new ApiKeyService(_config.ApiKeyConfigFile);
            _apiKeyService = apiKeyService;

            // 4. Create MxAccessClient
            var mxAccessClient = new MxAccessClient(
                maxConcurrentOperations: _config.Connection.MaxConcurrentOperations,
                readTimeoutSeconds: _config.Connection.ReadTimeoutSeconds,
                writeTimeoutSeconds: _config.Connection.WriteTimeoutSeconds,
                monitorIntervalSeconds: _config.Connection.MonitorIntervalSeconds,
                autoReconnect: _config.Connection.AutoReconnect,
                nodeName: _config.Connection.NodeName,
                galaxyName: _config.Connection.GalaxyName,
                probeTestTagAddress: _config.HealthCheck.TestTagAddress,
                probeStaleThresholdMs: _config.HealthCheck.ProbeStaleThresholdMs,
                clientName: _config.ClientName);
            _mxAccessClient = mxAccessClient;

            // 5. Connect to MxAccess synchronously (with timeout).
            //    Start() is a synchronous callback, so the connect task is blocked on here.
            Log.Information("Connecting to MxAccess (timeout: {Timeout}s)...",
                _config.Connection.ConnectionTimeoutSeconds);
            using (var cts = new CancellationTokenSource(
                TimeSpan.FromSeconds(_config.Connection.ConnectionTimeoutSeconds)))
            {
                mxAccessClient.ConnectAsync(cts.Token).GetAwaiter().GetResult();
            }

            // 6. Start auto-reconnect monitor
            mxAccessClient.StartMonitorLoop();

            // 7. Create SubscriptionManager
            var channelFullMode = ParseChannelFullMode(_config.Subscription.ChannelFullMode);
            var subscriptionManager = new SubscriptionManager(
                mxAccessClient, _config.Subscription.ChannelCapacity, channelFullMode);
            _subscriptionManager = subscriptionManager;

            // Wire MxAccessClient data change events to SubscriptionManager
            mxAccessClient.OnTagValueChanged = subscriptionManager.OnTagValueChanged;

            // Wire MxAccessClient disconnect/reconnect to SubscriptionManager.
            // The handler captures the local rather than the nullable field so it always
            // talks to the manager it was wired to.
            mxAccessClient.ConnectionStateChanged += (sender, e) =>
            {
                if (e.CurrentState == Domain.ConnectionState.Disconnected ||
                    e.CurrentState == Domain.ConnectionState.Error)
                {
                    subscriptionManager.NotifyDisconnection();
                }
                else if (e.CurrentState == Domain.ConnectionState.Connected &&
                         e.PreviousState == Domain.ConnectionState.Reconnecting)
                {
                    subscriptionManager.NotifyReconnection();
                }
            };

            // 8. Create SessionManager and bridge session scavenging to subscription cleanup
            var sessionManager = new SessionManager(inactivityTimeoutMinutes: SessionInactivityTimeoutMinutes);
            _sessionManager = sessionManager;
            sessionManager.OnSessionScavenged(sessionId =>
            {
                Log.Information("Cleaning up subscriptions for scavenged session {SessionId}", sessionId);
                subscriptionManager.UnsubscribeClient(sessionId);
            });

            // 9. Create performance metrics
            var performanceMetrics = new PerformanceMetrics();
            _performanceMetrics = performanceMetrics;

            // 10. Create health check services
            var healthCheckService = new HealthCheckService(mxAccessClient, subscriptionManager, performanceMetrics);
            _healthCheckService = healthCheckService;

            // 11. Create status report service
            var statusReportService = new StatusReportService(
                mxAccessClient, subscriptionManager, performanceMetrics,
                healthCheckService);
            _statusReportService = statusReportService;

            // 12. Start status web server (a failure here is not fatal)
            var statusWebServer = new StatusWebServer(_config.WebServer, statusReportService);
            _statusWebServer = statusWebServer;
            if (!statusWebServer.Start())
            {
                Log.Warning("Status web server failed to start — continuing without it");
            }

            // 13. Create gRPC service
            var grpcService = new ScadaGrpcService(
                mxAccessClient, sessionManager, subscriptionManager, performanceMetrics, apiKeyService);

            // 14. Create and configure interceptor
            var interceptor = new ApiKeyInterceptor(apiKeyService);

            // 15. Build and start gRPC server
            var grpcServer = new Server
            {
                Services =
                {
                    Scada.ScadaService.BindService(grpcService)
                        .Intercept(interceptor)
                },
                Ports =
                {
                    new ServerPort("0.0.0.0", _config.GrpcPort, credentials)
                }
            };
            _grpcServer = grpcServer;

            grpcServer.Start();
            Log.Information("gRPC server started on port {Port}", _config.GrpcPort);

            Log.Information("LmxProxy service started successfully");
            return true;
        }
        catch (Exception ex)
        {
            Log.Fatal(ex, "LmxProxy service failed to start");

            // Release whatever was created before the failure instead of leaving a
            // half-started service (monitor loop, MxAccess connection, web server) behind.
            ShutdownComponents();
            return false;
        }
    }

    /// <summary>
    /// Topshelf Stop callback. Stops and disposes all components.
    /// </summary>
    /// <returns>Always <c>true</c>; failures of individual shutdown steps are logged.</returns>
    public bool Stop()
    {
        Log.Information("LmxProxy service stopping...");

        ShutdownComponents();

        Log.Information("LmxProxy service stopped");
        return true;
    }

    /// <summary>Topshelf Pause callback -- no-op.</summary>
    public bool Pause()
    {
        Log.Information("LmxProxy service paused (no-op)");
        return true;
    }

    /// <summary>Topshelf Continue callback -- no-op.</summary>
    public bool Continue()
    {
        Log.Information("LmxProxy service continued (no-op)");
        return true;
    }

    /// <summary>
    /// Maps the configured channel-full-mode name onto a <c>BoundedChannelFullMode</c>.
    /// Matching is case-insensitive; "DropNewest" and "Wait" are recognised, anything else
    /// (including null) yields <c>DropOldest</c>, with a warning when the value is not "DropOldest".
    /// </summary>
    private static System.Threading.Channels.BoundedChannelFullMode ParseChannelFullMode(string? configuredMode)
    {
        if (string.Equals(configuredMode, "DropNewest", StringComparison.OrdinalIgnoreCase))
        {
            return System.Threading.Channels.BoundedChannelFullMode.DropNewest;
        }

        if (string.Equals(configuredMode, "Wait", StringComparison.OrdinalIgnoreCase))
        {
            return System.Threading.Channels.BoundedChannelFullMode.Wait;
        }

        if (!string.Equals(configuredMode, "DropOldest", StringComparison.OrdinalIgnoreCase))
        {
            Log.Warning(
                "Unrecognized subscription channel full mode {ChannelFullMode}; using DropOldest",
                configuredMode);
        }

        return System.Threading.Channels.BoundedChannelFullMode.DropOldest;
    }

    /// <summary>
    /// Stops and disposes every component that currently exists. Used by <see cref="Stop"/> and by
    /// the failure path of <see cref="Start"/>. Fields are cleared up front, so calling this a
    /// second time finds nothing left to release.
    /// </summary>
    private void ShutdownComponents()
    {
        var mxAccessClient = _mxAccessClient;
        var statusWebServer = _statusWebServer;
        var performanceMetrics = _performanceMetrics;
        var grpcServer = _grpcServer;
        var subscriptionManager = _subscriptionManager;
        var sessionManager = _sessionManager;
        var apiKeyService = _apiKeyService;

        _mxAccessClient = null;
        _statusWebServer = null;
        _performanceMetrics = null;
        _grpcServer = null;
        _subscriptionManager = null;
        _sessionManager = null;
        _apiKeyService = null;

        // 1. Stop reconnect monitor (5s wait)
        if (mxAccessClient != null)
        {
            RunShutdownStep("reconnect monitor", () => { mxAccessClient.StopMonitorLoop(); });
        }

        // 2. Stop status web server
        if (statusWebServer != null)
        {
            RunShutdownStep("status web server", () => { statusWebServer.Stop(); });
        }

        // 3. Dispose performance metrics
        // NOTE(review): this happens before the gRPC server has drained, as in the original
        // ordering — confirm that in-flight RPCs tolerate disposed metrics.
        if (performanceMetrics != null)
        {
            RunShutdownStep("performance metrics", () => { performanceMetrics.Dispose(); });
        }

        // 4. Graceful gRPC shutdown (bounded wait)
        if (grpcServer != null)
        {
            RunShutdownStep("gRPC server", () =>
            {
                Log.Information("Shutting down gRPC server...");
                if (grpcServer.ShutdownAsync().Wait(GrpcShutdownTimeout))
                {
                    Log.Information("gRPC server stopped");
                }
                else
                {
                    Log.Warning(
                        "gRPC server did not shut down within {Timeout}s; continuing with shutdown",
                        GrpcShutdownTimeout.TotalSeconds);
                }
            });
        }

        // 5. Dispose remaining managers
        if (subscriptionManager != null)
        {
            RunShutdownStep("subscription manager", () => { subscriptionManager.Dispose(); });
        }

        if (sessionManager != null)
        {
            RunShutdownStep("session manager", () => { sessionManager.Dispose(); });
        }

        if (apiKeyService != null)
        {
            RunShutdownStep("API key service", () => { apiKeyService.Dispose(); });
        }

        // 6. Disconnect MxAccess (bounded wait)
        if (mxAccessClient != null)
        {
            RunShutdownStep("MxAccess client", () =>
            {
                Log.Information("Disconnecting from MxAccess...");
                if (mxAccessClient.DisposeAsync().AsTask().Wait(MxAccessDisconnectTimeout))
                {
                    Log.Information("MxAccess disconnected");
                }
                else
                {
                    Log.Warning(
                        "MxAccess did not disconnect within {Timeout}s; continuing with shutdown",
                        MxAccessDisconnectTimeout.TotalSeconds);
                }
            });
        }
    }

    /// <summary>
    /// Runs one shutdown step and logs, rather than propagates, any exception it throws so the
    /// remaining steps still execute.
    /// </summary>
    private static void RunShutdownStep(string stepName, Action step)
    {
        try
        {
            step();
        }
        catch (Exception ex)
        {
            Log.Error(ex, "Error during shutdown ({Step})", stepName);
        }
    }
}
|
|
}
|