using Microsoft.Extensions.Hosting;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;

namespace ZB.MOM.WW.ScadaBridge.SiteEventLogging;

/// <summary>
/// SiteEventLogging-019: predicate that <see cref="EventLogPurgeService"/>
/// consults at the top of every purge tick to decide whether THIS node should
/// run the daily purge. The design states "a daily background job runs on the
/// active node and deletes all events older than 30 days"; the standby's local
/// SQLite receives no writes, so purging there is harmless but unnecessary —
/// and silently doing it anyway diverges from the design.
/// </summary>
/// <remarks>
/// Registration is the Host's responsibility (it knows the cluster topology);
/// when no implementation is registered the purge service defaults to "always
/// active" so non-clustered hosts and unit tests are unaffected — backward
/// compatible with the prior "run on every host" behaviour.
/// </remarks>
/// <returns>
/// <see langword="true"/> when this node should run the purge; otherwise
/// <see langword="false"/>.
/// </returns>
public delegate bool SiteEventLogActiveNodeCheck();

/// <summary>
/// Background service that periodically purges old events from the SQLite event log.
/// Enforces both time-based retention (default 30 days) and storage cap (default 1GB).
/// Runs on a background thread and does not block event recording.
/// </summary>
public class EventLogPurgeService : BackgroundService
{
    /// <summary>Number of events deleted per cap-purge batch.</summary>
    private const int CapPurgeBatchSize = 1000;

    /// <summary>Divisor used to render byte counts as megabytes in log messages.</summary>
    private const double BytesPerMegabyte = 1024.0 * 1024.0;

    private readonly SiteEventLogger _eventLogger;
    private readonly SiteEventLogOptions _options;
    private readonly ILogger<EventLogPurgeService> _logger;
    private readonly SiteEventLogActiveNodeCheck _isActiveNode;

    /// <summary>Initializes a new instance of <see cref="EventLogPurgeService"/>.</summary>
    /// <param name="eventLogger">The concrete event logger providing lock-guarded database access.</param>
    /// <param name="options">Site event log options (retention days, storage cap, purge interval).</param>
    /// <param name="logger">Logger instance.</param>
    /// <param name="isActiveNode">
    /// SiteEventLogging-019: optional active-node check. When null, the
    /// service runs the purge on every tick (preserves the pre-fix behaviour
    /// for non-clustered hosts and existing tests). When supplied — e.g. by
    /// the Host on a site node — each tick early-exits on the standby so the
    /// daily purge runs only on the active node, matching the design.
    /// </param>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="eventLogger"/>, <paramref name="options"/> or
    /// <paramref name="logger"/> is null.
    /// </exception>
    public EventLogPurgeService(
        SiteEventLogger eventLogger,
        IOptions<SiteEventLogOptions> options,
        ILogger<EventLogPurgeService> logger,
        SiteEventLogActiveNodeCheck? isActiveNode = null)
    {
        // Fail fast with a named argument instead of a NullReferenceException
        // surfacing later from options.Value or the first purge tick.
        ArgumentNullException.ThrowIfNull(eventLogger);
        ArgumentNullException.ThrowIfNull(options);
        ArgumentNullException.ThrowIfNull(logger);

        // Depend on the concrete recorder directly: purge must funnel database access
        // through its lock-guarded WithConnection. Taking ISiteEventLogger and
        // downcasting would throw InvalidCastException for any other implementation.
        _eventLogger = eventLogger;
        _options = options.Value;
        _logger = logger;
        _isActiveNode = isActiveNode ?? (static () => true);
    }

    /// <inheritdoc />
    protected override async Task ExecuteAsync(CancellationToken stoppingToken)
    {
        _logger.LogInformation(
            "Event log purge service started — retention: {Days} days, cap: {Cap} MB, interval: {Interval}",
            _options.RetentionDays,
            _options.MaxStorageMb,
            _options.PurgeInterval);

        using var timer = new PeriodicTimer(_options.PurgeInterval);

        // Run an initial purge on startup, then once per timer tick. The stopping
        // token is handed to every cycle so a long cap purge cannot hold up shutdown.
        RunPurge(stoppingToken);

        while (await timer.WaitForNextTickAsync(stoppingToken).ConfigureAwait(false))
        {
            RunPurge(stoppingToken);
        }
    }

    /// <summary>
    /// Runs a single purge cycle: first by retention age, then by storage cap.
    /// Equivalent to <see cref="RunPurge(CancellationToken)"/> with a token that
    /// is never cancelled.
    /// </summary>
    internal void RunPurge() => RunPurge(CancellationToken.None);

    /// <summary>
    /// Runs a single purge cycle: first by retention age, then by storage cap.
    /// Exceptions raised by the purge are logged and swallowed so one failed
    /// cycle does not end the purge loop.
    /// </summary>
    /// <param name="cancellationToken">
    /// Checked before the cycle starts, between the two purge phases and between
    /// cap-purge batches. Cancellation ends the cycle quietly; it never throws.
    /// </param>
    internal void RunPurge(CancellationToken cancellationToken)
    {
        if (cancellationToken.IsCancellationRequested)
        {
            return;
        }

        try
        {
            // SiteEventLogging-019: gate every tick on the active-node check.
            // The standby's local SQLite receives no writes, so purging there
            // is harmless but unnecessary; the design (Component-SiteEventLogging
            // → Storage) explicitly states the purge runs on the active node.
            if (!ShouldPurgeOnThisNode())
            {
                _logger.LogDebug("Skipping event log purge tick — this node is not the active site member");
                return;
            }

            PurgeByRetention();

            if (cancellationToken.IsCancellationRequested)
            {
                return;
            }

            PurgeByStorageCap(cancellationToken);
        }
        catch (Exception ex)
        {
            _logger.LogError(ex, "Error during event log purge");
        }
    }

    /// <summary>
    /// Evaluates the active-node predicate, treating a throwing predicate as "active".
    /// </summary>
    /// <returns>
    /// The predicate's result, or <see langword="true"/> when the predicate throws.
    /// </returns>
    private bool ShouldPurgeOnThisNode()
    {
        // Defensive try/catch: a transient cluster-state read failure must
        // not stop the purge loop — fall back to running the purge (the
        // pre-fix behaviour was "always run", which is harmless on standby).
        try
        {
            return _isActiveNode();
        }
        catch (Exception checkEx)
        {
            _logger.LogDebug(checkEx, "Active-node check threw during purge tick; running purge to be safe");
            return true;
        }
    }

    /// <summary>
    /// Deletes every event whose timestamp is older than the configured retention window.
    /// </summary>
    private void PurgeByRetention()
    {
        // The cutoff is bound as a round-trip ("o") formatted UTC string and
        // compared against the timestamp column by SQLite.
        var cutoff = DateTimeOffset.UtcNow.AddDays(-_options.RetentionDays).ToString("o");

        var deleted = _eventLogger.WithConnection(connection =>
        {
            using var cmd = connection.CreateCommand();
            cmd.CommandText = "DELETE FROM site_events WHERE timestamp < $cutoff";
            cmd.Parameters.AddWithValue("$cutoff", cutoff);
            return cmd.ExecuteNonQuery();
        });

        if (deleted > 0)
        {
            _logger.LogInformation(
                "Purged {Count} events older than {Days} days",
                deleted,
                _options.RetentionDays);
        }
    }

    /// <summary>
    /// Deletes the oldest events in batches of <see cref="CapPurgeBatchSize"/> until
    /// the logical database size is at or below the configured storage cap.
    /// </summary>
    /// <param name="cancellationToken">
    /// Checked before each batch; when cancelled the loop stops after the batches
    /// already committed.
    /// </param>
    private void PurgeByStorageCap(CancellationToken cancellationToken)
    {
        var capBytes = (long)_options.MaxStorageMb * 1024 * 1024;
        var currentSizeBytes = GetDatabaseSizeBytes();

        if (currentSizeBytes <= capBytes)
        {
            return;
        }

        _logger.LogWarning(
            "Event log size {Size:F1} MB exceeds cap {Cap} MB — purging oldest events",
            currentSizeBytes / BytesPerMegabyte,
            _options.MaxStorageMb);

        // Delete the oldest events in batches until the database is under the cap.
        // The loop also stops if the on-disk size fails to decrease across an
        // iteration (e.g. if vacuum cannot reclaim space), so a cap that can never
        // be met does not silently empty the entire table.
        while (currentSizeBytes > capBytes)
        {
            // Each batch is its own WithConnection call, so stopping between
            // batches leaves the work already done in place.
            if (cancellationToken.IsCancellationRequested)
            {
                _logger.LogDebug("Stopping event log cap purge early — shutdown requested");
                break;
            }

            var previousSizeBytes = currentSizeBytes;

            var deleted = _eventLogger.WithConnection(connection =>
            {
                using var cmd = connection.CreateCommand();

                // CapPurgeBatchSize is a compile-time constant, not external input,
                // so interpolating it into the statement is safe.
                cmd.CommandText = $"""
                    DELETE FROM site_events
                    WHERE id IN (
                        SELECT id FROM site_events
                        ORDER BY id ASC
                        LIMIT {CapPurgeBatchSize}
                    )
                    """;
                var rows = cmd.ExecuteNonQuery();

                // Reclaim free pages so page_count/freelist measurement reflects the
                // delete. Effective because auto_vacuum = INCREMENTAL is set at schema
                // creation; harmless otherwise.
                using var vacuumCmd = connection.CreateCommand();
                vacuumCmd.CommandText = "PRAGMA incremental_vacuum";
                vacuumCmd.ExecuteNonQuery();

                return rows;
            });

            // Nothing left to delete: the table is empty and the cap cannot be met.
            if (deleted == 0)
            {
                break;
            }

            currentSizeBytes = GetDatabaseSizeBytes();

            if (currentSizeBytes >= previousSizeBytes)
            {
                // Size is not shrinking despite deletes — stop rather than wipe the
                // whole table. This should not happen now that logical size is
                // measured, but guards against any future regression.
                _logger.LogWarning(
                    "Event log size did not decrease after a cap-purge batch ({Size:F1} MB); " +
                    "stopping to avoid emptying the log",
                    currentSizeBytes / BytesPerMegabyte);
                break;
            }
        }
    }

    /// <summary>
    /// Returns the logical size of the database in bytes — only pages that hold live
    /// data, excluding free pages on the freelist. Measuring logical size (rather than
    /// the raw file size from <c>page_count</c>) means the storage-cap loop observes
    /// space being reclaimed even if free pages have not yet been returned to the OS.
    /// </summary>
    /// <returns>
    /// <c>(page_count - freelist_count) * page_size</c>, with the page difference
    /// clamped at zero.
    /// </returns>
    internal long GetDatabaseSizeBytes()
    {
        return _eventLogger.WithConnection(connection =>
        {
            // All three values are read inside one WithConnection call so they
            // come from the same connection.
            long ReadPragma(string pragmaStatement)
            {
                using var cmd = connection.CreateCommand();
                cmd.CommandText = pragmaStatement;
                return (long)cmd.ExecuteScalar()!;
            }

            var pageCount = ReadPragma("PRAGMA page_count");
            var freeListCount = ReadPragma("PRAGMA freelist_count");
            var pageSize = ReadPragma("PRAGMA page_size");

            var usedPages = Math.Max(0L, pageCount - freeListCount);
            return usedPages * pageSize;
        });
    }
}