refactor: rename ScadaLink → ZB.MOM.WW.ScadaBridge (code + projects + namespaces)
Solution + 23 src projects + 26 test projects renamed; folders, csproj, namespaces, and ScadaLinkDbContext/ScadaBridgeDbContext class updated. ActorSystem "scadalink" → "scadabridge", Akka seed-node URLs migrated. SQL roles/logins, LDAP domains, CLI command name, and CLI config dir (~/.scadalink → ~/.scadabridge) also renamed. Build green; 5 Host.Tests fail awaiting SQL login rename in next commit. Pre-existing StaleTagMonitor timing flakes unchanged. Rename script committed at tools/rename-to-scadabridge.sh.
This commit is contained in:
@@ -0,0 +1,223 @@
|
||||
using Microsoft.Extensions.Hosting;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using Microsoft.Extensions.Options;
|
||||
|
||||
namespace ZB.MOM.WW.ScadaBridge.SiteEventLogging;
|
||||
|
||||
/// <summary>
/// SiteEventLogging-019: predicate that <see cref="EventLogPurgeService"/> evaluates
/// at the start of every purge cycle to decide whether THIS node should purge.
/// </summary>
/// <remarks>
/// <para>
/// The design states "a daily background job runs on the active node and deletes
/// all events older than 30 days". The standby's local SQLite receives no writes,
/// so purging there is harmless but unnecessary — and silently doing it anyway
/// diverges from the design.
/// </para>
/// <para>
/// Registration is the Host's responsibility (it knows the cluster topology). When
/// no implementation is supplied, the purge service substitutes a check that always
/// reports "active", so non-clustered hosts and unit tests are unaffected — backward
/// compatible with the prior "run on every host" behaviour.
/// </para>
/// </remarks>
/// <returns>
/// <c>true</c> when the purge should run on this node; <c>false</c> to skip the cycle.
/// </returns>
public delegate bool SiteEventLogActiveNodeCheck();
|
||||
|
||||
/// <summary>
/// Background service that periodically purges old events from the SQLite event log.
/// Enforces both time-based retention (default 30 days) and storage cap (default 1GB).
/// Runs on a background thread and does not block event recording.
/// </summary>
/// <remarks>
/// Each cycle first deletes events older than the retention window, then — if the
/// database is still over the storage cap — deletes the oldest events in batches of
/// <see cref="CapPurgeBatchSize"/>. All database access goes through
/// <c>SiteEventLogger.WithConnection</c>.
/// </remarks>
public class EventLogPurgeService : BackgroundService
{
    /// <summary>Number of events deleted per cap-purge batch.</summary>
    private const int CapPurgeBatchSize = 1000;

    private readonly SiteEventLogger _eventLogger;
    private readonly SiteEventLogOptions _options;
    private readonly ILogger<EventLogPurgeService> _logger;
    private readonly SiteEventLogActiveNodeCheck _isActiveNode;

    /// <summary>Initializes a new instance of <see cref="EventLogPurgeService"/>.</summary>
    /// <param name="eventLogger">The concrete event logger providing lock-guarded database access.</param>
    /// <param name="options">Site event log options (retention days, storage cap, purge interval).</param>
    /// <param name="logger">Logger instance.</param>
    /// <param name="isActiveNode">
    /// SiteEventLogging-019: optional active-node check. When <c>null</c>, the
    /// service runs the purge on every tick (preserves the pre-fix behaviour
    /// for non-clustered hosts and existing tests). When supplied — e.g. by
    /// the Host on a site node — each tick early-exits on the standby so the
    /// daily purge runs only on the active node, matching the design.
    /// </param>
    public EventLogPurgeService(
        SiteEventLogger eventLogger,
        IOptions<SiteEventLogOptions> options,
        ILogger<EventLogPurgeService> logger,
        SiteEventLogActiveNodeCheck? isActiveNode = null)
    {
        // Depend on the concrete recorder directly: purge must funnel database access
        // through its lock-guarded WithConnection. Taking ISiteEventLogger and
        // downcasting would throw InvalidCastException for any other implementation.
        _eventLogger = eventLogger;
        _options = options.Value;
        _logger = logger;
        _isActiveNode = isActiveNode ?? (static () => true);
    }

    /// <summary>
    /// Runs one purge immediately, then one purge per <c>PurgeInterval</c> tick until
    /// <paramref name="stoppingToken"/> is cancelled.
    /// </summary>
    /// <param name="stoppingToken">Signalled by the host when the service should stop.</param>
    /// <returns>A task that completes when the purge loop ends.</returns>
    protected override async Task ExecuteAsync(CancellationToken stoppingToken)
    {
        // Everything before the first await runs synchronously on the caller. On hosts
        // whose BackgroundService.StartAsync invokes ExecuteAsync inline, that is the
        // host startup path — and the initial purge below may loop over many cap-purge
        // batches. Yield first so the purge never delays startup.
        await Task.Yield();

        _logger.LogInformation(
            "Event log purge service started — retention: {Days} days, cap: {Cap} MB, interval: {Interval}",
            _options.RetentionDays, _options.MaxStorageMb, _options.PurgeInterval);

        using var timer = new PeriodicTimer(_options.PurgeInterval);

        // Run an initial purge on startup
        RunPurge();

        while (await timer.WaitForNextTickAsync(stoppingToken).ConfigureAwait(false))
        {
            RunPurge();
        }
    }

    /// <summary>Runs a single purge cycle: first by retention age, then by storage cap.</summary>
    /// <remarks>
    /// Never throws: any failure is logged as an error so one bad cycle does not end
    /// the purge loop in <see cref="ExecuteAsync"/>.
    /// </remarks>
    internal void RunPurge()
    {
        try
        {
            // SiteEventLogging-019: gate every tick on the active-node check.
            // The standby's local SQLite receives no writes, so purging there
            // is harmless but unnecessary; the design (Component-SiteEventLogging
            // → Storage) explicitly states the purge runs on the active node.
            // Defensive try/catch: a transient cluster-state read failure must
            // not stop the purge loop — fall back to running the purge (the
            // pre-fix behaviour was "always run", which is harmless on standby).
            bool isActive;
            try
            {
                isActive = _isActiveNode();
            }
            catch (Exception checkEx)
            {
                _logger.LogDebug(checkEx,
                    "Active-node check threw during purge tick; running purge to be safe");
                isActive = true;
            }

            if (!isActive)
            {
                _logger.LogDebug("Skipping event log purge tick — this node is not the active site member");
                return;
            }

            PurgeByRetention();
            PurgeByStorageCap();
        }
        catch (Exception ex)
        {
            _logger.LogError(ex, "Error during event log purge");
        }
    }

    /// <summary>
    /// Deletes every event whose <c>timestamp</c> is older than the configured
    /// retention window and logs the number removed, if any.
    /// </summary>
    private void PurgeByRetention()
    {
        // Round-trip ("o") format; the comparison in SQL is textual.
        // NOTE(review): assumes the recorder writes timestamps in this same format — confirm.
        var cutoff = DateTimeOffset.UtcNow.AddDays(-_options.RetentionDays).ToString("o");

        var deleted = _eventLogger.WithConnection(connection =>
        {
            using var cmd = connection.CreateCommand();
            cmd.CommandText = "DELETE FROM site_events WHERE timestamp < $cutoff";
            cmd.Parameters.AddWithValue("$cutoff", cutoff);
            return cmd.ExecuteNonQuery();
        });

        if (deleted > 0)
        {
            _logger.LogInformation("Purged {Count} events older than {Days} days", deleted, _options.RetentionDays);
        }
    }

    /// <summary>
    /// If the database's logical size exceeds the configured cap, deletes the oldest
    /// events (lowest <c>id</c> first) in batches until the size is under the cap,
    /// no rows remain to delete, or a batch fails to reduce the size.
    /// </summary>
    private void PurgeByStorageCap()
    {
        var capBytes = (long)_options.MaxStorageMb * 1024 * 1024;
        var currentSizeBytes = GetDatabaseSizeBytes();

        if (currentSizeBytes <= capBytes)
        {
            return;
        }

        _logger.LogWarning(
            "Event log size {Size:F1} MB exceeds cap {Cap} MB — purging oldest events",
            ToMegabytes(currentSizeBytes), _options.MaxStorageMb);

        // Running total across batches so the outcome of the cap purge can be reported.
        long totalDeleted = 0;

        // Delete the oldest events in batches until the database is under the cap.
        // The loop also stops if the on-disk size fails to decrease across an
        // iteration (e.g. if vacuum cannot reclaim space), so a cap that can never
        // be met does not silently empty the entire table.
        while (currentSizeBytes > capBytes)
        {
            var previousSizeBytes = currentSizeBytes;

            var deleted = _eventLogger.WithConnection(connection =>
            {
                using var cmd = connection.CreateCommand();
                cmd.CommandText = $"""
                    DELETE FROM site_events WHERE id IN (
                        SELECT id FROM site_events ORDER BY id ASC LIMIT {CapPurgeBatchSize}
                    )
                    """;
                var rows = cmd.ExecuteNonQuery();

                // Reclaim free pages so page_count/freelist measurement reflects the
                // delete. Effective because auto_vacuum = INCREMENTAL is set at schema
                // creation; harmless otherwise.
                using var vacuumCmd = connection.CreateCommand();
                vacuumCmd.CommandText = "PRAGMA incremental_vacuum";
                vacuumCmd.ExecuteNonQuery();

                return rows;
            });

            if (deleted == 0)
            {
                // Table is empty — nothing further can be reclaimed.
                break;
            }

            totalDeleted += deleted;
            currentSizeBytes = GetDatabaseSizeBytes();

            if (currentSizeBytes >= previousSizeBytes)
            {
                // Size is not shrinking despite deletes — stop rather than wipe the
                // whole table. This should not happen now that logical size is
                // measured, but guards against any future regression.
                _logger.LogWarning(
                    "Event log size did not decrease after a cap-purge batch ({Size:F1} MB); " +
                    "stopping to avoid emptying the log",
                    ToMegabytes(currentSizeBytes));
                break;
            }
        }

        // Mirror the retention purge, which reports its count: record how many events
        // the cap purge dropped and where the size ended up.
        if (totalDeleted > 0)
        {
            _logger.LogInformation(
                "Cap purge removed {Count} oldest events; event log size is now {Size:F1} MB",
                totalDeleted, ToMegabytes(currentSizeBytes));
        }
    }

    /// <summary>
    /// Returns the logical size of the database in bytes — only pages that hold live
    /// data, excluding free pages on the freelist. Measuring logical size (rather than
    /// the raw file size from <c>page_count</c>) means the storage-cap loop observes
    /// space being reclaimed even if free pages have not yet been returned to the OS.
    /// </summary>
    /// <returns><c>(page_count - freelist_count) * page_size</c>, never negative.</returns>
    internal long GetDatabaseSizeBytes()
    {
        return _eventLogger.WithConnection(connection =>
        {
            using var pageCountCmd = connection.CreateCommand();
            pageCountCmd.CommandText = "PRAGMA page_count";
            var pageCount = (long)pageCountCmd.ExecuteScalar()!;

            using var freeListCmd = connection.CreateCommand();
            freeListCmd.CommandText = "PRAGMA freelist_count";
            var freeListCount = (long)freeListCmd.ExecuteScalar()!;

            using var pageSizeCmd = connection.CreateCommand();
            pageSizeCmd.CommandText = "PRAGMA page_size";
            var pageSize = (long)pageSizeCmd.ExecuteScalar()!;

            // Clamp so an inconsistent reading can never yield a negative size.
            var usedPages = Math.Max(0, pageCount - freeListCount);
            return usedPages * pageSize;
        });
    }

    /// <summary>Converts a byte count to megabytes (1 MB = 1024 * 1024 bytes) for log output.</summary>
    private static double ToMegabytes(long bytes) => bytes / (1024.0 * 1024.0);
}
|
||||
Reference in New Issue
Block a user