refactor: rename ScadaLink → ZB.MOM.WW.ScadaBridge (code + projects + namespaces)
Solution + 23 src projects + 26 test projects renamed; folders, csproj, namespaces, and ScadaLinkDbContext/ScadaBridgeDbContext class updated. ActorSystem "scadalink" → "scadabridge", Akka seed-node URLs migrated. SQL roles/logins, LDAP domains, CLI command name, and CLI config dir (~/.scadalink → ~/.scadabridge) also renamed. Build green; 5 Host.Tests fail awaiting SQL login rename in next commit. Pre-existing StaleTagMonitor timing flakes unchanged. Rename script committed at tools/rename-to-scadabridge.sh.
This commit is contained in:
@@ -0,0 +1,29 @@
|
||||
using Akka.Actor;
|
||||
using Akka.Event;
|
||||
using ZB.MOM.WW.ScadaBridge.Commons.Messages.RemoteQuery;
|
||||
|
||||
namespace ZB.MOM.WW.ScadaBridge.SiteEventLogging;
|
||||
|
||||
/// <summary>
/// Actor-side adapter around <see cref="IEventLogQueryService"/>: every
/// <see cref="EventLogQueryRequest"/> it receives is executed through the query
/// service and the resulting <see cref="EventLogQueryResponse"/> is sent back to
/// the sender of the request.
/// </summary>
public class EventLogHandlerActor : ReceiveActor
{
    private readonly ILoggingAdapter _log = Context.GetLogger();
    private readonly IEventLogQueryService _queryService;

    /// <summary>Creates the actor and wires up the <see cref="EventLogQueryRequest"/> handler.</summary>
    /// <param name="queryService">Service that runs the event log query for each request.</param>
    public EventLogHandlerActor(IEventLogQueryService queryService)
    {
        _queryService = queryService;

        Receive<EventLogQueryRequest>(request => HandleQuery(request));
    }

    /// <summary>Runs the query synchronously and replies to the requester with the outcome.</summary>
    private void HandleQuery(EventLogQueryRequest request)
    {
        var reply = _queryService.ExecuteQuery(request);
        Sender.Tell(reply);
    }
}
|
||||
@@ -0,0 +1,223 @@
|
||||
using Microsoft.Extensions.Hosting;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using Microsoft.Extensions.Options;
|
||||
|
||||
namespace ZB.MOM.WW.ScadaBridge.SiteEventLogging;
|
||||
|
||||
/// <summary>
|
||||
/// SiteEventLogging-019: predicate the <see cref="EventLogPurgeService"/>
|
||||
/// consults at the top of every purge tick to decide whether THIS node should
|
||||
/// run the daily purge. The design states "a daily background job runs on the
|
||||
/// active node and deletes all events older than 30 days"; the standby's local
|
||||
/// SQLite receives no writes, so purging there is harmless but unnecessary —
|
||||
/// and silently doing it anyway diverges from the design.
|
||||
///
|
||||
/// Registration is the Host's responsibility (it knows the cluster topology);
|
||||
/// when no implementation is registered the purge service defaults to "always
|
||||
/// active" so non-clustered hosts and unit tests are unaffected — backward
|
||||
/// compatible with the prior "run on every host" behaviour.
|
||||
/// </summary>
|
||||
/// <returns><see langword="true"/> when this node should run the purge; <see langword="false"/> to skip the current purge tick.</returns>
public delegate bool SiteEventLogActiveNodeCheck();
|
||||
|
||||
/// <summary>
/// Hosted background service that keeps the SQLite site event log within its limits.
/// Each purge cycle applies two rules in order: time-based retention
/// (<see cref="SiteEventLogOptions.RetentionDays"/>, default 30 days) and a storage cap
/// (<see cref="SiteEventLogOptions.MaxStorageMb"/>, default 1 GB). One cycle runs at
/// start-up and then one per <see cref="SiteEventLogOptions.PurgeInterval"/>.
/// </summary>
/// <remarks>
/// All database access goes through the <c>WithConnection</c> helper of
/// <see cref="SiteEventLogger"/>, which serialises callers on the logger's lock.
/// </remarks>
public class EventLogPurgeService : BackgroundService
{
    /// <summary>Number of events deleted per cap-purge batch.</summary>
    private const int CapPurgeBatchSize = 1000;

    private readonly SiteEventLogger _eventLogger;
    private readonly SiteEventLogOptions _options;
    private readonly ILogger<EventLogPurgeService> _logger;
    private readonly SiteEventLogActiveNodeCheck _isActiveNode;

    /// <summary>Initializes a new instance of <see cref="EventLogPurgeService"/>.</summary>
    /// <param name="eventLogger">The concrete event logger providing lock-guarded database access.</param>
    /// <param name="options">Site event log options (retention days, storage cap, purge interval).</param>
    /// <param name="logger">Logger instance.</param>
    /// <param name="isActiveNode">
    /// SiteEventLogging-019: optional active-node check. When <c>null</c>, the
    /// service runs the purge on every tick (preserves the pre-fix behaviour
    /// for non-clustered hosts and existing tests). When supplied — e.g. by
    /// the Host on a site node — each tick early-exits on the standby so the
    /// daily purge runs only on the active node, matching the design.
    /// </param>
    public EventLogPurgeService(
        SiteEventLogger eventLogger,
        IOptions<SiteEventLogOptions> options,
        ILogger<EventLogPurgeService> logger,
        SiteEventLogActiveNodeCheck? isActiveNode = null)
    {
        // Depend on the concrete recorder directly: purge must funnel database access
        // through its lock-guarded WithConnection. Taking ISiteEventLogger and
        // downcasting would throw InvalidCastException for any other implementation.
        _eventLogger = eventLogger;
        _options = options.Value;
        _logger = logger;
        _isActiveNode = isActiveNode ?? (static () => true);
    }

    /// <summary>
    /// Runs one purge immediately and then one per <see cref="SiteEventLogOptions.PurgeInterval"/>
    /// until <paramref name="stoppingToken"/> is cancelled.
    /// </summary>
    /// <param name="stoppingToken">Signalled by the host when the service should stop.</param>
    protected override async Task ExecuteAsync(CancellationToken stoppingToken)
    {
        _logger.LogInformation(
            "Event log purge service started — retention: {Days} days, cap: {Cap} MB, interval: {Interval}",
            _options.RetentionDays, _options.MaxStorageMb, _options.PurgeInterval);

        // Created before yielding so an invalid PurgeInterval still surfaces at the
        // same point it did before (while the host is starting the service).
        using var timer = new PeriodicTimer(_options.PurgeInterval);

        // Everything before the first await of ExecuteAsync may run inline on the
        // caller of StartAsync. RunPurge is synchronous (SQLite deletes plus waiting
        // on the logger's lock), so yield first: the host can finish starting up and
        // the initial purge runs on a background thread instead.
        await Task.Yield();

        // Run an initial purge on startup
        RunPurge();

        while (await timer.WaitForNextTickAsync(stoppingToken).ConfigureAwait(false))
        {
            RunPurge();
        }
    }

    /// <summary>
    /// Runs a single purge cycle: first by retention age, then by storage cap.
    /// Never throws — any failure is logged and the cycle is abandoned so the
    /// timer loop keeps running.
    /// </summary>
    internal void RunPurge()
    {
        try
        {
            // SiteEventLogging-019: gate every tick on the active-node check.
            // The standby's local SQLite receives no writes, so purging there
            // is harmless but unnecessary; the design (Component-SiteEventLogging
            // → Storage) explicitly states the purge runs on the active node.
            // Defensive try/catch: a transient cluster-state read failure must
            // not stop the purge loop — fall back to running the purge (the
            // pre-fix behaviour was "always run", which is harmless on standby).
            bool isActive;
            try
            {
                isActive = _isActiveNode();
            }
            catch (Exception checkEx)
            {
                _logger.LogDebug(checkEx,
                    "Active-node check threw during purge tick; running purge to be safe");
                isActive = true;
            }

            if (!isActive)
            {
                _logger.LogDebug("Skipping event log purge tick — this node is not the active site member");
                return;
            }

            PurgeByRetention();
            PurgeByStorageCap();
        }
        catch (Exception ex)
        {
            _logger.LogError(ex, "Error during event log purge");
        }
    }

    /// <summary>
    /// Deletes every event whose timestamp is older than the retention window.
    /// The cutoff is rendered as a UTC round-trip ("o") string and compared as text
    /// against the stored <c>timestamp</c> column.
    /// </summary>
    private void PurgeByRetention()
    {
        var cutoff = DateTimeOffset.UtcNow.AddDays(-_options.RetentionDays).ToString("o");

        var deleted = _eventLogger.WithConnection(connection =>
        {
            using var cmd = connection.CreateCommand();
            cmd.CommandText = "DELETE FROM site_events WHERE timestamp < $cutoff";
            cmd.Parameters.AddWithValue("$cutoff", cutoff);
            return cmd.ExecuteNonQuery();
        });

        if (deleted > 0)
        {
            _logger.LogInformation("Purged {Count} events older than {Days} days", deleted, _options.RetentionDays);
        }
    }

    /// <summary>
    /// If the database is larger than the configured cap, deletes the oldest events
    /// (lowest ids) in batches of <see cref="CapPurgeBatchSize"/> until the size is at
    /// or below the cap, no rows are left to delete, or the size stops shrinking.
    /// </summary>
    private void PurgeByStorageCap()
    {
        var capBytes = (long)_options.MaxStorageMb * 1024 * 1024;
        var currentSizeBytes = GetDatabaseSizeBytes();

        if (currentSizeBytes <= capBytes)
            return;

        _logger.LogWarning(
            "Event log size {Size:F1} MB exceeds cap {Cap} MB — purging oldest events",
            currentSizeBytes / (1024.0 * 1024.0), _options.MaxStorageMb);

        // Delete the oldest events in batches until the database is under the cap.
        // The loop also stops if the on-disk size fails to decrease across an
        // iteration (e.g. if vacuum cannot reclaim space), so a cap that can never
        // be met does not silently empty the entire table.
        while (currentSizeBytes > capBytes)
        {
            var previousSizeBytes = currentSizeBytes;

            var deleted = _eventLogger.WithConnection(connection =>
            {
                using var cmd = connection.CreateCommand();
                cmd.CommandText = $"""
                    DELETE FROM site_events WHERE id IN (
                        SELECT id FROM site_events ORDER BY id ASC LIMIT {CapPurgeBatchSize}
                    )
                    """;
                var rows = cmd.ExecuteNonQuery();

                // Reclaim free pages so page_count/freelist measurement reflects the
                // delete. Effective because auto_vacuum = INCREMENTAL is set at schema
                // creation; harmless otherwise.
                using var vacuumCmd = connection.CreateCommand();
                vacuumCmd.CommandText = "PRAGMA incremental_vacuum";
                vacuumCmd.ExecuteNonQuery();

                return rows;
            });

            // Nothing left to delete: the table is empty, so stop.
            if (deleted == 0)
                break;

            currentSizeBytes = GetDatabaseSizeBytes();

            if (currentSizeBytes >= previousSizeBytes)
            {
                // Size is not shrinking despite deletes — stop rather than wipe the
                // whole table. This should not happen now that logical size is
                // measured, but guards against any future regression.
                _logger.LogWarning(
                    "Event log size did not decrease after a cap-purge batch ({Size:F1} MB); " +
                    "stopping to avoid emptying the log",
                    currentSizeBytes / (1024.0 * 1024.0));
                break;
            }
        }
    }

    /// <summary>
    /// Returns the logical size of the database in bytes — only pages that hold live
    /// data, excluding free pages on the freelist. Measuring logical size (rather than
    /// the raw file size from <c>page_count</c>) means the storage-cap loop observes
    /// space being reclaimed even if free pages have not yet been returned to the OS.
    /// </summary>
    /// <returns><c>(page_count - freelist_count) * page_size</c>, never negative.</returns>
    internal long GetDatabaseSizeBytes()
    {
        return _eventLogger.WithConnection(connection =>
        {
            using var pageCountCmd = connection.CreateCommand();
            pageCountCmd.CommandText = "PRAGMA page_count";
            var pageCount = (long)pageCountCmd.ExecuteScalar()!;

            using var freeListCmd = connection.CreateCommand();
            freeListCmd.CommandText = "PRAGMA freelist_count";
            var freeListCount = (long)freeListCmd.ExecuteScalar()!;

            using var pageSizeCmd = connection.CreateCommand();
            pageSizeCmd.CommandText = "PRAGMA page_size";
            var pageSize = (long)pageSizeCmd.ExecuteScalar()!;

            var usedPages = Math.Max(0, pageCount - freeListCount);
            return usedPages * pageSize;
        });
    }
}
|
||||
@@ -0,0 +1,198 @@
|
||||
using System.Globalization;
|
||||
using Microsoft.Data.Sqlite;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using Microsoft.Extensions.Options;
|
||||
using ZB.MOM.WW.ScadaBridge.Commons.Messages.RemoteQuery;
|
||||
|
||||
namespace ZB.MOM.WW.ScadaBridge.SiteEventLogging;
|
||||
|
||||
/// <summary>
/// Processes event log queries locally against SQLite.
/// Supports filtering by event_type, time range, instance_id, severity,
/// and keyword search (LIKE on message and source).
/// Uses keyset pagination with continuation token (last event ID).
/// </summary>
public class EventLogQueryService : IEventLogQueryService
{
    private readonly SiteEventLogger _eventLogger;
    private readonly SiteEventLogOptions _options;
    private readonly ILogger<EventLogQueryService> _logger;

    /// <summary>Initializes a new instance of <see cref="EventLogQueryService"/>.</summary>
    /// <param name="eventLogger">The concrete event logger providing lock-guarded database access.</param>
    /// <param name="options">Site event log options (page size and other query settings).</param>
    /// <param name="logger">Logger instance.</param>
    public EventLogQueryService(
        SiteEventLogger eventLogger,
        IOptions<SiteEventLogOptions> options,
        ILogger<EventLogQueryService> logger)
    {
        // Depend on the concrete recorder directly: queries must funnel database
        // access through its lock-guarded WithConnection. Taking ISiteEventLogger and
        // downcasting would throw InvalidCastException for any other implementation.
        _eventLogger = eventLogger;
        _options = options.Value;
        _logger = logger;
    }

    /// <summary>
    /// Escapes the SQL <c>LIKE</c> metacharacters (<c>\</c>, <c>%</c>, <c>_</c>) in a
    /// user-supplied keyword so it is matched as a literal substring. Used together
    /// with a <c>LIKE ... ESCAPE '\'</c> clause.
    /// </summary>
    private static string EscapeLikePattern(string input)
    {
        // The escape character itself must be doubled first, otherwise the
        // backslashes introduced for % and _ would be escaped again.
        return input
            .Replace("\\", "\\\\")
            .Replace("%", "\\%")
            .Replace("_", "\\_");
    }

    /// <summary>
    /// Renders a time-range bound as a UTC round-trip ("o") string. The
    /// <c>timestamp</c> column is filtered by text comparison and the rows are
    /// expected to hold UTC values with a <c>+00:00</c> offset, so a bound must be
    /// converted to UTC before it is formatted; a bound carrying another offset
    /// would otherwise be compared against the wrong instant.
    /// </summary>
    private static string ToUtcTimestampText(DateTimeOffset value)
    {
        return value.ToUniversalTime().ToString("o", CultureInfo.InvariantCulture);
    }

    /// <summary>
    /// Runs <paramref name="request"/> against the local event log and returns one
    /// page of matching entries ordered by ascending id.
    /// </summary>
    /// <param name="request">Filters, page size and optional continuation token.</param>
    /// <returns>
    /// A response with <c>Success = true</c> and the page of entries, or — if anything
    /// throws — a response with <c>Success = false</c>, no entries, and the exception
    /// message. This method does not propagate exceptions.
    /// </returns>
    public EventLogQueryResponse ExecuteQuery(EventLogQueryRequest request)
    {
        try
        {
            // SiteEventLogging-017: clamp caller-supplied PageSize to a hard upper
            // bound so a central client sending int.MaxValue can't force the query
            // to materialise the entire log into a single list while holding the
            // shared write lock. Silent clamp — misconfigured clients still get a
            // usable response.
            var requestedSize = request.PageSize > 0 ? request.PageSize : _options.QueryPageSize;

            // Also enforce a floor of 1: a zero or negative page size (only possible
            // through misconfigured options) would either return empty pages that
            // still claim HasMore, or produce a negative LIMIT, which SQLite treats
            // as "no limit".
            var pageSize = Math.Max(1, Math.Min(requestedSize, _options.MaxQueryPageSize));

            // Computed as long so pageSize + 1 cannot wrap to a negative LIMIT when
            // the configured maximum is int.MaxValue.
            long limit = (long)pageSize + 1;

            var whereClauses = new List<string>();
            var parameters = new List<SqliteParameter>();

            // Keyset pagination: only return events with id > continuation token
            if (request.ContinuationToken.HasValue)
            {
                whereClauses.Add("id > $afterId");
                parameters.Add(new SqliteParameter("$afterId", request.ContinuationToken.Value));
            }

            if (request.From.HasValue)
            {
                whereClauses.Add("timestamp >= $from");
                parameters.Add(new SqliteParameter("$from", ToUtcTimestampText(request.From.Value)));
            }

            if (request.To.HasValue)
            {
                whereClauses.Add("timestamp <= $to");
                parameters.Add(new SqliteParameter("$to", ToUtcTimestampText(request.To.Value)));
            }

            if (!string.IsNullOrWhiteSpace(request.EventType))
            {
                whereClauses.Add("event_type = $eventType");
                parameters.Add(new SqliteParameter("$eventType", request.EventType));
            }

            if (!string.IsNullOrWhiteSpace(request.Severity))
            {
                whereClauses.Add("severity = $severity");
                parameters.Add(new SqliteParameter("$severity", request.Severity));
            }

            if (!string.IsNullOrWhiteSpace(request.InstanceId))
            {
                whereClauses.Add("instance_id = $instanceId");
                parameters.Add(new SqliteParameter("$instanceId", request.InstanceId));
            }

            if (!string.IsNullOrWhiteSpace(request.KeywordFilter))
            {
                // Keyword search is a literal substring match. The LIKE
                // metacharacters % and _ (and the escape char itself) must be
                // escaped so identifiers such as "store_and_forward" or a literal
                // "%" are not misinterpreted as wildcards (SiteEventLogging-013).
                var escaped = EscapeLikePattern(request.KeywordFilter);
                whereClauses.Add(
                    "(message LIKE $keyword ESCAPE '\\' OR source LIKE $keyword ESCAPE '\\')");
                parameters.Add(new SqliteParameter("$keyword", $"%{escaped}%"));
            }

            // Only fixed clause text is concatenated here; every caller-supplied
            // value travels as a bound parameter.
            var whereClause = whereClauses.Count > 0
                ? "WHERE " + string.Join(" AND ", whereClauses)
                : "";

            // Run the read against the shared connection under the logger's write
            // lock — the connection is not thread-safe and is also used by the
            // recorder and the purge service on other threads.
            var entries = _eventLogger.WithConnection(connection =>
            {
                using var cmd = connection.CreateCommand();

                // Fetch pageSize + 1 to determine if there are more results
                cmd.CommandText = $"""
                    SELECT id, timestamp, event_type, severity, instance_id, source, message, details
                    FROM site_events
                    {whereClause}
                    ORDER BY id ASC
                    LIMIT $limit
                    """;
                cmd.Parameters.AddWithValue("$limit", limit);
                foreach (var p in parameters)
                    cmd.Parameters.Add(p);

                var rows = new List<EventLogEntry>();
                using var reader = cmd.ExecuteReader();
                while (reader.Read())
                {
                    rows.Add(new EventLogEntry(
                        Id: reader.GetInt64(0),
                        // Parse with explicit invariant culture and round-trip style
                        // (SiteEventLogging-021). Stored values are ISO 8601 "o" UTC
                        // (see SiteEventLogger.LogEventAsync), and the recorder's
                        // emitted offset is always +00:00; AssumeUniversal +
                        // AdjustToUniversal guarantees the parsed value is UTC and
                        // does not depend on the host's CurrentCulture.
                        Timestamp: DateTimeOffset.Parse(
                            reader.GetString(1),
                            CultureInfo.InvariantCulture,
                            DateTimeStyles.AssumeUniversal | DateTimeStyles.AdjustToUniversal),
                        EventType: reader.GetString(2),
                        Severity: reader.GetString(3),
                        InstanceId: reader.IsDBNull(4) ? null : reader.GetString(4),
                        Source: reader.GetString(5),
                        Message: reader.GetString(6),
                        Details: reader.IsDBNull(7) ? null : reader.GetString(7)));
                }

                return rows;
            });

            // The extra row only signals that another page exists; drop it.
            var hasMore = entries.Count > pageSize;
            if (hasMore)
            {
                entries.RemoveAt(entries.Count - 1);
            }

            // The token is the id of the last row returned, or null for an empty page.
            var continuationToken = entries.Count > 0 ? entries[^1].Id : (long?)null;

            return new EventLogQueryResponse(
                CorrelationId: request.CorrelationId,
                SiteId: request.SiteId,
                Entries: entries,
                ContinuationToken: continuationToken,
                HasMore: hasMore,
                Success: true,
                ErrorMessage: null,
                Timestamp: DateTimeOffset.UtcNow);
        }
        catch (Exception ex)
        {
            _logger.LogError(ex, "Failed to execute event log query: {CorrelationId}", request.CorrelationId);
            return new EventLogQueryResponse(
                CorrelationId: request.CorrelationId,
                SiteId: request.SiteId,
                Entries: [],
                ContinuationToken: null,
                HasMore: false,
                Success: false,
                ErrorMessage: ex.Message,
                Timestamp: DateTimeOffset.UtcNow);
        }
    }
}
|
||||
@@ -0,0 +1,16 @@
|
||||
using ZB.MOM.WW.ScadaBridge.Commons.Messages.RemoteQuery;
|
||||
|
||||
namespace ZB.MOM.WW.ScadaBridge.SiteEventLogging;
|
||||
|
||||
/// <summary>
/// Query contract for the site's local event log. The Communication Layer uses it
/// to answer remote queries that arrive from central.
/// </summary>
public interface IEventLogQueryService
{
    /// <summary>
    /// Runs a query against the local site event log.
    /// </summary>
    /// <param name="request">Query parameters: filters, pagination settings, and keyword search.</param>
    /// <returns>The response carrying the matching entries.</returns>
    EventLogQueryResponse ExecuteQuery(EventLogQueryRequest request);
}
|
||||
@@ -0,0 +1,40 @@
|
||||
namespace ZB.MOM.WW.ScadaBridge.SiteEventLogging;
|
||||
|
||||
/// <summary>
/// Contract for writing operational events into the site's local SQLite event log.
/// </summary>
public interface ISiteEventLogger
{
    /// <summary>
    /// Records one event asynchronously. The event is handed to a background writer,
    /// so the caller is not blocked on disk I/O. The returned <see cref="Task"/>
    /// completes when the event has been durably persisted and faults when the write
    /// fails; callers that <c>await</c> it therefore observe the outcome.
    /// </summary>
    /// <param name="eventType">Event category: script, alarm, deployment, connection, store_and_forward, instance_lifecycle.</param>
    /// <param name="severity">One of Info, Warning, or Error.</param>
    /// <param name="instanceId">Instance ID the event belongs to, if any.</param>
    /// <param name="source">Identifier of the emitting component, e.g. "ScriptActor:MonitorSpeed".</param>
    /// <param name="message">Human-readable description of the event.</param>
    /// <param name="details">
    /// Optional free-form detail text (stack traces, compilation errors, etc.).
    /// Stored verbatim — JSON is conventional but not validated or enforced.
    /// </param>
    /// <returns>A task that tracks persistence of the event.</returns>
    Task LogEventAsync(
        string eventType,
        string severity,
        string? instanceId,
        string source,
        string message,
        string? details = null);

    /// <summary>
    /// SiteEventLogging-018: running total of event writes that have failed since
    /// this logger was created (SQLite error, disk full, bounded-queue overflow drop,
    /// etc.). Exposed on the interface so a future Health Monitoring consumer can
    /// read it without downcasting to a concrete type. Health Monitoring does not
    /// poll it yet; that wiring is tracked separately.
    /// </summary>
    long FailedWriteCount { get; }
}
|
||||
@@ -0,0 +1,45 @@
|
||||
using Microsoft.Extensions.DependencyInjection;
|
||||
using Microsoft.Extensions.Hosting;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using Microsoft.Extensions.Options;
|
||||
|
||||
namespace ZB.MOM.WW.ScadaBridge.SiteEventLogging;
|
||||
|
||||
/// <summary>Dependency-injection registration helpers for the site event logging component.</summary>
public static class ServiceCollectionExtensions
{
    /// <summary>
    /// Adds the site event logging services — event recording, periodic purge, and
    /// querying — to the container.
    /// </summary>
    /// <param name="services">The service collection to add the registrations to.</param>
    /// <returns>The same <paramref name="services"/> instance, to allow chaining.</returns>
    public static IServiceCollection AddSiteEventLogging(this IServiceCollection services)
    {
        // One concrete SiteEventLogger singleton backs everything. ISiteEventLogger
        // forwards to that same instance, while the query and purge services take the
        // concrete type because they need its lock-guarded WithConnection; downcasting
        // an ISiteEventLogger would throw InvalidCastException for any other
        // implementation of the interface.
        services.AddSingleton<SiteEventLogger>();
        services.AddSingleton<ISiteEventLogger>(provider => provider.GetRequiredService<SiteEventLogger>());
        services.AddSingleton<IEventLogQueryService, EventLogQueryService>();

        // SiteEventLogging-019: the purge service is registered on every host node
        // and consults an optional SiteEventLogActiveNodeCheck on each tick, exiting
        // early on the standby. The Host registers the real check on site nodes;
        // tests and non-clustered hosts register none, in which case the purge
        // defaults to "always run" (the pre-fix behaviour). A factory is used so the
        // optional delegate is resolved from DI instead of falling back to the
        // constructor default.
        services.AddHostedService(provider =>
        {
            var recorder = provider.GetRequiredService<SiteEventLogger>();
            var purgeOptions = provider.GetRequiredService<IOptions<SiteEventLogOptions>>();
            var purgeLogger = provider.GetRequiredService<ILogger<EventLogPurgeService>>();
            var activeNodeCheck = provider.GetService<SiteEventLogActiveNodeCheck>();

            return new EventLogPurgeService(recorder, purgeOptions, purgeLogger, activeNodeCheck);
        });

        return services;
    }

    // NOTE: there is intentionally no DI helper for EventLogHandlerActor here. It is
    // wired up directly in ZB.MOM.WW.ScadaBridge.Host/Actors/AkkaHostedService.cs as a
    // cluster singleton, because the actor must be created inside the ActorSystem with
    // the resolved IEventLogQueryService. A former AddSiteEventLoggingActors
    // placeholder was dead code and has been removed.
}
|
||||
@@ -0,0 +1,30 @@
|
||||
namespace ZB.MOM.WW.ScadaBridge.SiteEventLogging;
|
||||
|
||||
/// <summary>Configuration settings for site event logging: retention, storage, querying, and write queueing.</summary>
public class SiteEventLogOptions
{
    private const int DefaultRetentionDays = 30;
    private const int DefaultMaxStorageMb = 1024;
    private const string DefaultDatabasePath = "site_events.db";
    private const int DefaultPageSize = 500;
    private const int DefaultWriteQueueCapacity = 10_000;

    /// <summary>How many days an event log entry is kept before it is purged. Defaults to 30.</summary>
    public int RetentionDays { get; set; } = DefaultRetentionDays;

    /// <summary>Size limit of the SQLite database in megabytes; old entries are purged beyond it. Defaults to 1024 MB.</summary>
    public int MaxStorageMb { get; set; } = DefaultMaxStorageMb;

    /// <summary>Path of the SQLite database file that stores the site event log.</summary>
    public string DatabasePath { get; set; } = DefaultDatabasePath;

    /// <summary>Upper limit on rows returned by one paginated query. Defaults to 500.</summary>
    public int QueryPageSize { get; set; } = DefaultPageSize;

    /// <summary>
    /// SiteEventLogging-017: hard ceiling applied to a caller-supplied <c>PageSize</c>.
    /// Without it, a misbehaving or hostile central client asking for
    /// <c>int.MaxValue</c> would make the query materialise the whole log into one
    /// list while holding the shared write lock. The clamp is silent; the default of
    /// 500 matches <see cref="QueryPageSize"/>.
    /// </summary>
    public int MaxQueryPageSize { get; set; } = DefaultPageSize;

    /// <summary>Time between purge runs. Defaults to 24 hours.</summary>
    public TimeSpan PurgeInterval { get; set; } = TimeSpan.FromDays(1);

    /// <summary>
    /// SiteEventLogging-015: capacity of the background write queue. Defaults to
    /// 10 000 events. Overflow uses <c>BoundedChannelFullMode.DropOldest</c>, so
    /// callers never block; the dropped event's <c>Task</c> is faulted and
    /// <c>FailedWriteCount</c> is incremented, which keeps the drop observable.
    /// </summary>
    public int WriteQueueCapacity { get; set; } = DefaultWriteQueueCapacity;
}
|
||||
@@ -0,0 +1,334 @@
|
||||
using System.Threading.Channels;
|
||||
using Microsoft.Data.Sqlite;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using Microsoft.Extensions.Options;
|
||||
|
||||
namespace ZB.MOM.WW.ScadaBridge.SiteEventLogging;
|
||||
|
||||
/// <summary>
|
||||
/// Records operational events to a local SQLite database.
|
||||
/// Only the active node generates events. Not replicated to standby.
|
||||
/// On failover, the new active node starts a fresh log.
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// <para>
|
||||
/// A single <see cref="SqliteConnection"/> is owned here and is NOT thread-safe.
|
||||
/// All access — recording, querying, purging — must be funnelled through
|
||||
/// <see cref="WithConnection"/>, which serialises callers on a shared lock.
|
||||
/// </para>
|
||||
/// <para>
|
||||
/// Event recording is offloaded to a dedicated background writer thread (fed by a
|
||||
/// <em>bounded</em> <see cref="Channel{T}"/>; capacity <see cref="SiteEventLogOptions.WriteQueueCapacity"/>,
|
||||
/// default 10 000, overflow <see cref="BoundedChannelFullMode.DropOldest"/>).
|
||||
/// <see cref="LogEventAsync"/> only validates its arguments and enqueues, so callers —
|
||||
/// typically Akka actor threads on hot paths — never block on disk I/O or on
|
||||
/// contention for the write lock. The returned <see cref="Task"/> completes once the
|
||||
/// event is durably persisted and faults if the write fails. SiteEventLogging-015:
|
||||
/// when a queued event is evicted to make room for a newer one, that event's Task
|
||||
/// is faulted with <see cref="InvalidOperationException"/> and
|
||||
/// <see cref="FailedWriteCount"/> is incremented so the drop is observable.
|
||||
/// </para>
|
||||
/// </remarks>
|
||||
public class SiteEventLogger : ISiteEventLogger, IDisposable
|
||||
{
|
||||
private readonly SqliteConnection _connection;
|
||||
private readonly ILogger<SiteEventLogger> _logger;
|
||||
private readonly object _writeLock = new();
|
||||
private readonly Channel<PendingEvent> _writeQueue;
|
||||
private readonly Task _writerLoop;
|
||||
private long _failedWriteCount;
|
||||
private bool _disposed;
|
||||
|
||||
/// <summary>
/// Opens the SQLite connection, ensures the schema exists, creates the bounded
/// write queue, and starts the background writer loop.
/// </summary>
/// <param name="options">Site event log configuration (database path, write queue capacity).</param>
/// <param name="logger">Logger for write-failure diagnostics.</param>
/// <param name="connectionStringOverride">Connection string to use instead of one built from the configured path; ignored when null.</param>
public SiteEventLogger(
    IOptions<SiteEventLogOptions> options,
    ILogger<SiteEventLogger> logger,
    string? connectionStringOverride = null)
{
    _logger = logger;

    // SiteEventLogging-022: the default connection string deliberately omits
    // Cache=Shared. That mode lets several SqliteConnections share an in-process
    // page cache, but this logger owns exactly one connection and serialises all
    // access through _writeLock, so the mode would be dormant — dead configuration
    // at best, a small future foot-gun for any second connection to the same file
    // at worst. A test that genuinely needs it can pass it in through
    // connectionStringOverride.
    string connectionString;
    if (connectionStringOverride is not null)
    {
        connectionString = connectionStringOverride;
    }
    else
    {
        connectionString = $"Data Source={options.Value.DatabasePath}";
    }

    _connection = new SqliteConnection(connectionString);
    _connection.Open();

    InitializeSchema();

    // SiteEventLogging-015: a bounded queue in DropOldest mode keeps the "callers
    // never block" guarantee (SiteEventLogging-005) while capping memory use when
    // the writer is slow for a sustained period. A configured capacity below 1 is
    // raised to 1.
    var capacity = Math.Max(1, options.Value.WriteQueueCapacity);
    var queueOptions = new BoundedChannelOptions(capacity)
    {
        FullMode = BoundedChannelFullMode.DropOldest,
        SingleReader = true,
        SingleWriter = false,
    };

    _writeQueue = Channel.CreateBounded<PendingEvent>(queueOptions, OnEventDropped);
    _writerLoop = Task.Run(ProcessWriteQueueAsync);

    // Makes a drop observable: counts it as a failed write and faults the evicted
    // event's completion so anyone awaiting it sees the failure.
    void OnEventDropped(PendingEvent evicted)
    {
        Interlocked.Increment(ref _failedWriteCount);
        evicted.Completion.TrySetException(
            new InvalidOperationException(
                $"Event was dropped because the write queue exceeded its bounded capacity ({capacity})."));
    }
}
|
||||
|
||||
/// <summary>
/// SiteEventLogging-018: how many event writes have failed since this logger was
/// created (SQLite error, disk full, bounded-queue overflow drop, etc.). The value
/// is read atomically. It is intended for a future Health Monitoring integration:
/// the counter itself is correct and observable, but the central health-metric
/// pipeline does not poll it yet, so a sustained non-zero value is currently only
/// visible through the per-failure log line. Wiring it into the 30-second
/// site-metric publish is tracked separately. The property is part of
/// <see cref="ISiteEventLogger"/> so that the eventual consumer does not need a
/// concrete-type downcast.
/// </summary>
public long FailedWriteCount
{
    get { return Interlocked.Read(ref _failedWriteCount); }
}
|
||||
|
||||
/// <summary>
/// Invokes <paramref name="action"/> with the shared connection while the write
/// lock is held, so that record / query / purge callers on different threads never
/// touch the non-thread-safe <see cref="SqliteConnection"/> at the same time.
/// </summary>
/// <param name="action">Work to perform against the shared connection.</param>
/// <returns>
/// <see langword="true"/> if the action was invoked; <see langword="false"/> if the
/// logger had already been disposed, in which case the action is not invoked.
/// </returns>
internal bool WithConnection(Action<SqliteConnection> action)
{
    ArgumentNullException.ThrowIfNull(action);

    lock (_writeLock)
    {
        if (!_disposed)
        {
            action.Invoke(_connection);
            return true;
        }

        return false;
    }
}
|
||||
|
||||
/// <summary>
/// Executes <paramref name="func"/> with the shared <see cref="SqliteConnection"/>
/// while the write lock is held and hands back its result.
/// </summary>
/// <typeparam name="T">The type of value produced by <paramref name="func"/>.</typeparam>
/// <param name="func">The function to evaluate against the shared connection.</param>
/// <returns>Whatever <paramref name="func"/> returns.</returns>
/// <exception cref="ArgumentNullException"><paramref name="func"/> is <see langword="null"/>.</exception>
/// <exception cref="ObjectDisposedException">
/// The logger has been disposed; a caller that needs a result cannot proceed
/// without the database, so this overload throws instead of returning a flag.
/// </exception>
internal T WithConnection<T>(Func<SqliteConnection, T> func)
{
    ArgumentNullException.ThrowIfNull(func);

    lock (_writeLock)
    {
        ObjectDisposedException.ThrowIf(_disposed, this);

        T result = func(_connection);
        return result;
    }
}
|
||||
|
||||
/// <summary>
/// Prepares the event database: switches on incremental auto-vacuum, then creates
/// the <c>site_events</c> table and its filter indexes if they do not exist yet.
/// </summary>
private void InitializeSchema()
{
    // auto_vacuum only takes effect on a fresh database if it is set before the
    // first table is created. INCREMENTAL mode lets PRAGMA incremental_vacuum
    // reclaim free pages later, so the storage-cap purge can shrink the file.
    using (var vacuumCommand = _connection.CreateCommand())
    {
        vacuumCommand.CommandText = "PRAGMA auto_vacuum = INCREMENTAL";
        vacuumCommand.ExecuteNonQuery();
    }

    using (var schemaCommand = _connection.CreateCommand())
    {
        // Indexes cover the timestamp / event_type / instance_id / severity
        // filters. Keyword search on message/source uses a leading-wildcard LIKE,
        // which a B-tree index cannot serve, so that path deliberately full-scans.
        schemaCommand.CommandText = """
            CREATE TABLE IF NOT EXISTS site_events (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            timestamp TEXT NOT NULL,
            event_type TEXT NOT NULL,
            severity TEXT NOT NULL,
            instance_id TEXT,
            source TEXT NOT NULL,
            message TEXT NOT NULL,
            details TEXT
            );
            CREATE INDEX IF NOT EXISTS idx_events_timestamp ON site_events(timestamp);
            CREATE INDEX IF NOT EXISTS idx_events_type ON site_events(event_type);
            CREATE INDEX IF NOT EXISTS idx_events_instance ON site_events(instance_id);
            CREATE INDEX IF NOT EXISTS idx_events_severity ON site_events(severity);
            """;
        schemaCommand.ExecuteNonQuery();
    }
}
|
||||
|
||||
/// <summary>
/// SiteEventLogging-020: the closed set of severities an event may carry.
/// Membership is checked with ordinal (case-sensitive) comparison on purpose:
/// the query filter relies on SQLite's default <c>BINARY</c> collation, so a row
/// stored as <c>"error"</c> would never match a filter on <c>"Error"</c>. What is
/// accepted on the way in therefore has to match what can be found on the way out.
/// </summary>
private static readonly HashSet<string> AllowedSeverities =
    new HashSet<string>(new[] { "Info", "Warning", "Error" }, StringComparer.Ordinal);
|
||||
|
||||
/// <inheritdoc />
public Task LogEventAsync(
    string eventType,
    string severity,
    string? instanceId,
    string source,
    string message,
    string? details = null)
{
    // Required text fields are validated synchronously, before anything is queued.
    ArgumentException.ThrowIfNullOrWhiteSpace(eventType);
    ArgumentException.ThrowIfNullOrWhiteSpace(severity);
    ArgumentException.ThrowIfNullOrWhiteSpace(source);
    ArgumentException.ThrowIfNullOrWhiteSpace(message);

    // SiteEventLogging-020: unknown severities are rejected up front so stored
    // values always agree with the case-sensitive (BINARY collation) query filter
    // and with the documented set of severities.
    var severityIsKnown = AllowedSeverities.Contains(severity);
    if (!severityIsKnown)
    {
        throw new ArgumentException(
            $"Severity '{severity}' is not one of the allowed values: Info, Warning, Error.",
            nameof(severity));
    }

    // The timestamp is captured at enqueue time (UTC, round-trip "o" format),
    // not at the moment the background writer gets around to the insert.
    var timestamp = DateTimeOffset.UtcNow.ToString("o");
    var queued = new PendingEvent(
        timestamp,
        eventType,
        severity,
        instanceId,
        source,
        message,
        details);

    // Enqueue only. The SQLite write itself runs on the background writer so the
    // caller (an Akka actor thread on a hot path) never blocks on disk I/O or on
    // contention for the write lock.
    var accepted = _writeQueue.Writer.TryWrite(queued);
    if (accepted)
    {
        return queued.Completion.Task;
    }

    // The queue is bounded but uses DropOldest, so a full queue evicts the oldest
    // entry instead of rejecting this one. TryWrite therefore fails only once the
    // writer has been completed, i.e. the logger was disposed. The event cannot be
    // persisted, so the Task is faulted (SiteEventLogging-012) rather than
    // reporting false success; a caller awaiting a critical audit event can tell
    // it was dropped.
    queued.Completion.TrySetException(
        new ObjectDisposedException(nameof(SiteEventLogger),
            "Event could not be recorded: the event logger has been disposed."));

    return queued.Completion.Task;
}
|
||||
|
||||
/// <summary>
/// Background writer loop. Reads queued events one at a time until the queue is
/// completed and empty, inserts each into <c>site_events</c> under the write
/// lock, and completes or faults the event's <c>Completion</c> accordingly.
/// </summary>
private async Task ProcessWriteQueueAsync()
{
    await foreach (var entry in _writeQueue.Reader.ReadAllAsync().ConfigureAwait(false))
    {
        try
        {
            var persisted = WithConnection(connection => { InsertEvent(connection, entry); });

            if (!persisted)
            {
                // WithConnection reports false only when the logger was disposed
                // while the queue was still being drained; nothing was written.
                // Fault the Task (SiteEventLogging-012) instead of reporting
                // false success for a dropped audit event.
                entry.Completion.TrySetException(
                    new ObjectDisposedException(nameof(SiteEventLogger),
                        "Event could not be recorded: the event logger was disposed before the write completed."));
                continue;
            }

            entry.Completion.TrySetResult();
        }
        catch (Exception ex)
        {
            // SiteEventLogging-008: a failed write has to be observable. Count it
            // in FailedWriteCount, log it, and surface the exception through the
            // caller's Task rather than swallowing it.
            Interlocked.Increment(ref _failedWriteCount);
            _logger.LogError(ex, "Failed to record event: {EventType} from {Source}",
                entry.EventType, entry.Source);
            entry.Completion.TrySetException(ex);
        }
    }

    // Issues the parameterized INSERT for a single event on the given connection.
    static void InsertEvent(SqliteConnection connection, PendingEvent item)
    {
        using var insert = connection.CreateCommand();
        insert.CommandText = """
            INSERT INTO site_events (timestamp, event_type, severity, instance_id, source, message, details)
            VALUES ($timestamp, $event_type, $severity, $instance_id, $source, $message, $details)
            """;

        var parameters = insert.Parameters;
        parameters.AddWithValue("$timestamp", item.Timestamp);
        parameters.AddWithValue("$event_type", item.EventType);
        parameters.AddWithValue("$severity", item.Severity);
        // Optional columns are stored as SQL NULL when the value is absent.
        parameters.AddWithValue("$instance_id", (object?)item.InstanceId ?? DBNull.Value);
        parameters.AddWithValue("$source", item.Source);
        parameters.AddWithValue("$message", item.Message);
        parameters.AddWithValue("$details", (object?)item.Details ?? DBNull.Value);

        insert.ExecuteNonQuery();
    }
}
|
||||
|
||||
/// <summary>
/// Stops accepting new events, gives the background writer loop up to five
/// seconds to finish, and then disposes the SQLite connection. Calling it more
/// than once is harmless: later calls return immediately.
/// </summary>
/// <remarks>
/// The disposed flag is raised before the wait, so events still sitting in the
/// queue at that point are not written: the writer loop sees the flag via
/// <c>WithConnection</c> and faults their tasks with
/// <see cref="ObjectDisposedException"/>.
/// NOTE(review): if queued events are meant to be persisted on shutdown, the
/// flag would have to be raised after the loop has drained — confirm intent.
/// </remarks>
public void Dispose()
{
    Task? pendingLoop;

    lock (_writeLock)
    {
        if (_disposed)
        {
            return;
        }

        _disposed = true;

        // Completing the writer makes further TryWrite calls fail and lets the
        // reader loop end once the queue is empty.
        _writeQueue.Writer.TryComplete();
        pendingLoop = _writerLoop;
    }

    // The wait happens outside the lock because the loop takes the same lock for
    // every write; waiting while holding it would stall the loop.
    if (pendingLoop is not null)
    {
        try
        {
            pendingLoop.Wait(TimeSpan.FromSeconds(5));
        }
        catch (AggregateException)
        {
            // A faulted writer loop has already been logged per event; there is
            // nothing further to do during disposal.
        }
    }

    lock (_writeLock)
    {
        _connection.Dispose();
    }
}
|
||||
|
||||
/// <summary>
/// A single event that has been accepted by the logger and is waiting for the
/// background writer to persist it. The positional members mirror the columns of
/// the <c>site_events</c> table.
/// </summary>
private sealed record PendingEvent(
    string Timestamp,
    string EventType,
    string Severity,
    string? InstanceId,
    string Source,
    string Message,
    string? Details)
{
    /// <summary>
    /// Signals the outcome of the write: completes once the event has been
    /// persisted and faults when the write fails or the event is dropped.
    /// Continuations run asynchronously so they never execute inline on the
    /// thread that sets the result.
    /// </summary>
    public TaskCompletionSource Completion { get; } =
        new TaskCompletionSource(TaskCreationOptions.RunContinuationsAsynchronously);
}
|
||||
}
|
||||
+27
@@ -0,0 +1,27 @@
|
||||
<!-- Project file for the SiteEventLogging library: SDK-style project targeting net10.0. -->
<Project Sdk="Microsoft.NET.Sdk">
|
||||
|
||||
<PropertyGroup>
|
||||
<TargetFramework>net10.0</TargetFramework>
|
||||
<ImplicitUsings>enable</ImplicitUsings>
|
||||
<Nullable>enable</Nullable>
|
||||
<!-- Any compiler warning (including nullable warnings) fails the build. -->
<TreatWarningsAsErrors>true</TreatWarningsAsErrors>
|
||||
</PropertyGroup>
|
||||
|
||||
<!-- NOTE(review): no Version attributes here; presumably versions are managed centrally (e.g. Directory.Packages.props). Confirm. -->
<ItemGroup>
|
||||
<PackageReference Include="Akka" />
|
||||
<PackageReference Include="Microsoft.Data.Sqlite" />
|
||||
<PackageReference Include="Microsoft.Extensions.DependencyInjection.Abstractions" />
|
||||
<PackageReference Include="Microsoft.Extensions.Hosting.Abstractions" />
|
||||
<PackageReference Include="Microsoft.Extensions.Logging.Abstractions" />
|
||||
<PackageReference Include="Microsoft.Extensions.Options" />
|
||||
</ItemGroup>
|
||||
|
||||
<ItemGroup>
|
||||
<ProjectReference Include="../ZB.MOM.WW.ScadaBridge.Commons/ZB.MOM.WW.ScadaBridge.Commons.csproj" />
|
||||
</ItemGroup>
|
||||
|
||||
<!-- Lets the test assembly call internal members of this project. -->
<ItemGroup>
|
||||
<InternalsVisibleTo Include="ZB.MOM.WW.ScadaBridge.SiteEventLogging.Tests" />
|
||||
</ItemGroup>
|
||||
|
||||
</Project>
|
||||
Reference in New Issue
Block a user