refactor: rename ScadaLink → ZB.MOM.WW.ScadaBridge (code + projects + namespaces)

Solution + 23 src projects + 26 test projects renamed; folders, csproj files,
namespaces, and the ScadaLinkDbContext → ScadaBridgeDbContext class updated.
ActorSystem "scadalink" → "scadabridge", Akka seed-node URLs migrated.
SQL roles/logins, LDAP domains, CLI command name, and CLI config dir
(~/.scadalink → ~/.scadabridge) also renamed.

Build green; 5 Host.Tests fail awaiting SQL login rename in next commit.
Pre-existing StaleTagMonitor timing flakes unchanged.

Rename script committed at tools/rename-to-scadabridge.sh.
This commit is contained in:
Joseph Doherty
2026-05-28 09:37:45 -04:00
parent 6d87ee3c3b
commit 7b0b9c7365
1531 changed files with 11180 additions and 11054 deletions
@@ -0,0 +1,29 @@
using Akka.Actor;
using Akka.Event;
using ZB.MOM.WW.ScadaBridge.Commons.Messages.RemoteQuery;
namespace ZB.MOM.WW.ScadaBridge.SiteEventLogging;
/// <summary>
/// Actor-side adapter over <see cref="IEventLogQueryService"/>.
/// Each <see cref="EventLogQueryRequest"/> (sent by the SiteCommunicationActor) is
/// executed synchronously against the query service and the resulting
/// <see cref="EventLogQueryResponse"/> is told back to the sender of the request.
/// </summary>
public class EventLogHandlerActor : ReceiveActor
{
    private readonly ILoggingAdapter _log = Context.GetLogger();
    private readonly IEventLogQueryService _queryService;

    /// <summary>Creates the actor and wires up its single <see cref="EventLogQueryRequest"/> handler.</summary>
    /// <param name="queryService">The query service that executes event log queries.</param>
    public EventLogHandlerActor(IEventLogQueryService queryService)
    {
        _queryService = queryService;
        Receive<EventLogQueryRequest>(HandleQuery);
    }

    /// <summary>Runs the query and replies to the requester with the response.</summary>
    private void HandleQuery(EventLogQueryRequest request) =>
        Sender.Tell(_queryService.ExecuteQuery(request));
}
@@ -0,0 +1,223 @@
using Microsoft.Extensions.Hosting;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
namespace ZB.MOM.WW.ScadaBridge.SiteEventLogging;
/// <summary>
/// SiteEventLogging-019: predicate the <see cref="EventLogPurgeService"/>
/// consults at the top of every purge tick to decide whether THIS node should
/// run the daily purge. The design states "a daily background job runs on the
/// active node and deletes all events older than 30 days"; the standby's local
/// SQLite receives no writes, so purging there is harmless but unnecessary —
/// and silently doing it anyway diverges from the design.
///
/// Registration is the Host's responsibility (it knows the cluster topology);
/// when no implementation is registered the purge service defaults to "always
/// active" so non-clustered hosts and unit tests are unaffected — backward
/// compatible with the prior "run on every host" behaviour.
/// </summary>
/// <returns>
/// <see langword="true"/> when this node should run the purge on the current tick;
/// <see langword="false"/> to make <see cref="EventLogPurgeService"/> skip the tick.
/// If the check throws, the purge service logs the exception and runs the purge anyway.
/// </returns>
public delegate bool SiteEventLogActiveNodeCheck();
/// <summary>
/// Hosted background service that trims the SQLite site event log.
/// Every purge cycle applies time-based retention first (default 30 days) and then
/// the storage cap (default 1GB). The work runs on the hosted-service background
/// loop, so event recording is not blocked by it beyond the shared connection lock.
/// </summary>
public class EventLogPurgeService : BackgroundService
{
    /// <summary>Number of events deleted per cap-purge batch.</summary>
    private const int CapPurgeBatchSize = 1000;

    /// <summary>Bytes in one megabyte; used only to format sizes for log output.</summary>
    private const double BytesPerMegabyte = 1024.0 * 1024.0;

    private readonly SiteEventLogger _eventLogger;
    private readonly SiteEventLogOptions _options;
    private readonly ILogger<EventLogPurgeService> _logger;
    private readonly SiteEventLogActiveNodeCheck _isActiveNode;

    /// <summary>Initializes a new instance of <see cref="EventLogPurgeService"/>.</summary>
    /// <param name="eventLogger">The concrete event logger providing lock-guarded database access.</param>
    /// <param name="options">Site event log options (retention days, storage cap, purge interval).</param>
    /// <param name="logger">Logger instance.</param>
    /// <param name="isActiveNode">
    /// SiteEventLogging-019: optional active-node check. <c>null</c> means "purge on
    /// every tick" (the behaviour for non-clustered hosts and existing tests). When a
    /// check is supplied — e.g. by the Host on a site node — a tick for which it
    /// returns <see langword="false"/> is skipped, so only the active node purges.
    /// </param>
    public EventLogPurgeService(
        SiteEventLogger eventLogger,
        IOptions<SiteEventLogOptions> options,
        ILogger<EventLogPurgeService> logger,
        SiteEventLogActiveNodeCheck? isActiveNode = null)
    {
        // The concrete SiteEventLogger is required (not ISiteEventLogger): the purge
        // must go through its lock-guarded WithConnection, and downcasting the
        // interface would throw InvalidCastException for other implementations.
        _eventLogger = eventLogger;
        _options = options.Value;
        _logger = logger;
        _isActiveNode = isActiveNode ?? (static () => true);
    }

    /// <inheritdoc />
    protected override async Task ExecuteAsync(CancellationToken stoppingToken)
    {
        _logger.LogInformation(
            "Event log purge service started — retention: {Days} days, cap: {Cap} MB, interval: {Interval}",
            _options.RetentionDays, _options.MaxStorageMb, _options.PurgeInterval);

        using var timer = new PeriodicTimer(_options.PurgeInterval);

        // do/while: the first purge runs immediately at startup, every later one
        // after the timer reports the next tick.
        do
        {
            RunPurge();
        }
        while (await timer.WaitForNextTickAsync(stoppingToken).ConfigureAwait(false));
    }

    /// <summary>
    /// Runs a single purge cycle: retention age first, then storage cap. Any exception
    /// is logged and swallowed so one failed cycle does not end the purge loop.
    /// </summary>
    internal void RunPurge()
    {
        try
        {
            // SiteEventLogging-019: every tick is gated on the active-node check. The
            // design (Component-SiteEventLogging → Storage) states the purge runs on
            // the active node; the standby's SQLite receives no writes.
            if (!ShouldPurgeOnThisNode())
            {
                _logger.LogDebug("Skipping event log purge tick — this node is not the active site member");
                return;
            }

            PurgeByRetention();
            PurgeByStorageCap();
        }
        catch (Exception ex)
        {
            _logger.LogError(ex, "Error during event log purge");
        }
    }

    /// <summary>
    /// Evaluates the active-node check. A throwing check (e.g. a transient
    /// cluster-state read failure) is treated as "active" so the purge still runs —
    /// purging on a standby is harmless.
    /// </summary>
    private bool ShouldPurgeOnThisNode()
    {
        try
        {
            return _isActiveNode();
        }
        catch (Exception checkEx)
        {
            _logger.LogDebug(checkEx,
                "Active-node check threw during purge tick; running purge to be safe");
            return true;
        }
    }

    /// <summary>Deletes every event whose timestamp is older than the retention window.</summary>
    private void PurgeByRetention()
    {
        var cutoff = DateTimeOffset.UtcNow.AddDays(-_options.RetentionDays).ToString("o");

        var deleted = _eventLogger.WithConnection(connection =>
        {
            using var deleteCmd = connection.CreateCommand();
            deleteCmd.CommandText = "DELETE FROM site_events WHERE timestamp < $cutoff";
            deleteCmd.Parameters.AddWithValue("$cutoff", cutoff);
            return deleteCmd.ExecuteNonQuery();
        });

        if (deleted <= 0)
        {
            return;
        }

        _logger.LogInformation("Purged {Count} events older than {Days} days", deleted, _options.RetentionDays);
    }

    /// <summary>
    /// Deletes the oldest events in batches of <see cref="CapPurgeBatchSize"/> until the
    /// logical database size is at or below the configured cap. Stops early when a
    /// batch deletes nothing or when the measured size fails to shrink, so an
    /// unreachable cap does not silently empty the whole table.
    /// </summary>
    private void PurgeByStorageCap()
    {
        var capBytes = (long)_options.MaxStorageMb * 1024 * 1024;
        var sizeBytes = GetDatabaseSizeBytes();
        if (sizeBytes <= capBytes)
        {
            return;
        }

        _logger.LogWarning(
            "Event log size {Size:F1} MB exceeds cap {Cap} MB — purging oldest events",
            sizeBytes / BytesPerMegabyte, _options.MaxStorageMb);

        while (sizeBytes > capBytes)
        {
            var sizeBeforeBatch = sizeBytes;

            var removed = _eventLogger.WithConnection(connection =>
            {
                using var deleteCmd = connection.CreateCommand();
                deleteCmd.CommandText = $"""
                    DELETE FROM site_events WHERE id IN (
                    SELECT id FROM site_events ORDER BY id ASC LIMIT {CapPurgeBatchSize}
                    )
                    """;
                var affected = deleteCmd.ExecuteNonQuery();

                // Hand freed pages back so the next size measurement reflects the
                // delete. Effective because auto_vacuum = INCREMENTAL is set at schema
                // creation; harmless otherwise.
                using var vacuumCmd = connection.CreateCommand();
                vacuumCmd.CommandText = "PRAGMA incremental_vacuum";
                vacuumCmd.ExecuteNonQuery();

                return affected;
            });

            if (removed == 0)
            {
                break;
            }

            sizeBytes = GetDatabaseSizeBytes();
            if (sizeBytes < sizeBeforeBatch)
            {
                continue;
            }

            // Size is not shrinking despite deletes — stop rather than wipe the whole
            // table. Not expected now that logical size is measured; kept as a guard
            // against future regressions.
            _logger.LogWarning(
                "Event log size did not decrease after a cap-purge batch ({Size:F1} MB); " +
                "stopping to avoid emptying the log",
                sizeBytes / BytesPerMegabyte);
            break;
        }
    }

    /// <summary>
    /// Returns the logical size of the database in bytes: (<c>page_count</c> −
    /// <c>freelist_count</c>) × <c>page_size</c>, floored at zero pages. Excluding
    /// freelist pages lets the storage-cap loop observe reclaimed space even when free
    /// pages have not yet been returned to the OS.
    /// </summary>
    internal long GetDatabaseSizeBytes()
    {
        return _eventLogger.WithConnection(connection =>
        {
            // Executes one scalar PRAGMA on the shared connection and returns its value.
            long ReadPragma(string pragmaSql)
            {
                using var pragmaCmd = connection.CreateCommand();
                pragmaCmd.CommandText = pragmaSql;
                return (long)pragmaCmd.ExecuteScalar()!;
            }

            var pageCount = ReadPragma("PRAGMA page_count");
            var freePages = ReadPragma("PRAGMA freelist_count");
            var pageSize = ReadPragma("PRAGMA page_size");

            return Math.Max(0, pageCount - freePages) * pageSize;
        });
    }
}
@@ -0,0 +1,198 @@
using System.Globalization;
using Microsoft.Data.Sqlite;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
using ZB.MOM.WW.ScadaBridge.Commons.Messages.RemoteQuery;
namespace ZB.MOM.WW.ScadaBridge.SiteEventLogging;
/// <summary>
/// Processes event log queries locally against SQLite.
/// Supports filtering by event_type, time range, instance_id, severity,
/// and keyword search (LIKE on message and source).
/// Uses keyset pagination with continuation token (last event ID).
/// </summary>
public class EventLogQueryService : IEventLogQueryService
{
    private readonly SiteEventLogger _eventLogger;
    private readonly SiteEventLogOptions _options;
    private readonly ILogger<EventLogQueryService> _logger;

    /// <summary>Initializes a new instance of <see cref="EventLogQueryService"/>.</summary>
    /// <param name="eventLogger">The concrete event logger providing lock-guarded database access.</param>
    /// <param name="options">Site event log options (page size and other query settings).</param>
    /// <param name="logger">Logger instance.</param>
    public EventLogQueryService(
        SiteEventLogger eventLogger,
        IOptions<SiteEventLogOptions> options,
        ILogger<EventLogQueryService> logger)
    {
        // Depend on the concrete recorder directly: queries must funnel database
        // access through its lock-guarded WithConnection. Taking ISiteEventLogger and
        // downcasting would throw InvalidCastException for any other implementation.
        _eventLogger = eventLogger;
        _options = options.Value;
        _logger = logger;
    }

    /// <summary>
    /// Escapes the SQL <c>LIKE</c> metacharacters (<c>\</c>, <c>%</c>, <c>_</c>) in a
    /// user-supplied keyword so it is matched as a literal substring. Used together
    /// with a <c>LIKE ... ESCAPE '\'</c> clause.
    /// </summary>
    /// <param name="input">The raw keyword.</param>
    /// <returns>The keyword with each metacharacter prefixed by a backslash.</returns>
    private static string EscapeLikePattern(string input)
    {
        // The backslash must be escaped first, otherwise the backslashes added for
        // % and _ would themselves be doubled.
        return input
            .Replace("\\", "\\\\")
            .Replace("%", "\\%")
            .Replace("_", "\\_");
    }

    /// <summary>
    /// Formats a time-range bound the same way the recorder stores timestamps: the
    /// round-trip ("o") format in UTC. The <c>timestamp</c> column is TEXT and the
    /// range filter is a lexicographic string comparison, so a bound carrying a
    /// non-zero offset would compare against the wrong instants unless it is
    /// normalised first.
    /// </summary>
    private static string ToUtcRoundTrip(DateTimeOffset value) =>
        value.ToUniversalTime().ToString("o", CultureInfo.InvariantCulture);

    /// <summary>
    /// Executes <paramref name="request"/> against the local event log and returns one
    /// page of results ordered by ascending event id.
    /// </summary>
    /// <param name="request">Filters, keyword, page size and continuation token.</param>
    /// <returns>
    /// A response with <c>Success = true</c>, the page of entries, the id of the last
    /// returned entry as continuation token (or <c>null</c> for an empty page) and a
    /// <c>HasMore</c> flag. Any exception is logged and reported as a response with
    /// <c>Success = false</c> and the exception message; this method does not throw
    /// for query failures.
    /// </returns>
    public EventLogQueryResponse ExecuteQuery(EventLogQueryRequest request)
    {
        try
        {
            // SiteEventLogging-017: clamp caller-supplied PageSize to a hard upper
            // bound so a central client sending int.MaxValue can't force the query
            // to materialise the entire log into a single list while holding the
            // shared write lock. Silent clamp — misconfigured clients still get a
            // usable response.
            // The lower bound of 1 guards against non-positive configured values: a
            // page size of 0 would return an empty page with HasMore = true and a
            // null continuation token, so a paging client could never advance.
            var requestedSize = request.PageSize > 0 ? request.PageSize : _options.QueryPageSize;
            var pageSize = Math.Max(1, Math.Min(requestedSize, _options.MaxQueryPageSize));

            var whereClauses = new List<string>();
            var parameters = new List<SqliteParameter>();

            // Keyset pagination: only return events with id > continuation token
            if (request.ContinuationToken.HasValue)
            {
                whereClauses.Add("id > $afterId");
                parameters.Add(new SqliteParameter("$afterId", request.ContinuationToken.Value));
            }

            // Time-range bounds are normalised to UTC so the TEXT comparison lines up
            // with the stored +00:00 timestamps (see ToUtcRoundTrip).
            if (request.From.HasValue)
            {
                whereClauses.Add("timestamp >= $from");
                parameters.Add(new SqliteParameter("$from", ToUtcRoundTrip(request.From.Value)));
            }

            if (request.To.HasValue)
            {
                whereClauses.Add("timestamp <= $to");
                parameters.Add(new SqliteParameter("$to", ToUtcRoundTrip(request.To.Value)));
            }

            if (!string.IsNullOrWhiteSpace(request.EventType))
            {
                whereClauses.Add("event_type = $eventType");
                parameters.Add(new SqliteParameter("$eventType", request.EventType));
            }

            if (!string.IsNullOrWhiteSpace(request.Severity))
            {
                whereClauses.Add("severity = $severity");
                parameters.Add(new SqliteParameter("$severity", request.Severity));
            }

            if (!string.IsNullOrWhiteSpace(request.InstanceId))
            {
                whereClauses.Add("instance_id = $instanceId");
                parameters.Add(new SqliteParameter("$instanceId", request.InstanceId));
            }

            if (!string.IsNullOrWhiteSpace(request.KeywordFilter))
            {
                // Keyword search is a literal substring match. The LIKE
                // metacharacters % and _ (and the escape char itself) must be
                // escaped so identifiers such as "store_and_forward" or a literal
                // "%" are not misinterpreted as wildcards (SiteEventLogging-013).
                var escaped = EscapeLikePattern(request.KeywordFilter);
                whereClauses.Add(
                    "(message LIKE $keyword ESCAPE '\\' OR source LIKE $keyword ESCAPE '\\')");
                parameters.Add(new SqliteParameter("$keyword", $"%{escaped}%"));
            }

            var whereClause = whereClauses.Count > 0
                ? "WHERE " + string.Join(" AND ", whereClauses)
                : "";

            // Run the read against the shared connection under the logger's write
            // lock — the connection is not thread-safe and is also used by the
            // recorder and the purge service on other threads.
            var entries = _eventLogger.WithConnection(connection =>
            {
                using var cmd = connection.CreateCommand();

                // Fetch pageSize + 1 to determine if there are more results. The sum
                // is computed as long so a page size of int.MaxValue cannot overflow.
                cmd.CommandText = $"""
                    SELECT id, timestamp, event_type, severity, instance_id, source, message, details
                    FROM site_events
                    {whereClause}
                    ORDER BY id ASC
                    LIMIT $limit
                    """;
                cmd.Parameters.AddWithValue("$limit", (long)pageSize + 1);
                foreach (var p in parameters)
                {
                    cmd.Parameters.Add(p);
                }

                var rows = new List<EventLogEntry>();
                using var reader = cmd.ExecuteReader();
                while (reader.Read())
                {
                    rows.Add(new EventLogEntry(
                        Id: reader.GetInt64(0),
                        // Parse with explicit invariant culture and round-trip style
                        // (SiteEventLogging-021). Stored values are ISO 8601 "o" UTC
                        // (see SiteEventLogger.LogEventAsync), and the recorder's
                        // emitted offset is always +00:00; AssumeUniversal +
                        // AdjustToUniversal guarantees the parsed value is UTC and
                        // does not depend on the host's CurrentCulture.
                        Timestamp: DateTimeOffset.Parse(
                            reader.GetString(1),
                            CultureInfo.InvariantCulture,
                            DateTimeStyles.AssumeUniversal | DateTimeStyles.AdjustToUniversal),
                        EventType: reader.GetString(2),
                        Severity: reader.GetString(3),
                        InstanceId: reader.IsDBNull(4) ? null : reader.GetString(4),
                        Source: reader.GetString(5),
                        Message: reader.GetString(6),
                        Details: reader.IsDBNull(7) ? null : reader.GetString(7)));
                }

                return rows;
            });

            // The extra row only signals that another page exists; it is not returned.
            var hasMore = entries.Count > pageSize;
            if (hasMore)
            {
                entries.RemoveAt(entries.Count - 1);
            }

            var continuationToken = entries.Count > 0 ? entries[^1].Id : (long?)null;

            return new EventLogQueryResponse(
                CorrelationId: request.CorrelationId,
                SiteId: request.SiteId,
                Entries: entries,
                ContinuationToken: continuationToken,
                HasMore: hasMore,
                Success: true,
                ErrorMessage: null,
                Timestamp: DateTimeOffset.UtcNow);
        }
        catch (Exception ex)
        {
            _logger.LogError(ex, "Failed to execute event log query: {CorrelationId}", request.CorrelationId);
            return new EventLogQueryResponse(
                CorrelationId: request.CorrelationId,
                SiteId: request.SiteId,
                Entries: [],
                ContinuationToken: null,
                HasMore: false,
                Success: false,
                ErrorMessage: ex.Message,
                Timestamp: DateTimeOffset.UtcNow);
        }
    }
}
@@ -0,0 +1,16 @@
using ZB.MOM.WW.ScadaBridge.Commons.Messages.RemoteQuery;
namespace ZB.MOM.WW.ScadaBridge.SiteEventLogging;
/// <summary>
/// Interface for querying site event logs.
/// Used by Communication Layer to process remote queries from central.
/// </summary>
public interface IEventLogQueryService
{
/// <summary>
/// Executes a query against the local site event log and returns the matching entries.
/// </summary>
/// <param name="request">The query parameters including filters, pagination, and keyword search.</param>
/// <returns>
/// The <see cref="EventLogQueryResponse"/> for <paramref name="request"/>.
/// NOTE(review): the <c>EventLogQueryService</c> implementation reports query
/// failures through the response (<c>Success = false</c> plus an error message)
/// instead of throwing — confirm that other implementations follow the same convention.
/// </returns>
EventLogQueryResponse ExecuteQuery(EventLogQueryRequest request);
}
@@ -0,0 +1,40 @@
namespace ZB.MOM.WW.ScadaBridge.SiteEventLogging;
/// <summary>
/// Interface for recording operational events to the local SQLite event log.
/// </summary>
public interface ISiteEventLogger
{
/// <summary>
/// Record an event asynchronously. The call enqueues the event onto a background
/// writer and returns without blocking the caller on disk I/O. The returned
/// <see cref="Task"/> completes once the event is durably persisted and faults if
/// the write fails, so callers that <c>await</c> it observe success or failure.
/// </summary>
/// <param name="eventType">Category: script, alarm, deployment, connection, store_and_forward, instance_lifecycle</param>
/// <param name="severity">Info, Warning, or Error</param>
/// <param name="instanceId">Optional instance ID associated with the event</param>
/// <param name="source">Source identifier, e.g., "ScriptActor:MonitorSpeed"</param>
/// <param name="message">Human-readable event description</param>
/// <param name="details">
/// Optional free-form detail text (stack traces, compilation errors, etc.).
/// Stored verbatim — JSON is conventional but not validated or enforced.
/// </param>
/// <returns>
/// A task that completes when the event has been persisted and faults when the
/// write fails. Callers that do not care about the outcome may leave it unawaited.
/// </returns>
Task LogEventAsync(
string eventType,
string severity,
string? instanceId,
string source,
string message,
string? details = null);
/// <summary>
/// SiteEventLogging-018: total number of event writes that have failed
/// (SQLite error, disk full, bounded-queue overflow drop, etc.) since this
/// logger was created. Available for future Health Monitoring integration —
/// promoted onto the interface so a Health consumer can read it without a
/// concrete-type downcast. Not yet polled by Health Monitoring; the wiring
/// is tracked separately.
/// </summary>
long FailedWriteCount { get; }
}
@@ -0,0 +1,45 @@
using Microsoft.Extensions.DependencyInjection;
using Microsoft.Extensions.Hosting;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
namespace ZB.MOM.WW.ScadaBridge.SiteEventLogging;
/// <summary>Dependency-injection registration for the site event logging component.</summary>
public static class ServiceCollectionExtensions
{
    /// <summary>
    /// Registers the site event logging services: the recorder, the query service and
    /// the purge hosted service.
    /// </summary>
    /// <param name="services">The DI service collection to register into.</param>
    /// <returns>The same <paramref name="services"/> instance, for chaining.</returns>
    public static IServiceCollection AddSiteEventLogging(this IServiceCollection services)
    {
        // One concrete recorder singleton, with ISiteEventLogger forwarded to that same
        // instance. The purge and query services take the concrete SiteEventLogger
        // (they need its lock-guarded WithConnection); downcasting an ISiteEventLogger
        // would throw InvalidCastException for any other implementation.
        services.AddSingleton<SiteEventLogger>();
        services.AddSingleton<ISiteEventLogger>(static sp => sp.GetRequiredService<SiteEventLogger>());
        services.AddSingleton<IEventLogQueryService, EventLogQueryService>();

        // SiteEventLogging-019: the purge service is registered on every host node and
        // decides per tick whether to run (see CreatePurgeService).
        services.AddHostedService<EventLogPurgeService>(CreatePurgeService);

        return services;
    }

    /// <summary>
    /// Factory for the purge hosted service. A factory is used so the optional
    /// <see cref="SiteEventLogActiveNodeCheck"/> flows from DI instead of falling back
    /// to the constructor default: the Host registers the real check on site nodes,
    /// while tests and non-clustered hosts leave it unregistered and the purge then
    /// runs on every tick.
    /// </summary>
    private static EventLogPurgeService CreatePurgeService(IServiceProvider sp)
    {
        var recorder = sp.GetRequiredService<SiteEventLogger>();
        var options = sp.GetRequiredService<IOptions<SiteEventLogOptions>>();
        var logger = sp.GetRequiredService<ILogger<EventLogPurgeService>>();
        var activeNodeCheck = sp.GetService<SiteEventLogActiveNodeCheck>();

        return new EventLogPurgeService(recorder, options, logger, activeNodeCheck);
    }

    // NOTE: EventLogHandlerActor is wired up directly in
    // ZB.MOM.WW.ScadaBridge.Host/Actors/AkkaHostedService.cs as a cluster singleton, because the
    // actor must be created inside the ActorSystem with the resolved
    // IEventLogQueryService. There is intentionally no DI helper for that here — a
    // former AddSiteEventLoggingActors placeholder was dead code and has been removed.
}
@@ -0,0 +1,30 @@
namespace ZB.MOM.WW.ScadaBridge.SiteEventLogging;
/// <summary>
/// Options for the site event logging component: database location, retention and
/// storage-cap purge settings, query page sizes, and the background write-queue capacity.
/// </summary>
public class SiteEventLogOptions
{
/// <summary>Number of days to retain site event log entries before purge; default 30.</summary>
public int RetentionDays { get; set; } = 30;
/// <summary>Maximum SQLite database size in megabytes before old entries are purged; default 1024 MB.</summary>
public int MaxStorageMb { get; set; } = 1024;
/// <summary>File path for the site event log SQLite database.</summary>
public string DatabasePath { get; set; } = "site_events.db";
/// <summary>Maximum number of rows returned per paginated query; default 500.</summary>
public int QueryPageSize { get; set; } = 500;
/// <summary>
/// SiteEventLogging-017: hard upper bound on a caller-supplied <c>PageSize</c>. A
/// misbehaving or hostile central client that requests <c>int.MaxValue</c> would
/// otherwise force the query to materialise the entire log into a single list while
/// holding the shared write lock. Silent clamp; default 500 matches
/// <see cref="QueryPageSize"/>.
/// </summary>
public int MaxQueryPageSize { get; set; } = 500;
/// <summary>Interval between purge runs; default 24 hours.</summary>
public TimeSpan PurgeInterval { get; set; } = TimeSpan.FromHours(24);
/// <summary>
/// SiteEventLogging-015: bound on the background write queue. Default 10 000 events.
/// Overflow uses <c>BoundedChannelFullMode.DropOldest</c> — callers never block; the
/// dropped event's <c>Task</c> is faulted and <c>FailedWriteCount</c> is incremented
/// so the drop is observable.
/// </summary>
public int WriteQueueCapacity { get; set; } = 10_000;
}
@@ -0,0 +1,334 @@
using System.Threading.Channels;
using Microsoft.Data.Sqlite;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
namespace ZB.MOM.WW.ScadaBridge.SiteEventLogging;
/// <summary>
/// Records operational events to a local SQLite database.
/// Only the active node generates events. Not replicated to standby.
/// On failover, the new active node starts a fresh log.
/// </summary>
/// <remarks>
/// <para>
/// A single <see cref="SqliteConnection"/> is owned here and is NOT thread-safe.
/// All access — recording, querying, purging — must be funnelled through
/// <see cref="WithConnection"/>, which serialises callers on a shared lock.
/// </para>
/// <para>
/// Event recording is offloaded to a background writer loop (fed by a
/// <em>bounded</em> <see cref="Channel{T}"/>; capacity <see cref="SiteEventLogOptions.WriteQueueCapacity"/>,
/// default 10 000, overflow <see cref="BoundedChannelFullMode.DropOldest"/>).
/// <see cref="LogEventAsync"/> only validates its arguments and enqueues, so callers —
/// typically Akka actor threads on hot paths — never block on disk I/O or on
/// contention for the write lock. The returned <see cref="Task"/> completes once the
/// event is durably persisted and faults if the write fails. SiteEventLogging-015:
/// when a queued event is evicted to make room for a newer one, that event's Task
/// is faulted with <see cref="InvalidOperationException"/> and
/// <see cref="FailedWriteCount"/> is incremented so the drop is observable.
/// </para>
/// </remarks>
public class SiteEventLogger : ISiteEventLogger, IDisposable
{
    /// <summary>How long <see cref="Dispose"/> waits for queued events to be written.</summary>
    private static readonly TimeSpan DrainTimeout = TimeSpan.FromSeconds(5);

    private readonly SqliteConnection _connection;
    private readonly ILogger<SiteEventLogger> _logger;
    private readonly object _writeLock = new();
    private readonly Channel<PendingEvent> _writeQueue;
    private readonly Task _writerLoop;
    private long _failedWriteCount;
    private bool _disposed;

    /// <summary>
    /// Initializes the event logger, opens the SQLite connection, and starts the background writer loop.
    /// </summary>
    /// <param name="options">Site event log configuration (database path, retention settings).</param>
    /// <param name="logger">Logger for write-failure diagnostics.</param>
    /// <param name="connectionStringOverride">Optional connection string override; uses the configured path when null.</param>
    public SiteEventLogger(
        IOptions<SiteEventLogOptions> options,
        ILogger<SiteEventLogger> logger,
        string? connectionStringOverride = null)
    {
        _logger = logger;

        // SiteEventLogging-022: Cache=Shared is a cross-connection optimisation
        // that lets multiple SqliteConnections share an in-process page cache.
        // This logger owns exactly one SqliteConnection and serialises all
        // access through _writeLock, so the mode is dormant — at best dead
        // configuration, at worst a small future foot-gun for any second
        // connection opened to the same file. A test path that genuinely
        // needs Cache=Shared can still inject it via connectionStringOverride.
        var connectionString = connectionStringOverride
            ?? $"Data Source={options.Value.DatabasePath}";
        _connection = new SqliteConnection(connectionString);
        try
        {
            _connection.Open();
            InitializeSchema();
        }
        catch
        {
            // The constructor is about to fail, so nobody will ever call Dispose on
            // this instance — release the connection here instead of leaking it.
            _connection.Dispose();
            throw;
        }

        // SiteEventLogging-015: bounded queue with DropOldest preserves the
        // "callers never block" guarantee (SiteEventLogging-005) while putting an
        // upper bound on memory under sustained writer slowness. Drops are
        // observable — itemDropped faults the evicted Task and increments
        // FailedWriteCount.
        var capacity = Math.Max(1, options.Value.WriteQueueCapacity);
        _writeQueue = Channel.CreateBounded<PendingEvent>(
            new BoundedChannelOptions(capacity)
            {
                SingleReader = true,
                SingleWriter = false,
                FullMode = BoundedChannelFullMode.DropOldest,
            },
            itemDropped: dropped =>
            {
                Interlocked.Increment(ref _failedWriteCount);
                dropped.Completion.TrySetException(
                    new InvalidOperationException(
                        $"Event was dropped because the write queue exceeded its bounded capacity ({capacity})."));
            });
        _writerLoop = Task.Run(ProcessWriteQueueAsync);
    }

    /// <summary>
    /// SiteEventLogging-018: number of event writes that have failed (SQLite
    /// error, disk full, bounded-queue overflow drop, etc.) since this logger
    /// was created. Available for future Health Monitoring integration — the
    /// counter is correct and observable, but the central health-metric
    /// pipeline does not yet poll it, so a sustained non-zero value currently
    /// goes unnoticed in production beyond the per-failure log line. Wiring
    /// the metric into the 30-second site-metric publish is tracked
    /// separately; promoted to <see cref="ISiteEventLogger"/> so the eventual
    /// consumer reads it without a concrete-type downcast.
    /// </summary>
    public long FailedWriteCount => Interlocked.Read(ref _failedWriteCount);

    /// <summary>
    /// Runs <paramref name="action"/> against the shared connection while holding the
    /// write lock, so purge / query / record callers on different threads never use
    /// the non-thread-safe <see cref="SqliteConnection"/> concurrently.
    /// Returns <see langword="false"/> without invoking the action if the logger has
    /// been disposed.
    /// </summary>
    /// <param name="action">The action to run against the shared connection.</param>
    /// <returns><see langword="true"/> if the action ran; <see langword="false"/> if the logger was already disposed.</returns>
    internal bool WithConnection(Action<SqliteConnection> action)
    {
        ArgumentNullException.ThrowIfNull(action);
        lock (_writeLock)
        {
            if (_disposed) return false;
            action(_connection);
            return true;
        }
    }

    /// <summary>
    /// Runs <paramref name="func"/> against the shared connection while holding the
    /// write lock. Throws <see cref="ObjectDisposedException"/> if the logger has
    /// been disposed (callers that need a result cannot proceed without the database).
    /// </summary>
    /// <typeparam name="T">The return type of the function.</typeparam>
    /// <param name="func">The function to run against the shared connection.</param>
    /// <returns>The value returned by <paramref name="func"/>.</returns>
    internal T WithConnection<T>(Func<SqliteConnection, T> func)
    {
        ArgumentNullException.ThrowIfNull(func);
        lock (_writeLock)
        {
            ObjectDisposedException.ThrowIf(_disposed, this);
            return func(_connection);
        }
    }

    /// <summary>
    /// Creates the <c>site_events</c> table and its filter indexes if they do not yet
    /// exist. Called from the constructor before the writer loop starts, so it uses
    /// the connection directly without taking the lock.
    /// </summary>
    private void InitializeSchema()
    {
        // auto_vacuum must be set before any table is created for it to take effect
        // on a fresh database. With INCREMENTAL mode, PRAGMA incremental_vacuum can
        // later reclaim free pages so the storage-cap purge can shrink the file.
        using (var pragmaCmd = _connection.CreateCommand())
        {
            pragmaCmd.CommandText = "PRAGMA auto_vacuum = INCREMENTAL";
            pragmaCmd.ExecuteNonQuery();
        }

        using var cmd = _connection.CreateCommand();
        cmd.CommandText = """
            CREATE TABLE IF NOT EXISTS site_events (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            timestamp TEXT NOT NULL,
            event_type TEXT NOT NULL,
            severity TEXT NOT NULL,
            instance_id TEXT,
            source TEXT NOT NULL,
            message TEXT NOT NULL,
            details TEXT
            );
            CREATE INDEX IF NOT EXISTS idx_events_timestamp ON site_events(timestamp);
            CREATE INDEX IF NOT EXISTS idx_events_type ON site_events(event_type);
            CREATE INDEX IF NOT EXISTS idx_events_instance ON site_events(instance_id);
            CREATE INDEX IF NOT EXISTS idx_events_severity ON site_events(severity);
            """;
        // The query service also supports keyword search via leading-wildcard
        // LIKE on message/source. A leading-wildcard LIKE cannot use a B-tree
        // index, so that path intentionally full-scans; severity/event_type/
        // instance_id/timestamp filters above are all covered.
        cmd.ExecuteNonQuery();
    }

    /// <summary>
    /// SiteEventLogging-020: closed set of allowed severities. Case-sensitive to
    /// match the SQLite default <c>BINARY</c> collation used by the query filter —
    /// a row stored as <c>"error"</c> would be invisible to a query filtering on
    /// <c>"Error"</c>, so the contract on the way in must match the contract on
    /// the way out.
    /// </summary>
    private static readonly HashSet<string> AllowedSeverities =
        new(StringComparer.Ordinal) { "Info", "Warning", "Error" };

    /// <inheritdoc />
    /// <exception cref="ArgumentException">
    /// Thrown synchronously when <paramref name="eventType"/>, <paramref name="severity"/>,
    /// <paramref name="source"/> or <paramref name="message"/> is null or whitespace, or
    /// when <paramref name="severity"/> is not one of <c>Info</c>, <c>Warning</c>, <c>Error</c>.
    /// </exception>
    public Task LogEventAsync(
        string eventType,
        string severity,
        string? instanceId,
        string source,
        string message,
        string? details = null)
    {
        ArgumentException.ThrowIfNullOrWhiteSpace(eventType);
        ArgumentException.ThrowIfNullOrWhiteSpace(severity);
        ArgumentException.ThrowIfNullOrWhiteSpace(source);
        ArgumentException.ThrowIfNullOrWhiteSpace(message);

        // SiteEventLogging-020: reject unknown severities so the query-time filter
        // (case-sensitive BINARY collation) and the documented enum stay in sync.
        if (!AllowedSeverities.Contains(severity))
        {
            throw new ArgumentException(
                $"Severity '{severity}' is not one of the allowed values: Info, Warning, Error.",
                nameof(severity));
        }

        // The timestamp is captured at enqueue time, not at write time, so queueing
        // delay does not skew the recorded instant.
        var pending = new PendingEvent(
            DateTimeOffset.UtcNow.ToString("o"),
            eventType,
            severity,
            instanceId,
            source,
            message,
            details);

        // Enqueue only — the actual SQLite write happens on the background writer
        // loop so the caller (an Akka actor thread on a hot path) never blocks
        // on disk I/O or on contention for the write lock.
        if (!_writeQueue.Writer.TryWrite(pending))
        {
            // The channel is bounded with DropOldest, so a full queue evicts the
            // oldest item instead of rejecting the new one. The only way TryWrite
            // fails is that the writer has been completed (logger disposed). The
            // event cannot be persisted — fault the Task (SiteEventLogging-012)
            // rather than reporting false success, so a caller that awaits a
            // critical audit event can tell it was dropped.
            pending.Completion.TrySetException(
                new ObjectDisposedException(nameof(SiteEventLogger),
                    "Event could not be recorded: the event logger has been disposed."));
        }

        return pending.Completion.Task;
    }

    /// <summary>
    /// Writer loop: reads queued events until the channel is completed and drained,
    /// inserting each one under the connection lock and completing or faulting the
    /// event's Task accordingly.
    /// </summary>
    private async Task ProcessWriteQueueAsync()
    {
        await foreach (var pending in _writeQueue.Reader.ReadAllAsync().ConfigureAwait(false))
        {
            try
            {
                var written = WithConnection(connection =>
                {
                    using var cmd = connection.CreateCommand();
                    cmd.CommandText = """
                        INSERT INTO site_events (timestamp, event_type, severity, instance_id, source, message, details)
                        VALUES ($timestamp, $event_type, $severity, $instance_id, $source, $message, $details)
                        """;
                    cmd.Parameters.AddWithValue("$timestamp", pending.Timestamp);
                    cmd.Parameters.AddWithValue("$event_type", pending.EventType);
                    cmd.Parameters.AddWithValue("$severity", pending.Severity);
                    cmd.Parameters.AddWithValue("$instance_id", (object?)pending.InstanceId ?? DBNull.Value);
                    cmd.Parameters.AddWithValue("$source", pending.Source);
                    cmd.Parameters.AddWithValue("$message", pending.Message);
                    cmd.Parameters.AddWithValue("$details", (object?)pending.Details ?? DBNull.Value);
                    cmd.ExecuteNonQuery();
                });

                if (written)
                {
                    pending.Completion.TrySetResult();
                }
                else
                {
                    // WithConnection returns false only when the logger has already
                    // been marked disposed — i.e. Dispose gave up waiting for the
                    // drain and closed the connection. The event was not persisted.
                    // Fault the Task (SiteEventLogging-012) instead of reporting
                    // false success for a dropped audit event.
                    pending.Completion.TrySetException(
                        new ObjectDisposedException(nameof(SiteEventLogger),
                            "Event could not be recorded: the event logger was disposed before the write completed."));
                }
            }
            catch (Exception ex)
            {
                // SiteEventLogging-008: a write failure must be observable. Count it
                // in FailedWriteCount and fault the caller's Task instead of silently
                // discarding the exception.
                Interlocked.Increment(ref _failedWriteCount);
                _logger.LogError(ex, "Failed to record event: {EventType} from {Source}",
                    pending.EventType, pending.Source);
                pending.Completion.TrySetException(ex);
            }
        }
    }

    /// <summary>
    /// Stops accepting new events, drains the write queue, and disposes the SQLite connection.
    /// </summary>
    /// <remarks>
    /// The order matters: the channel is completed first, the writer loop is then given
    /// up to five seconds to persist what is already queued, and only afterwards is the
    /// logger marked disposed and the connection closed. Marking it disposed before the
    /// drain would make every queued write hit the disposed check in
    /// <c>WithConnection</c> and be faulted instead of persisted. Events still queued
    /// when the wait times out are faulted by the writer loop. Safe to call more than
    /// once; only the first call does any work.
    /// </remarks>
    public void Dispose()
    {
        // TryComplete succeeds exactly once, so it doubles as the idempotency gate:
        // the first caller proceeds, any later (or concurrent) caller returns here.
        // From this point on LogEventAsync rejects new events.
        if (!_writeQueue.Writer.TryComplete())
        {
            return;
        }

        // Wait outside the lock — the loop itself acquires the lock for each write.
        try
        {
            _writerLoop.Wait(DrainTimeout);
        }
        catch (AggregateException)
        {
            // A faulted writer loop has already been logged per event; nothing more
            // to do during disposal.
        }

        lock (_writeLock)
        {
            // Set under the lock so no WithConnection caller can observe a disposed
            // connection: later callers see _disposed and bail out.
            _disposed = true;
            _connection.Dispose();
        }
    }

    /// <summary>An event awaiting persistence by the background writer.</summary>
    private sealed record PendingEvent(
        string Timestamp,
        string EventType,
        string Severity,
        string? InstanceId,
        string Source,
        string Message,
        string? Details)
    {
        /// <summary>Completes when the event has been durably persisted, or faults on write failure.</summary>
        public TaskCompletionSource Completion { get; } =
            new(TaskCreationOptions.RunContinuationsAsynchronously);
    }
}
@@ -0,0 +1,27 @@
<Project Sdk="Microsoft.NET.Sdk">
<PropertyGroup>
<TargetFramework>net10.0</TargetFramework>
<ImplicitUsings>enable</ImplicitUsings>
<Nullable>enable</Nullable>
<TreatWarningsAsErrors>true</TreatWarningsAsErrors>
</PropertyGroup>
<ItemGroup>
<PackageReference Include="Akka" />
<PackageReference Include="Microsoft.Data.Sqlite" />
<PackageReference Include="Microsoft.Extensions.DependencyInjection.Abstractions" />
<PackageReference Include="Microsoft.Extensions.Hosting.Abstractions" />
<PackageReference Include="Microsoft.Extensions.Logging.Abstractions" />
<PackageReference Include="Microsoft.Extensions.Options" />
</ItemGroup>
<ItemGroup>
<ProjectReference Include="../ZB.MOM.WW.ScadaBridge.Commons/ZB.MOM.WW.ScadaBridge.Commons.csproj" />
</ItemGroup>
<ItemGroup>
<InternalsVisibleTo Include="ZB.MOM.WW.ScadaBridge.SiteEventLogging.Tests" />
</ItemGroup>
</Project>