fix(api-surface): close Theme 9 — 27 naming / dead-code / config / hygiene findings
The largest themed batch — small mechanical fixes across 11 modules.
API / message hygiene:
- Comm-020: SiteAddressCacheLoaded now carries IReadOnlyDictionary /
IReadOnlyList — Akka messages must be immutable.
- Commons-016: BundleSession.MaxUnlockAttempts named constant replaces
magic 3.
- Commons-018: IOperationTrackingStore + IPartitionMaintenance moved from
Interfaces/ root to Interfaces/Services/ (namespace preserved — updating
the 9+ consumer files was too large a move for this change).
- Commons-023: TrackingStatusSnapshot.SourceNode now consistent with the
trailing-optional-with-default pattern used elsewhere.
- SR-022: AuditingDbCommand.DbConnection.set no longer uses reflection —
exposes AuditingDbConnection.Inner via internal API surface.
Dead code / config cleanup:
- ClusterInfra-011: decorative SectionName constant deleted.
- ClusterInfra-014: dead AddClusterInfrastructureActors method + its
"throws-when-called" test deleted.
- Host-021: Microsoft Logging:LogLevel block deleted from appsettings.json
(dead under Serilog).
Fail-loud over fail-silent:
- DM-021: ResolveSiteIdentifierAsync throws on missing site (was silently
substituting a DB id).
- DM-022: dropped transient Pending write — record now lands directly in
InProgress (no UI flicker, one fewer DB write).
- Host-020: LoggerConfigurationFactory emits a Console.Error warning when
both Serilog:MinimumLevel and ScadaLink:Logging:MinimumLevel are set
(ScadaLink remains truth per Host-011).
- SnF-022: NotifyCachedCallObserverAsync logs Warning on unparseable
TrackedOperationId (was silently dropping).
- SnF-023: empty siteId default replaced with $unknown-site sentinel
+ constructor normalisation.
Correctness:
- SCA-001: SupervisorStrategy XML doc rewritten to match actual
DefaultDecider/Restart semantics (was claiming Resume).
- SCA-003: OnUpsertAsync now restamps IngestedAtUtc on every upsert.
- SR-021: HandleDeployArtifacts now dispatches an internal
ApplyArtifactDataConnectionsToDcl message after the SQLite write so
system-wide artifact-deploy data-connection changes go live
immediately (was requiring a site restart).
- SnF-020: RetryParkedMessageAsync captures the parked row BEFORE the
local write so a concurrent delete can't skip standby replication.
Sentinels / naming collisions:
- HM-021: CentralSiteId changed from "central" to "$central"
(cannot collide — a leading $ is forbidden in real SiteIdentifiers).
Doc / surface cleanups:
- SEL-018: FailedWriteCount promoted to ISiteEventLogger; XML doc softened
to "Available for future Health Monitoring integration".
- SnF-019: VERIFY outcome — documented parking-after-DefaultMaxRetries
in Component-StoreAndForward.md + DefaultMaxRetries XML doc (uniform
cap; maxRetries:0 is the unbounded escape hatch).
- SnF-021: Component-StoreAndForward.md no longer claims the tracking
table lives in SnF — it's in SiteRuntime, the interface is in Commons.
- CLI-020: bundle export response parse guarded with try/catch on
JsonException / KeyNotFoundException / InvalidOperationException /
FormatException — emits a clean INVALID_RESPONSE exit instead of a
stack trace.
Config:
- ClusterInfra-013: intent comment added to "catastrophic config" test.
- Host-016: appsettings.Site.json second CentralContactPoints entry
removed (was pointing at the SITE's own port); doc-key explains how
to extend.
- Host-018: NodeName added to both shipped per-role configs (was
causing SourceNode to be null on audit rows).
UI:
- CentralUI-029: replaced JS.InvokeAsync<int>("eval", …) with an ES
module import (new wwwroot/js/browser-time.js).
- CentralUI-032: AuditResultsGrid gains a Previous button backed by a
cursor stack.
10+ new regression tests across the affected projects. Build clean;
all suites green. README regenerated: 6 open (was 33).
Session-to-date: 130 of 136 originally-open Theme findings closed.
This commit is contained in:
@@ -118,9 +118,33 @@ public static class BundleCommands
|
||||
timeout: BundleCommandTimeout,
|
||||
onSuccess: jsonOk =>
|
||||
{
|
||||
using var doc = JsonDocument.Parse(jsonOk);
|
||||
var base64 = doc.RootElement.GetProperty("base64Bundle").GetString()!;
|
||||
var byteCount = doc.RootElement.GetProperty("byteCount").GetInt32();
|
||||
// CLI-020: previously the JSON envelope parse and property extraction ran
// unguarded, so a server-side bug surfaced as an unhandled stack trace
// rather than the documented "exit 1 with a clean INVALID_RESPONSE error".
// Failure modes handled by the filter below:
//   * malformed JSON envelope                      -> JsonException
//   * one of the two expected properties missing   -> KeyNotFoundException
//   * wrong JSON value kind, or a JSON-null
//     base64Bundle (rejected explicitly below)     -> InvalidOperationException
//   * byteCount that does not fit in an Int32      -> FormatException
// Invalid base64 is reported by the separate guard around the streamed
// write further down. Matches the graceful-degradation theme established
// by CLI-002 / CLI-003 / CLI-005.
string base64;
int byteCount;
try
{
    using var doc = JsonDocument.Parse(jsonOk);
    var root = doc.RootElement;

    // GetString() returns null (it does not throw) when the value is a JSON
    // null, so reject that case here rather than handing a null string to
    // the base64 decoder.
    base64 = root.GetProperty("base64Bundle").GetString()
        ?? throw new InvalidOperationException("Property 'base64Bundle' is null.");
    byteCount = root.GetProperty("byteCount").GetInt32();
}
catch (Exception ex) when (ex is JsonException
                               or KeyNotFoundException
                               or InvalidOperationException
                               or FormatException)
{
    OutputFormatter.WriteError(
        $"Server returned a malformed bundle-export response: {ex.Message}",
        "INVALID_RESPONSE");
    return 1;
}
|
||||
|
||||
// CLI-019: stream the base64 → file write so a 100 MB bundle
|
||||
// doesn't double-buffer through Convert.FromBase64String's
|
||||
// ~100 MB byte[] on the LOH plus a synchronous File.WriteAllBytes.
|
||||
@@ -128,7 +152,18 @@ public static class BundleCommands
|
||||
// jsonOk string (wire-format limit), but the decode + write
|
||||
// are now chunked, so peak working-set drops from
|
||||
// ~base64+byte[]+envelope to ~base64+small-chunk.
|
||||
var written = StreamBase64ToFile(base64, output);
|
||||
long written;
|
||||
try
|
||||
{
|
||||
written = StreamBase64ToFile(base64, output);
|
||||
}
|
||||
catch (FormatException ex)
|
||||
{
|
||||
OutputFormatter.WriteError(
|
||||
$"Server returned invalid base64 in the bundle response: {ex.Message}",
|
||||
"INVALID_RESPONSE");
|
||||
return 1;
|
||||
}
|
||||
Console.WriteLine($"Wrote {written:N0} bytes to {output} (server reported {byteCount:N0}).");
|
||||
return 0;
|
||||
});
|
||||
|
||||
@@ -75,10 +75,21 @@
|
||||
|
||||
<div class="d-flex justify-content-between align-items-center">
|
||||
<span class="text-muted small">Page @_pageNumber · @_rows.Count rows</span>
|
||||
<button class="btn btn-outline-secondary btn-sm"
|
||||
data-test="grid-next-page"
|
||||
disabled="@(_loading || _rows.Count < _pageSize)"
|
||||
@onclick="NextPage">Next page</button>
|
||||
@* CentralUI-032: keyset paging is naturally forward-only, but the
|
||||
in-component _cursorStack lets the user step back through previous
|
||||
pages by replaying the prior cursor. The Previous button is gated
|
||||
on the stack having at least one prior cursor — i.e. we are not on
|
||||
the first page. *@
|
||||
<div class="btn-group">
|
||||
<button class="btn btn-outline-secondary btn-sm"
|
||||
data-test="grid-prev-page"
|
||||
disabled="@(_loading || !CanGoBack)"
|
||||
@onclick="PrevPage">Previous page</button>
|
||||
<button class="btn btn-outline-secondary btn-sm"
|
||||
data-test="grid-next-page"
|
||||
disabled="@(_loading || _rows.Count < _pageSize)"
|
||||
@onclick="NextPage">Next page</button>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
|
||||
@@ -66,6 +66,16 @@ public partial class AuditResultsGrid : IAsyncDisposable
|
||||
private bool _loading;
|
||||
private string? _error;
|
||||
|
||||
// CentralUI-032: small in-component stack of prior-page cursors so the user
|
||||
// can step backwards through results. Each Next push captures the cursor
|
||||
// that produced the current page (null for page 1) before advancing; each
|
||||
// Previous pop reloads the page at the recovered cursor. Mirrors the
|
||||
// SiteCallsReport keyset-paging shape called out in the finding.
|
||||
private readonly Stack<AuditLogPaging?> _cursorStack = new();
|
||||
// The cursor that produced the page currently on screen — kept so Next can
|
||||
// push it before advancing without recomputing it from _rows.
|
||||
private AuditLogPaging? _currentPaging;
|
||||
|
||||
private AuditLogQueryFilter? _activeFilter;
|
||||
|
||||
[Inject] private IJSRuntime JS { get; set; } = default!;
|
||||
@@ -196,6 +206,8 @@ public partial class AuditResultsGrid : IAsyncDisposable
|
||||
_activeFilter = Filter;
|
||||
_pageNumber = 1;
|
||||
_rows.Clear();
|
||||
_cursorStack.Clear();
|
||||
_currentPaging = null;
|
||||
if (Filter is not null)
|
||||
{
|
||||
await LoadAsync(paging: null);
|
||||
@@ -216,10 +228,36 @@ public partial class AuditResultsGrid : IAsyncDisposable
|
||||
AfterOccurredAtUtc: last.OccurredAtUtc,
|
||||
AfterEventId: last.EventId);
|
||||
|
||||
// CentralUI-032: remember the cursor that produced the current page so
|
||||
// a later Previous can navigate back to it. The page-1 entry is pushed
|
||||
// as null — LoadAsync treats null as "first page" (PageSize-only).
|
||||
_cursorStack.Push(_currentPaging);
|
||||
await LoadAsync(cursor);
|
||||
_pageNumber++;
|
||||
}
|
||||
|
||||
// CentralUI-032: steps back one page. The cursor on top of the stack is only
// peeked before the reload and is popped once LoadAsync completes without
// setting _error, so a failed page-fetch leaves the stack and the page number
// untouched and the user stays on the current page with the error banner.
private async Task PrevPage()
{
    // Nothing to do without an active query or when no prior cursor exists.
    if (_activeFilter is null || !_cursorStack.TryPeek(out var target))
    {
        return;
    }

    await LoadAsync(target);
    if (_error is not null)
    {
        return;
    }

    _cursorStack.Pop();
    _pageNumber = Math.Max(1, _pageNumber - 1);
}
|
||||
|
||||
private bool CanGoBack => _cursorStack.Count > 0;
|
||||
|
||||
private async Task LoadAsync(AuditLogPaging? paging)
|
||||
{
|
||||
if (_activeFilter is null)
|
||||
@@ -235,6 +273,9 @@ public partial class AuditResultsGrid : IAsyncDisposable
|
||||
var page = await QueryService.QueryAsync(_activeFilter, effective);
|
||||
_rows.Clear();
|
||||
_rows.AddRange(page);
|
||||
// Track the cursor that produced the page now on screen so a later
|
||||
// Next can push it onto the stack before advancing.
|
||||
_currentPaging = paging;
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
|
||||
@@ -245,14 +245,24 @@
|
||||
// same query param doesn't re-run the query on every parameter set.
|
||||
private Guid? _lastFetchedBundleImportId;
|
||||
|
||||
// CentralUI-029: the browser-time JS module that hosts getTimezoneOffsetMinutes().
|
||||
// Loaded lazily on first render via dynamic import; replaces the previous
|
||||
// `JS.InvokeAsync<int>("eval", "new Date().getTimezoneOffset()")` call, which
|
||||
// widened the JS-interop attack surface and was incompatible with strict CSP
|
||||
// `script-src` directives that forbid `unsafe-eval`.
|
||||
private const string BrowserTimeModulePath = "./_content/ScadaLink.CentralUI/js/browser-time.js";
|
||||
private IJSObjectReference? _browserTimeModule;
|
||||
|
||||
protected override async Task OnAfterRenderAsync(bool firstRender)
|
||||
{
|
||||
if (!firstRender) return;
|
||||
try
|
||||
{
|
||||
// Date.getTimezoneOffset() returns (UTC - local) in minutes.
|
||||
_browserUtcOffsetMinutes = await JS.InvokeAsync<int>(
|
||||
"eval", "new Date().getTimezoneOffset()");
|
||||
_browserTimeModule ??= await JS.InvokeAsync<IJSObjectReference>(
|
||||
"import", BrowserTimeModulePath);
|
||||
_browserUtcOffsetMinutes = await _browserTimeModule.InvokeAsync<int>(
|
||||
"getTimezoneOffsetMinutes");
|
||||
}
|
||||
catch (Exception ex) when (ex is JSException or JSDisconnectedException
|
||||
or InvalidOperationException or TaskCanceledException)
|
||||
|
||||
@@ -0,0 +1,13 @@
|
||||
// CentralUI-029: small JS module to replace the JS.InvokeAsync<int>("eval", ...)
|
||||
// anti-pattern previously used by ConfigurationAuditLog. Exporting a named
|
||||
// function from an ES module:
|
||||
// * removes the residual `eval` JS-interop surface,
|
||||
// * is CSP-friendly (no `unsafe-eval` directive required),
|
||||
// * matches the module-import pattern (`session-expiry.js`, `audit-grid.js`,
|
||||
// `nav-state.js`, `transport.js`) the rest of the Central UI follows.
|
||||
//
|
||||
// The function returns the same value as `new Date().getTimezoneOffset()` —
|
||||
// minutes of (UTC - local), positive for time zones west of UTC.
|
||||
/**
 * Reports the browser's timezone offset as given by
 * Date.prototype.getTimezoneOffset() on a freshly constructed Date.
 * @returns {number} Offset in minutes.
 */
export function getTimezoneOffsetMinutes() {
    const now = new Date();
    return now.getTimezoneOffset();
}
|
||||
@@ -20,11 +20,15 @@ namespace ScadaLink.ClusterInfrastructure;
|
||||
/// </summary>
|
||||
public class ClusterOptions
|
||||
{
|
||||
/// <summary>
|
||||
/// The <c>appsettings.json</c> section name this options class binds from.
|
||||
/// Single source of truth so binding sites do not hard-code the magic string.
|
||||
/// </summary>
|
||||
public const string SectionName = "ScadaLink:Cluster";
|
||||
// ClusterInfra-011: the previous `public const string SectionName = "ScadaLink:Cluster";`
|
||||
// was documented as "single source of truth so binding sites do not hard-code the
|
||||
// magic string" but no caller ever read it — the Host's SiteServiceRegistration and
|
||||
// StartupValidator both hard-code the literal directly. Wiring those binding sites
|
||||
// to reference the constant lives in the Host's edit scope (a separate code-review
|
||||
// task); rather than carry a public constant whose guarantee the code does not
|
||||
// deliver, the constant is removed and the literal stays in the Host until the
|
||||
// Host-side wiring is done. If a future Host change wants the constant back, add it
|
||||
// when the binding sites can be updated in the same commit.
|
||||
|
||||
/// <summary>
|
||||
/// Akka.NET cluster seed nodes. Both nodes are seed nodes — each node lists
|
||||
|
||||
@@ -30,20 +30,17 @@ public static class ServiceCollectionExtensions
|
||||
return services;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Reserved for cluster-infrastructure actor registration. This component does
|
||||
/// not register any actors — the Akka.NET bootstrap and actor wiring live in
|
||||
/// <c>ScadaLink.Host</c>. The method throws rather than silently returning
|
||||
/// success so that any caller assuming this component registers actors fails
|
||||
/// fast with a clear cause instead of failing later, far from here.
|
||||
/// </summary>
|
||||
/// <exception cref="NotImplementedException">Always thrown.</exception>
|
||||
/// <param name="services">The service collection (unused; method always throws).</param>
|
||||
public static IServiceCollection AddClusterInfrastructureActors(this IServiceCollection services)
|
||||
{
|
||||
throw new NotImplementedException(
|
||||
"ScadaLink.ClusterInfrastructure registers no actors. The Akka.NET actor system " +
|
||||
"bootstrap and all cluster actor registration live in ScadaLink.Host " +
|
||||
"(AkkaHostedService). Do not call AddClusterInfrastructureActors().");
|
||||
}
|
||||
// ClusterInfra-014: the previous `AddClusterInfrastructureActors` extension
|
||||
// was dead surface — its XML doc told callers "do not call", its body
|
||||
// unconditionally threw `NotImplementedException`, and no production caller
|
||||
// existed anywhere in the solution (verified by grep). The CI-002
|
||||
// "throw loudly" decision was made while CI-001's ownership question was
|
||||
// still open; that question is now permanently settled by the
|
||||
// "Implementation Note — Code Placement" section of
|
||||
// Component-ClusterInfrastructure.md, which records that all actor wiring
|
||||
// lives in ScadaLink.Host (AkkaHostedService). Keeping a public extension
|
||||
// method that exists only to throw was API-surface noise that an IDE would
|
||||
// still suggest via auto-complete, so the method and its companion
|
||||
// `AddClusterInfrastructureActors_ThrowsRatherThanSilentlySucceeding` test
|
||||
// were both removed.
|
||||
}
|
||||
|
||||
+6
@@ -1,5 +1,11 @@
|
||||
using ScadaLink.Commons.Types;
|
||||
|
||||
// Commons-018: physically lives under Interfaces/Services/ to match the
|
||||
// established subfolder convention (REQ-COM-5b), but the namespace stays
|
||||
// `ScadaLink.Commons.Interfaces` to avoid a cascading update to 9+ consumer
|
||||
// files across ScadaLink.SiteRuntime, ScadaLink.AuditLog and ScadaLink.Host.
|
||||
// Adopting the canonical `ScadaLink.Commons.Interfaces.Services` namespace
|
||||
// can be picked up alongside any future Commons-wide namespace tidy-up.
|
||||
namespace ScadaLink.Commons.Interfaces;
|
||||
|
||||
/// <summary>
|
||||
+6
@@ -1,3 +1,9 @@
|
||||
// Commons-018: physically lives under Interfaces/Services/ to match the
|
||||
// established subfolder convention (REQ-COM-5b), but the namespace stays
|
||||
// `ScadaLink.Commons.Interfaces` to avoid a cascading update to consumers
|
||||
// across ScadaLink.AuditLog and ScadaLink.ConfigurationDatabase. Adopting
|
||||
// the canonical `ScadaLink.Commons.Interfaces.Services` namespace can be
|
||||
// picked up alongside any future Commons-wide namespace tidy-up.
|
||||
namespace ScadaLink.Commons.Interfaces;
|
||||
|
||||
/// <summary>
|
||||
@@ -29,6 +29,11 @@ namespace ScadaLink.Commons.Types;
|
||||
/// Cluster node that submitted the cached call (e.g. <c>"node-a"</c> /
|
||||
/// <c>"node-b"</c>), captured at enqueue time. Null on rows persisted before
|
||||
/// the SourceNode stamping migration; stamping itself is wired in a later task.
|
||||
/// Commons-023: trailing-optional with a <c>= null</c> default, matching the
|
||||
/// SourceNode rollout convention now used on <c>SiteCallSummary</c>,
|
||||
/// <c>SiteCallDetail</c>, <c>NotificationSummary</c> and <c>NotificationDetail</c>
|
||||
/// — so existing positional construction sites keep compiling as new
|
||||
/// optional fields land on this record.
|
||||
/// </param>
|
||||
public sealed record TrackingStatusSnapshot(
|
||||
TrackedOperationId Id,
|
||||
@@ -43,4 +48,4 @@ public sealed record TrackingStatusSnapshot(
|
||||
DateTime? TerminalAtUtc,
|
||||
string? SourceInstanceId,
|
||||
string? SourceScript,
|
||||
string? SourceNode);
|
||||
string? SourceNode = null);
|
||||
|
||||
@@ -2,6 +2,16 @@ namespace ScadaLink.Commons.Types.Transport;
|
||||
|
||||
public sealed class BundleSession
|
||||
{
|
||||
/// <summary>
|
||||
/// Commons-016: legacy per-session lockout threshold (kept on this type for the
|
||||
/// shim <see cref="Locked"/> getter). The authoritative, server-side per-bundle
|
||||
/// counter is bounded by <c>TransportOptions.MaxUnlockAttemptsPerSession</c>
|
||||
/// (default also <c>3</c>) and is what <c>BundleImporter.LoadAsync</c> consults.
|
||||
/// This constant exists so the comparison in <see cref="Locked"/> uses a named
|
||||
/// symbol that a security review can grep for, rather than a literal <c>3</c>.
|
||||
/// </summary>
|
||||
public const int MaxUnlockAttempts = 3;
|
||||
|
||||
/// <summary>Unique identifier for this import session.</summary>
|
||||
public Guid SessionId { get; init; }
|
||||
/// <summary>Parsed manifest from the uploaded bundle.</summary>
|
||||
@@ -22,6 +32,7 @@ public sealed class BundleSession
|
||||
/// <summary>
|
||||
/// T-003 legacy: always <c>false</c> on a session returned by <c>LoadAsync</c>
|
||||
/// because lockout enforcement moved server-side; see <see cref="FailedUnlockAttempts"/>.
|
||||
/// The threshold is the named <see cref="MaxUnlockAttempts"/> constant (default 3).
|
||||
/// </summary>
|
||||
public bool Locked => FailedUnlockAttempts >= 3;
|
||||
public bool Locked => FailedUnlockAttempts >= MaxUnlockAttempts;
|
||||
}
|
||||
|
||||
@@ -410,7 +410,14 @@ public class CentralCommunicationActor : ReceiveActor
|
||||
contacts[site.SiteIdentifier] = addrs;
|
||||
}
|
||||
|
||||
return new SiteAddressCacheLoaded(contacts);
|
||||
// Communication-020: freeze the cross-task payload before piping to
|
||||
// Self. The message record exposes read-only types (
|
||||
// IReadOnlyDictionary / IReadOnlyList) so the Akka.NET message-
|
||||
// immutability convention is enforced by type, not just convention.
|
||||
var frozen = contacts.ToDictionary(
|
||||
kvp => kvp.Key,
|
||||
kvp => (IReadOnlyList<string>)kvp.Value.AsReadOnly());
|
||||
return new SiteAddressCacheLoaded(frozen);
|
||||
}).PipeTo(self);
|
||||
}
|
||||
|
||||
@@ -540,8 +547,14 @@ public record RefreshSiteAddresses;
|
||||
/// <summary>
|
||||
/// Internal message carrying the loaded site contact data from the database.
|
||||
/// ClusterClient creation happens on the actor thread in HandleSiteAddressCacheLoaded.
|
||||
///
|
||||
/// Communication-020: the payload is exposed as <see cref="IReadOnlyDictionary{TKey,TValue}"/>
|
||||
/// of <see cref="IReadOnlyList{T}"/> so the Akka.NET "messages are immutable"
|
||||
/// convention is enforced at the type level rather than relying on producer
|
||||
/// discipline. The producer wraps the constructed buckets with
|
||||
/// <c>List&lt;T&gt;.AsReadOnly()</c> before piping to Self.
|
||||
/// </summary>
|
||||
internal record SiteAddressCacheLoaded(Dictionary<string, List<string>> SiteContacts);
|
||||
internal record SiteAddressCacheLoaded(IReadOnlyDictionary<string, IReadOnlyList<string>> SiteContacts);
|
||||
|
||||
/// <summary>
|
||||
/// Notification sent to debug view subscribers when the stream is terminated
|
||||
|
||||
@@ -103,11 +103,26 @@ public class DeploymentService
|
||||
/// <summary>
|
||||
/// Resolves the site's string identifier from the numeric DB ID.
|
||||
/// The communication layer routes by string identifier (e.g. "site-a"), not DB ID.
|
||||
///
|
||||
/// DeploymentManager-021: when the <see cref="Site"/> row is missing (FK was
|
||||
/// deleted, race with admin delete, DB inconsistency) the previous behaviour
|
||||
/// silently substituted the numeric id rendered as a string — every
|
||||
/// downstream <c>CommunicationService</c> call then failed with a confusing
|
||||
/// "unknown site" routing error that hid the real cause. Treat a missing
|
||||
/// site row as a hard validation failure: throw
|
||||
/// <see cref="InvalidOperationException"/> naming the unresolved id so the
|
||||
/// operator sees the actual problem. On the deploy path the existing
|
||||
/// try/catch turns this into a Failed deployment record with a clear
|
||||
/// message; lifecycle paths propagate it to the caller (CLI/UI) which
|
||||
/// surface it as an error to the operator.
|
||||
/// </summary>
|
||||
private async Task<string> ResolveSiteIdentifierAsync(int siteId, CancellationToken cancellationToken)
|
||||
{
|
||||
var site = await _siteRepository.GetSiteByIdAsync(siteId, cancellationToken);
|
||||
return site?.SiteIdentifier ?? siteId.ToString();
|
||||
if (site == null)
|
||||
throw new InvalidOperationException(
|
||||
$"Site with ID {siteId} not found; cannot resolve its SiteIdentifier for routing.");
|
||||
return site.SiteIdentifier;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
@@ -174,11 +189,23 @@ public class DeploymentService
|
||||
if (reconciled != null)
|
||||
return Result<DeploymentRecord>.Success(reconciled);
|
||||
|
||||
// WP-4: Create deployment record with Pending status
|
||||
// WP-4: Create the deployment record directly in InProgress.
|
||||
//
|
||||
// DeploymentManager-022: the previous code wrote the record as Pending,
|
||||
// then immediately updated it to InProgress with no work in between
|
||||
// (flattening, validation, and reconciliation all completed above). The
|
||||
// back-to-back write cost an extra SaveChangesAsync round-trip, an
|
||||
// extra IDeploymentStatusNotifier push (CentralUI-006 rendered a
|
||||
// Pending→InProgress flicker lasting a few milliseconds), and an extra row-version bump
|
||||
// for nothing. The transient Pending slot carried no operational
|
||||
// meaning — it was set and immediately overwritten — so dropping it
|
||||
// collapses the start of the deploy into a single insert + notify.
|
||||
// InProgress remains the documented "sent to site, awaiting response"
|
||||
// state, set immediately before the round-trip below.
|
||||
var record = new DeploymentRecord(deploymentId, user)
|
||||
{
|
||||
InstanceId = instanceId,
|
||||
Status = DeploymentStatus.Pending,
|
||||
Status = DeploymentStatus.InProgress,
|
||||
RevisionHash = revisionHash,
|
||||
DeployedAt = DateTimeOffset.UtcNow
|
||||
};
|
||||
@@ -187,12 +214,6 @@ public class DeploymentService
|
||||
await _repository.SaveChangesAsync(cancellationToken);
|
||||
NotifyStatusChange(record);
|
||||
|
||||
// Update status to InProgress
|
||||
record.Status = DeploymentStatus.InProgress;
|
||||
await _repository.UpdateDeploymentRecordAsync(record, cancellationToken);
|
||||
await _repository.SaveChangesAsync(cancellationToken);
|
||||
NotifyStatusChange(record);
|
||||
|
||||
try
|
||||
{
|
||||
// WP-1: Send to site via CommunicationService
|
||||
|
||||
@@ -18,8 +18,22 @@ public class CentralHealthReportLoop : BackgroundService
|
||||
/// <summary>
|
||||
/// Reserved siteId used to represent the central cluster in the
|
||||
/// shared CentralHealthAggregator keyspace.
|
||||
///
|
||||
/// HealthMonitoring-021: the value is prefixed with <c>$</c> — a character
|
||||
/// that is forbidden in real site identifiers (the configuration /
|
||||
/// repository layer only permits Sites whose <c>SiteIdentifier</c> is a
|
||||
/// plain identifier) — so the synthetic central entry cannot collide with
|
||||
/// a real site whose operator-set identifier happened to be the bare word
|
||||
/// "central". A collision would have caused the two reports to clobber
|
||||
/// each other in the aggregator keyspace via the sequence-number guard,
|
||||
/// and the real site would inherit the longer
|
||||
/// <see cref="HealthMonitoringOptions.CentralOfflineTimeout"/> grace and
|
||||
/// stay falsely-online for an extra two minutes after going down.
|
||||
/// Consumers (<see cref="CentralHealthAggregator.CheckForOfflineSites"/>,
|
||||
/// the Central UI health dashboard) reference this constant rather than
|
||||
/// the literal string, so the change is local.
|
||||
/// </summary>
|
||||
public const string CentralSiteId = "central";
|
||||
public const string CentralSiteId = "$central";
|
||||
|
||||
private readonly ISiteHealthCollector _collector;
|
||||
private readonly ICentralHealthAggregator _aggregator;
|
||||
|
||||
@@ -15,6 +15,15 @@ namespace ScadaLink.Host;
|
||||
/// set, console output template, file path and rolling interval are all
|
||||
/// configuration-driven (defined in <c>appsettings.json</c>), not hard-coded. The
|
||||
/// explicit <c>MinimumLevel.Is</c> below pins the floor from <see cref="LoggingOptions"/>.
|
||||
///
|
||||
/// Host-020: <c>ScadaLink:Logging:MinimumLevel</c> is the single source of truth
|
||||
/// for the floor — the explicit <c>MinimumLevel.Is</c> call deliberately runs
|
||||
/// AFTER <c>ReadFrom.Configuration</c> so a <c>Serilog:MinimumLevel</c> entry in
|
||||
/// configuration is overridden. To make that precedence visible (so an operator
|
||||
/// who sets <c>Serilog:MinimumLevel</c> does not wonder why the change had no
|
||||
/// effect), <see cref="Build"/> writes a one-shot warning to
|
||||
/// <see cref="Console.Error"/> when both keys are present. Pick one path —
|
||||
/// editing <c>Serilog:MinimumLevel</c> alone has no effect.
|
||||
/// </summary>
|
||||
public static class LoggerConfigurationFactory
|
||||
{
|
||||
@@ -29,11 +38,47 @@ public static class LoggerConfigurationFactory
|
||||
string nodeRole,
|
||||
string siteId,
|
||||
string nodeHostname)
|
||||
=> Build(configuration, nodeRole, siteId, nodeHostname, Console.Error);
|
||||
|
||||
/// <summary>
|
||||
/// Test-visible overload of <see cref="Build(IConfiguration, string, string, string)"/>
|
||||
/// that routes the Host-020 precedence warning through a caller-supplied
|
||||
/// writer so unit tests can capture it. Production calls the four-arg
|
||||
/// overload which uses <see cref="Console.Error"/>.
|
||||
/// </summary>
|
||||
/// <param name="configuration">Application configuration supplying the Serilog section and logging options.</param>
|
||||
/// <param name="nodeRole">Role label added as a log enrichment property.</param>
|
||||
/// <param name="siteId">Site identifier added as a log enrichment property.</param>
|
||||
/// <param name="nodeHostname">Hostname added as a log enrichment property.</param>
|
||||
/// <param name="warningWriter">Writer that receives the one-shot Host-020 override-warning when both keys are present.</param>
|
||||
internal static LoggerConfiguration Build(
|
||||
IConfiguration configuration,
|
||||
string nodeRole,
|
||||
string siteId,
|
||||
string nodeHostname,
|
||||
TextWriter warningWriter)
|
||||
{
|
||||
var loggingOptions = new LoggingOptions();
|
||||
configuration.GetSection("ScadaLink:Logging").Bind(loggingOptions);
|
||||
|
||||
var minimumLevel = ParseLevel(loggingOptions.MinimumLevel);
|
||||
var minimumLevel = ParseLevel(loggingOptions.MinimumLevel, warningWriter);
|
||||
|
||||
// Host-020: warn once if the operator also set a Serilog:MinimumLevel —
|
||||
// they almost certainly expected it to take effect, but the explicit
|
||||
// MinimumLevel.Is call below silently overrides it. The warning is
|
||||
// emitted only when the conflicting key is actually present (a bare
|
||||
// "Default" value is what ReadFrom.Configuration reads); a missing /
|
||||
// empty Serilog:MinimumLevel section is silent.
|
||||
var serilogMinimumLevel = configuration["Serilog:MinimumLevel"]
|
||||
?? configuration["Serilog:MinimumLevel:Default"];
|
||||
if (!string.IsNullOrWhiteSpace(serilogMinimumLevel))
|
||||
{
|
||||
warningWriter.WriteLine(
|
||||
$"warning: Serilog:MinimumLevel ('{serilogMinimumLevel}') is being overridden by " +
|
||||
$"ScadaLink:Logging:MinimumLevel ('{loggingOptions.MinimumLevel ?? "Information (default)"}'). " +
|
||||
"ScadaLink:Logging:MinimumLevel is the documented source of truth for the floor (Host-011); " +
|
||||
"remove the Serilog:MinimumLevel entry to silence this warning.");
|
||||
}
|
||||
|
||||
return new LoggerConfiguration()
|
||||
.ReadFrom.Configuration(configuration)
|
||||
|
||||
@@ -1,9 +1,11 @@
|
||||
{
|
||||
"ScadaLink": {
|
||||
"_nodeName": "Host-018: NodeName stamps SourceNode on AuditLog/Notifications/SiteCalls rows (CLAUDE.md 'Centralized Audit Log' decision) and backs IX_AuditLog_Node_Occurred. Convention: 'central-a'/'central-b' for central nodes, 'node-a'/'node-b' for site nodes. Override per-node in multi-node deployments (the docker per-node configs do this). When left at the default below, single-node dev rows are stamped with 'central-a'; an empty value normalises to a NULL SourceNode.",
|
||||
"Node": {
|
||||
"Role": "Central",
|
||||
"NodeHostname": "localhost",
|
||||
"RemotingPort": 8081
|
||||
"RemotingPort": 8081,
|
||||
"NodeName": "central-a"
|
||||
},
|
||||
"Cluster": {
|
||||
"SeedNodes": [
|
||||
|
||||
@@ -1,11 +1,13 @@
|
||||
{
|
||||
"ScadaLink": {
|
||||
"_nodeName": "Host-018: NodeName stamps SourceNode on AuditLog/Notifications/SiteCalls rows (CLAUDE.md 'Centralized Audit Log' decision) and backs IX_AuditLog_Node_Occurred. Convention: 'node-a'/'node-b' for site nodes, 'central-a'/'central-b' for central nodes. Override per-node in multi-node deployments (the docker per-node configs do this). When left at the default below, single-node dev rows are stamped with 'node-a'; an empty value normalises to a NULL SourceNode.",
|
||||
"Node": {
|
||||
"Role": "Site",
|
||||
"NodeHostname": "localhost",
|
||||
"SiteId": "site-a",
|
||||
"RemotingPort": 8082,
|
||||
"GrpcPort": 8083
|
||||
"GrpcPort": 8083,
|
||||
"NodeName": "node-a"
|
||||
},
|
||||
"Cluster": {
|
||||
"SeedNodes": [
|
||||
@@ -31,9 +33,9 @@
|
||||
"ReplicationEnabled": true
|
||||
},
|
||||
"Communication": {
|
||||
"_centralContactPoints": "Host-016: each entry MUST be a central node's remoting endpoint, NOT this site's own remoting port. The single dev-loopback default below points only at central-a (localhost:8081). In a multi-central deployment add the second central node here (e.g. 'akka.tcp://scadalink@central-b-host:8081') so ClusterClient can fail over when central-a is down. The previous template listed localhost:8082 as the second contact — that is THIS site's own RemotingPort and is a permanent failure in the initial-contact rotation.",
|
||||
"CentralContactPoints": [
|
||||
"akka.tcp://scadalink@localhost:8081",
|
||||
"akka.tcp://scadalink@localhost:8082"
|
||||
"akka.tcp://scadalink@localhost:8081"
|
||||
],
|
||||
"DeploymentTimeout": "00:02:00",
|
||||
"LifecycleTimeout": "00:00:30",
|
||||
|
||||
@@ -1,9 +1,5 @@
|
||||
{
|
||||
"Logging": {
|
||||
"LogLevel": {
|
||||
"Default": "Information"
|
||||
}
|
||||
},
|
||||
"_logging": "Host-021: Serilog is the sole logger provider (Program.cs calls builder.Host.UseSerilog()), so the standard Microsoft 'Logging:LogLevel' block has no effect and was removed. The minimum level is set via 'ScadaLink:Logging:MinimumLevel' (bound to LoggingOptions per Host-011); sinks are defined under the 'Serilog' section below and applied via ReadFrom.Configuration (Host-014). See LoggerConfigurationFactory + Component-Host.md REQ-HOST-8.",
|
||||
"Serilog": {
|
||||
"Using": [
|
||||
"Serilog.Sinks.Console",
|
||||
|
||||
@@ -36,9 +36,18 @@ namespace ScadaLink.SiteCallAudit;
|
||||
/// Per CLAUDE.md "audit-write failure NEVER aborts the user-facing action" —
|
||||
/// the actor catches every exception from the repository call and replies
|
||||
/// <c>Accepted=false</c> without rethrowing, so the central singleton stays
|
||||
/// alive. The <see cref="SupervisorStrategy"/> uses <c>Resume</c> so an
|
||||
/// unexpected throw before the catch (defence in depth) does not restart the
|
||||
/// actor and reset in-flight state.
|
||||
/// alive. The <see cref="SupervisorStrategy"/> override governs the actor's
|
||||
/// <em>children</em>, not the actor itself; this actor has no children today,
|
||||
/// so the override is currently inert. It returns a one-for-one strategy with
|
||||
/// <see cref="Akka.Actor.SupervisorStrategy.DefaultDecider"/> (Restart on most
|
||||
/// exceptions, Stop on <see cref="ActorInitializationException"/> /
|
||||
/// <see cref="ActorKilledException"/>) and <c>maxNrOfRetries: 0</c>, so any
|
||||
/// future child that throws is Stopped on the first failure — a deliberate
|
||||
/// "fail loudly" posture for the central singleton's eventual sub-actors
|
||||
/// (reconciliation puller, purge scheduler). Self-supervision of this actor
|
||||
/// is whatever the parent <see cref="Akka.Cluster.Tools.Singleton.ClusterSingletonManager"/>
|
||||
/// supplies; the in-handler <c>try/catch</c> in <see cref="OnUpsertAsync"/>
|
||||
/// is what actually keeps the singleton alive across repository faults.
|
||||
/// </para>
|
||||
/// <para>
|
||||
/// Two constructors exist for the same reason as
|
||||
@@ -147,7 +156,18 @@ public class SiteCallAuditActor : ReceiveActor
|
||||
Receive<DiscardSiteCallRequest>(HandleDiscardSiteCall);
|
||||
}
|
||||
|
||||
/// <inheritdoc />
|
||||
/// <summary>
|
||||
/// SiteCallAudit-001: child supervision strategy — governs children, not this
|
||||
/// actor. The actor has no children today, so this override is inert; it
|
||||
/// returns a one-for-one strategy with the framework
|
||||
/// <see cref="Akka.Actor.SupervisorStrategy.DefaultDecider"/> (Restart on
|
||||
/// most exceptions; Stop on <see cref="ActorInitializationException"/> /
|
||||
/// <see cref="ActorKilledException"/>) and <c>maxNrOfRetries: 0</c>, so any
|
||||
/// future child that throws is Stopped on the first failure. The actor's
|
||||
/// own resilience comes from the <c>try/catch</c> in <see cref="OnUpsertAsync"/>
|
||||
/// plus the parent <see cref="Akka.Cluster.Tools.Singleton.ClusterSingletonManager"/>'s
|
||||
/// supervision — not from this override.
|
||||
/// </summary>
|
||||
protected override SupervisorStrategy SupervisorStrategy()
|
||||
{
|
||||
return new OneForOneStrategy(maxNrOfRetries: 0, withinTimeRange: TimeSpan.Zero, decider:
|
||||
@@ -179,7 +199,14 @@ public class SiteCallAuditActor : ReceiveActor
|
||||
|
||||
try
|
||||
{
|
||||
await repository.UpsertAsync(cmd.SiteCall).ConfigureAwait(false);
|
||||
// SiteCallAudit-003: stamp IngestedAtUtc at central-side persist
|
||||
// time on every upsert, mirroring AuditLogIngestActor's combined-
|
||||
// telemetry hot path. IngestedAtUtc is the "central ingested (or
|
||||
// last refreshed) this row" timestamp; callers (telemetry,
|
||||
// future reconciliation puller, direct-writes) cannot in general
|
||||
// know they are running on central, so the actor owns the stamp.
|
||||
var siteCall = cmd.SiteCall with { IngestedAtUtc = DateTime.UtcNow };
|
||||
await repository.UpsertAsync(siteCall).ConfigureAwait(false);
|
||||
replyTo.Tell(new UpsertSiteCallReply(id, Accepted: true));
|
||||
}
|
||||
catch (Exception ex)
|
||||
|
||||
@@ -27,4 +27,14 @@ public interface ISiteEventLogger
|
||||
string source,
|
||||
string message,
|
||||
string? details = null);
|
||||
|
||||
/// <summary>
|
||||
/// SiteEventLogging-018: total number of event writes that have failed
|
||||
/// (SQLite error, disk full, bounded-queue overflow drop, etc.) since this
|
||||
/// logger was created. Available for future Health Monitoring integration —
|
||||
/// promoted onto the interface so a Health consumer can read it without a
|
||||
/// concrete-type downcast. Not yet polled by Health Monitoring; the wiring
|
||||
/// is tracked separately.
|
||||
/// </summary>
|
||||
long FailedWriteCount { get; }
|
||||
}
|
||||
|
||||
@@ -90,9 +90,15 @@ public class SiteEventLogger : ISiteEventLogger, IDisposable
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Number of event writes that have failed (SQLite error, disk full, etc.)
|
||||
/// since this logger was created. Surfaced so Health Monitoring can detect a
|
||||
/// logging outage instead of relying on a local log line nobody is watching.
|
||||
/// SiteEventLogging-018: number of event writes that have failed (SQLite
|
||||
/// error, disk full, bounded-queue overflow drop, etc.) since this logger
|
||||
/// was created. Available for future Health Monitoring integration — the
|
||||
/// counter is correct and observable, but the central health-metric
|
||||
/// pipeline does not yet poll it, so a sustained non-zero value currently
|
||||
/// goes unnoticed in production beyond the per-failure log line. Wiring
|
||||
/// the metric into the 30-second site-metric publish is tracked
|
||||
/// separately; promoted to <see cref="ISiteEventLogger"/> so the eventual
|
||||
/// consumer reads it without a concrete-type downcast.
|
||||
/// </summary>
|
||||
public long FailedWriteCount => Interlocked.Read(ref _failedWriteCount);
|
||||
|
||||
|
||||
@@ -132,6 +132,11 @@ public class DeploymentManagerActor : ReceiveActor, IWithTimers
|
||||
// WP-33: Handle system-wide artifact deployment
|
||||
Receive<DeployArtifactsCommand>(HandleDeployArtifacts);
|
||||
|
||||
// SiteRuntime-021: artifact-deploy DCL push, dispatched back from the
|
||||
// off-thread persistence task so the hash-cache mutation stays
|
||||
// actor-thread-confined.
|
||||
Receive<ApplyArtifactDataConnectionsToDcl>(HandleApplyArtifactDataConnectionsToDcl);
|
||||
|
||||
// Debug View — route to Instance Actors
|
||||
Receive<SubscribeDebugViewRequest>(RouteDebugViewSubscribe);
|
||||
Receive<UnsubscribeDebugViewRequest>(RouteDebugViewUnsubscribe);
|
||||
@@ -642,23 +647,12 @@ public class DeploymentManagerActor : ReceiveActor, IWithTimers
|
||||
|
||||
foreach (var (name, connConfig) in config.Connections)
|
||||
{
|
||||
var configHash = ComputeConnectionConfigHash(connConfig);
|
||||
if (_createdConnections.TryGetValue(name, out var lastHash) && lastHash == configHash)
|
||||
continue;
|
||||
|
||||
var primaryDetails = FlattenConnectionConfig(connConfig.Protocol, connConfig.ConfigurationJson);
|
||||
var backupDetails = string.IsNullOrEmpty(connConfig.BackupConfigurationJson)
|
||||
? null
|
||||
: FlattenConnectionConfig(connConfig.Protocol, connConfig.BackupConfigurationJson);
|
||||
|
||||
_dclManager.Tell(new Commons.Messages.DataConnection.CreateConnectionCommand(
|
||||
name, connConfig.Protocol, primaryDetails, backupDetails, connConfig.FailoverRetryCount));
|
||||
|
||||
var changed = _createdConnections.ContainsKey(name);
|
||||
_createdConnections[name] = configHash;
|
||||
_logger.LogInformation(
|
||||
"{Action} DCL connection {Connection} (protocol={Protocol})",
|
||||
changed ? "Updated" : "Created", name, connConfig.Protocol);
|
||||
EnsureDclConnection(
|
||||
name,
|
||||
connConfig.Protocol,
|
||||
connConfig.ConfigurationJson,
|
||||
connConfig.BackupConfigurationJson,
|
||||
connConfig.FailoverRetryCount);
|
||||
}
|
||||
}
|
||||
catch (Exception ex)
|
||||
@@ -667,20 +661,78 @@ public class DeploymentManagerActor : ReceiveActor, IWithTimers
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// SiteRuntime-021: hash-guarded DCL connection push shared by the inline
|
||||
/// per-instance path (<see cref="EnsureDclConnections(string)"/>) and the
|
||||
/// system-wide artifact-deploy path (<see cref="HandleDeployArtifacts"/>).
|
||||
/// Unchanged config is a no-op; a changed endpoint/credentials/backup/
|
||||
/// failover-count re-issues a <c>CreateConnectionCommand</c> so a system-
|
||||
/// wide artifact-deploy makes its data-connection change live immediately
|
||||
/// (the artifact-deploy path previously only persisted to SQLite — the
|
||||
/// DCL didn't see the change until next instance redeploy or node
|
||||
/// restart, contradicting the "site is self-contained after artifact
|
||||
/// deployment" intent).
|
||||
/// </summary>
|
||||
private void EnsureDclConnection(
|
||||
string name,
|
||||
string protocol,
|
||||
string? primaryConfigurationJson,
|
||||
string? backupConfigurationJson,
|
||||
int failoverRetryCount)
|
||||
{
|
||||
if (_dclManager == null) return;
|
||||
|
||||
var configHash = ComputeConnectionConfigHashCore(
|
||||
protocol, primaryConfigurationJson, backupConfigurationJson, failoverRetryCount);
|
||||
if (_createdConnections.TryGetValue(name, out var lastHash) && lastHash == configHash)
|
||||
return;
|
||||
|
||||
var primaryDetails = FlattenConnectionConfig(protocol, primaryConfigurationJson);
|
||||
var backupDetails = string.IsNullOrEmpty(backupConfigurationJson)
|
||||
? null
|
||||
: FlattenConnectionConfig(protocol, backupConfigurationJson);
|
||||
|
||||
_dclManager.Tell(new Commons.Messages.DataConnection.CreateConnectionCommand(
|
||||
name, protocol, primaryDetails, backupDetails, failoverRetryCount));
|
||||
|
||||
var changed = _createdConnections.ContainsKey(name);
|
||||
_createdConnections[name] = configHash;
|
||||
_logger.LogInformation(
|
||||
"{Action} DCL connection {Connection} (protocol={Protocol})",
|
||||
changed ? "Updated" : "Created", name, protocol);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Computes a stable hash over the configuration fields that affect how the DCL
|
||||
/// connects, so a changed endpoint/credential/backup/failover count is detected
|
||||
/// (SiteRuntime-010).
|
||||
/// </summary>
|
||||
private static string ComputeConnectionConfigHash(
|
||||
Commons.Types.Flattening.ConnectionConfig connConfig)
|
||||
Commons.Types.Flattening.ConnectionConfig connConfig) =>
|
||||
ComputeConnectionConfigHashCore(
|
||||
connConfig.Protocol,
|
||||
connConfig.ConfigurationJson,
|
||||
connConfig.BackupConfigurationJson,
|
||||
connConfig.FailoverRetryCount);
|
||||
|
||||
/// <summary>
|
||||
/// SiteRuntime-021: field-based core so the system-wide artifact-deploy
|
||||
/// path (which carries protocol/config-json/backup-json/failover directly
|
||||
/// on <see cref="Commons.Messages.Artifacts.DataConnectionArtifact"/>) can
|
||||
/// share the same hash + skip-or-resend logic as the inline-config path.
|
||||
/// </summary>
|
||||
private static string ComputeConnectionConfigHashCore(
|
||||
string protocol,
|
||||
string? primaryConfigurationJson,
|
||||
string? backupConfigurationJson,
|
||||
int failoverRetryCount)
|
||||
{
|
||||
var material = string.Join(
|
||||
"",
|
||||
connConfig.Protocol,
|
||||
connConfig.ConfigurationJson ?? string.Empty,
|
||||
connConfig.BackupConfigurationJson ?? string.Empty,
|
||||
connConfig.FailoverRetryCount.ToString());
|
||||
"",
|
||||
protocol,
|
||||
primaryConfigurationJson ?? string.Empty,
|
||||
backupConfigurationJson ?? string.Empty,
|
||||
failoverRetryCount.ToString(System.Globalization.CultureInfo.InvariantCulture));
|
||||
|
||||
var bytes = System.Security.Cryptography.SHA256.HashData(
|
||||
System.Text.Encoding.UTF8.GetBytes(material));
|
||||
@@ -983,6 +1035,20 @@ public class DeploymentManagerActor : ReceiveActor, IWithTimers
|
||||
dc.Name, dc.Protocol, dc.PrimaryConfigurationJson,
|
||||
dc.BackupConfigurationJson, dc.FailoverRetryCount);
|
||||
}
|
||||
|
||||
// SiteRuntime-021: after the SQLite store, dispatch an
|
||||
// internal message back to the actor thread so the DCL
|
||||
// push runs through EnsureDclConnection — keeping the
|
||||
// _createdConnections hash cache mutation actor-thread-
|
||||
// confined while still making the change live immediately
|
||||
// (previously the change landed in SQLite but the DCL
|
||||
// kept using the stale connection until next instance
|
||||
// redeploy or node restart, contradicting "site is
|
||||
// self-contained after artifact deployment"). The
|
||||
// helper's hash cache skips unchanged definitions, so
|
||||
// the push is idempotent for re-deploys of the same
|
||||
// artifact bundle.
|
||||
Self.Tell(new ApplyArtifactDataConnectionsToDcl(command.DataConnections));
|
||||
}
|
||||
|
||||
// Store SMTP configurations
|
||||
@@ -1044,6 +1110,27 @@ public class DeploymentManagerActor : ReceiveActor, IWithTimers
|
||||
_logger.LogDebug("Created Instance Actor for {Instance}", instanceName);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// SiteRuntime-021: actor-thread handler that pushes artifact-deploy data
|
||||
/// connection definitions to the DCL via the shared
|
||||
/// <see cref="EnsureDclConnection"/> helper. Dispatched from
|
||||
/// <see cref="HandleDeployArtifacts"/>'s off-thread Task so the
|
||||
/// <see cref="_createdConnections"/> hash-cache mutation stays
|
||||
/// actor-thread-confined.
|
||||
/// </summary>
|
||||
private void HandleApplyArtifactDataConnectionsToDcl(ApplyArtifactDataConnectionsToDcl msg)
|
||||
{
|
||||
foreach (var dc in msg.DataConnections)
|
||||
{
|
||||
EnsureDclConnection(
|
||||
dc.Name,
|
||||
dc.Protocol,
|
||||
dc.PrimaryConfigurationJson,
|
||||
dc.BackupConfigurationJson,
|
||||
dc.FailoverRetryCount);
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Gets the count of active Instance Actors (for testing/diagnostics).
|
||||
/// </summary>
|
||||
@@ -1085,4 +1172,14 @@ public class DeploymentManagerActor : ReceiveActor, IWithTimers
|
||||
/// A redeployment command buffered until the previous Instance Actor terminates.
|
||||
/// </summary>
|
||||
internal record PendingRedeploy(DeployInstanceCommand Command, IActorRef OriginalSender);
|
||||
|
||||
/// <summary>
|
||||
/// SiteRuntime-021: internal message dispatched from
|
||||
/// <see cref="HandleDeployArtifacts"/>'s off-thread persistence task back
|
||||
/// onto the actor thread, so the DCL push (and its hash-cache mutation)
|
||||
/// runs through <see cref="EnsureDclConnection"/> without crossing
|
||||
/// thread-confinement boundaries.
|
||||
/// </summary>
|
||||
internal record ApplyArtifactDataConnectionsToDcl(
|
||||
IReadOnlyList<Commons.Messages.Artifacts.DataConnectionArtifact> DataConnections);
|
||||
}
|
||||
|
||||
@@ -135,15 +135,20 @@ internal sealed class AuditingDbCommand : DbCommand
|
||||
// the wrapper, but writes from the user go through to the inner
|
||||
// command so the underlying provider keeps its wiring intact.
|
||||
get => _wrappingConnection ?? _inner.Connection;
|
||||
// SiteRuntime-022: unwrap the AuditingDbConnection wrapper via its
|
||||
// own internal Inner accessor instead of reflecting into a private
|
||||
// _inner field. Reflection was the original SiteRuntime-006 anti-
|
||||
// pattern (and is forbidden inside script bodies by the trust
|
||||
// model) — both classes are internal sealed in the same assembly,
|
||||
// so the proper API surface is available without leaking anything
|
||||
// public.
|
||||
set
|
||||
{
|
||||
_wrappingConnection = value;
|
||||
_inner.Connection = value switch
|
||||
{
|
||||
AuditingDbConnection auditing => auditing.GetType()
|
||||
.GetField("_inner", System.Reflection.BindingFlags.Instance | System.Reflection.BindingFlags.NonPublic)
|
||||
!.GetValue(auditing) as DbConnection,
|
||||
_ => value
|
||||
AuditingDbConnection auditing => auditing.Inner,
|
||||
_ => value,
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
@@ -86,6 +86,18 @@ internal sealed class AuditingDbConnection : DbConnection
|
||||
_parentExecutionId = parentExecutionId;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// SiteRuntime-022: exposes the wrapped <see cref="DbConnection"/> to the
|
||||
/// sibling <see cref="AuditingDbCommand"/> in the same assembly, so the
|
||||
/// command's <c>DbConnection</c> setter can unwrap an
|
||||
/// <see cref="AuditingDbConnection"/> without reflecting into the
|
||||
/// private <c>_inner</c> field. Both classes are <c>internal sealed</c>
|
||||
/// in this assembly, so the accessor stays out of the public API and
|
||||
/// matches the SiteRuntime-006 precedent of preferring proper API surface
|
||||
/// over <see cref="System.Reflection"/>.
|
||||
/// </summary>
|
||||
internal DbConnection Inner => _inner;
|
||||
|
||||
/// <inheritdoc />
|
||||
// ConnectionString is settable on DbConnection — forward both halves.
|
||||
public override string ConnectionString
|
||||
|
||||
@@ -42,6 +42,11 @@ public static class ServiceCollectionExtensions
|
||||
// ISiteIdentityProvider — HealthMonitoring already references S&F.
|
||||
var cachedCallObserver = sp.GetService<ICachedCallLifecycleObserver>();
|
||||
var siteContext = sp.GetService<IStoreAndForwardSiteContext>();
|
||||
// StoreAndForward-023: coalesce a missing (null) site id to empty here — the
|
||||
// service constructor normalises it to UnknownSiteSentinel so a
|
||||
// host without an IStoreAndForwardSiteContext registration is
|
||||
// observable in the central audit log instead of producing a
|
||||
// silent empty-string SourceSite.
|
||||
var siteId = siteContext?.SiteId ?? string.Empty;
|
||||
return new StoreAndForwardService(
|
||||
storage,
|
||||
|
||||
@@ -14,7 +14,24 @@ public class StoreAndForwardOptions
|
||||
/// <summary>WP-10: Default retry interval for messages without per-source settings.</summary>
|
||||
public TimeSpan DefaultRetryInterval { get; set; } = TimeSpan.FromSeconds(30);
|
||||
|
||||
/// <summary>WP-10: Default maximum retry count before parking.</summary>
|
||||
/// <summary>
|
||||
/// WP-10: Default maximum retry count before parking. Applied when an
|
||||
/// <c>EnqueueAsync</c> caller does not pass an explicit <c>maxRetries</c>.
|
||||
/// <para>
|
||||
/// <b>StoreAndForward-019:</b> this default is enforced uniformly across
|
||||
/// every category, including <see cref="Commons.Types.Enums.StoreAndForwardCategory.Notification"/>:
|
||||
/// once the buffered message's retry count reaches this cap the engine
|
||||
/// parks the row. The Component-StoreAndForward.md "notifications do not
|
||||
/// park" wording reflects the operational <i>intent</i> when central is
|
||||
/// reachable on the normal cadence; under a sustained central outage that
|
||||
/// exceeds <c>DefaultMaxRetries × forward-interval</c> a buffered
|
||||
/// notification <i>will</i> park and surface in the parked-message UI,
|
||||
/// matching the rest of the system's bounded-retry-then-park behaviour.
|
||||
/// Callers that genuinely require unbounded retry must pass
|
||||
/// <c>maxRetries: 0</c> on <c>EnqueueAsync</c> (the documented "no limit"
|
||||
/// escape hatch — see <c>StoreAndForwardService.EnqueueAsync</c>).
|
||||
/// </para>
|
||||
/// </summary>
|
||||
public int DefaultMaxRetries { get; set; } = 50;
|
||||
|
||||
/// <summary>WP-10: Interval for the background retry timer sweep.</summary>
|
||||
|
||||
@@ -46,8 +46,31 @@ public class StoreAndForwardService
|
||||
/// Audit Log #23 (M3 Bundle E — Task E4): site id stamped onto the
|
||||
/// cached-call attempt context so the audit bridge can build the
|
||||
/// <see cref="SiteCallOperational"/> half of the telemetry packet.
|
||||
/// <para>
|
||||
/// <b>StoreAndForward-023:</b> an empty-string site id must never reach
|
||||
/// downstream consumers — the central audit pipeline keys
|
||||
/// <c>(SourceSite, TrackedOperationId)</c> off this value, so an empty
|
||||
/// string degrades correlation to a per-id-only index and breaks the
|
||||
/// per-site routing of <c>RetryParkedOperation</c>/<c>DiscardParkedOperation</c>
|
||||
/// commands. The constructor normalises a null/empty/whitespace
|
||||
/// <paramref name="siteId"/> argument to <see cref="UnknownSiteSentinel"/>
|
||||
/// so a misconfigured host (no <c>IStoreAndForwardSiteContext</c>
|
||||
/// registered) produces a distinctive marker in the central audit log
|
||||
/// rather than silently merging multiple sites into the empty bucket.
|
||||
/// </para>
|
||||
/// </summary>
|
||||
private readonly string _siteId;
|
||||
|
||||
/// <summary>
|
||||
/// StoreAndForward-023: distinctive marker stamped onto cached-call audit
|
||||
/// telemetry when the host has not registered an
|
||||
/// <see cref="IStoreAndForwardSiteContext"/>. Chosen with a leading <c>$</c>
|
||||
/// so it cannot collide with a real site id (which is a configuration
|
||||
/// identifier and never starts with <c>$</c>). Surfacing this in the
|
||||
/// central audit log makes a missing site-context binding immediately
|
||||
/// recognisable instead of an unattributable empty string.
|
||||
/// </summary>
|
||||
public const string UnknownSiteSentinel = "$unknown-site";
|
||||
private Timer? _retryTimer;
|
||||
private int _retryInProgress;
|
||||
|
||||
@@ -120,7 +143,11 @@ public class StoreAndForwardService
|
||||
_logger = logger;
|
||||
_replication = replication;
|
||||
_cachedCallObserver = cachedCallObserver;
|
||||
_siteId = siteId;
|
||||
// StoreAndForward-023: normalise an empty / whitespace site id to the
|
||||
// distinctive UnknownSiteSentinel so downstream consumers (the central
|
||||
// audit pipeline keying off SourceSite) never see an empty string and
|
||||
// a misconfigured host is recognisable in the central log.
|
||||
_siteId = string.IsNullOrWhiteSpace(siteId) ? UnknownSiteSentinel : siteId;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
@@ -583,8 +610,21 @@ public class StoreAndForwardService
|
||||
|
||||
if (!TrackedOperationId.TryParse(message.Id, out var trackedId))
|
||||
{
|
||||
// Pre-M3 message (random GUID-N id from S&F itself, no
|
||||
// TrackedOperationId threaded in). Skip — no audit row to bind to.
|
||||
// StoreAndForward-022: previously a silent skip — but a non-GUID
|
||||
// message id means a caller bypassed the audit hot path with zero
|
||||
// feedback. The drop is still best-effort (S&F retry bookkeeping
|
||||
// must never depend on the audit pipeline) but it is now observable
|
||||
// via a Warning so a misconfigured caller can be diagnosed.
|
||||
// Engine-minted ids (Guid.NewGuid().ToString("N")) and the current
|
||||
// caller set (NotificationOutbox enqueue with NotificationId,
|
||||
// cached-call enqueue with TrackedOperationId.ToString()) all
|
||||
// parse — this log line fires only when a future caller supplies a
|
||||
// non-GUID id, which is exactly when the silent-drop was hardest
|
||||
// to diagnose.
|
||||
_logger.LogWarning(
|
||||
"Cached-call audit observer skipped: message id {MessageId} is not a parseable TrackedOperationId (category {Category}, outcome {Outcome}). " +
|
||||
"Audit lifecycle for this operation will have no rows.",
|
||||
message.Id, message.Category, outcome);
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -667,26 +707,51 @@ public class StoreAndForwardService
|
||||
/// so a failover preserves the operator's retry intent.
|
||||
/// StoreAndForward-017: the activity-log entry carries the message's true
|
||||
/// category rather than a hard-coded one.
|
||||
/// StoreAndForward-020: the parked row is captured <i>before</i> the local
|
||||
/// requeue write rather than re-read after it, so a concurrent
|
||||
/// <c>RemoveMessageAsync</c> or <c>DiscardParkedMessageAsync</c> running
|
||||
/// between the two storage calls cannot leave the standby in <c>Parked</c>
|
||||
/// while the active node has already requeued — we always have the row in
|
||||
/// hand for the <c>Requeue</c> replication.
|
||||
/// </summary>
|
||||
/// <param name="messageId">The identifier of the message to retry.</param>
|
||||
/// <returns>True if successfully retried, false otherwise.</returns>
|
||||
public async Task<bool> RetryParkedMessageAsync(string messageId)
|
||||
{
|
||||
var success = await _storage.RetryParkedMessageAsync(messageId);
|
||||
if (success)
|
||||
// StoreAndForward-020: capture the parked row up front so the standby
|
||||
// gets a Requeue even if a concurrent writer (a sweep delete after a
|
||||
// successful delivery, or an operator discard) removes the row between
|
||||
// the local update and the re-load. The storage call below is
|
||||
// conditional on status = Parked, so if the row has already moved we
|
||||
// return false here without replicating — the standby's matching row
|
||||
// will be reconciled by whichever other operator path won the race.
|
||||
var captured = await _storage.GetMessageByIdAsync(messageId);
|
||||
if (captured is null || captured.Status != StoreAndForwardMessageStatus.Parked)
|
||||
{
|
||||
// Re-load the requeued row so the activity log gets the real category
|
||||
// and the standby gets the post-requeue state (Pending, retry_count = 0).
|
||||
var message = await _storage.GetMessageByIdAsync(messageId);
|
||||
var category = message?.Category ?? StoreAndForwardCategory.ExternalSystem;
|
||||
if (message != null)
|
||||
{
|
||||
_replication?.ReplicateRequeue(message);
|
||||
}
|
||||
RaiseActivity("Retry", category,
|
||||
$"Parked message {messageId} moved back to queue");
|
||||
return false;
|
||||
}
|
||||
return success;
|
||||
|
||||
var success = await _storage.RetryParkedMessageAsync(messageId);
|
||||
if (!success)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
// The active node just rewrote this row to Pending with retry_count = 0
|
||||
// and cleared last_error / last_attempt_at (see
|
||||
// StoreAndForwardStorage.RetryParkedMessageAsync). Reconstruct the
|
||||
// post-requeue state on the captured POCO so the standby applies the
|
||||
// same mutations even if a concurrent writer has already deleted the
|
||||
// row underneath us.
|
||||
captured.Status = StoreAndForwardMessageStatus.Pending;
|
||||
captured.RetryCount = 0;
|
||||
captured.LastError = null;
|
||||
captured.LastAttemptAt = null;
|
||||
_replication?.ReplicateRequeue(captured);
|
||||
|
||||
RaiseActivity("Retry", captured.Category,
|
||||
$"Parked message {messageId} moved back to queue");
|
||||
return true;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
|
||||
Reference in New Issue
Block a user