131 Commits

Author SHA1 Message Date
Joseph Doherty efb3efe6dc docs: mark Playwright coverage-fill Wave 1 tasks complete 2026-06-06 12:37:47 -04:00
Joseph Doherty 0700777e2f test(e2e): guard ApiSurfaceFixture partial-init delete + seal TransportImportTests (final review nits) 2026-06-06 12:37:21 -04:00
Joseph Doherty 09f14f18ea test(e2e): cap live browser contexts to bound Blazor circuit pressure (fixes full-suite timeouts); import negative-test review fixes 2026-06-06 12:33:06 -04:00
Joseph Doherty b52f7281aa test(e2e): Transport import wrong-passphrase shows error and stays on passphrase step 2026-06-06 12:21:56 -04:00
Joseph Doherty 3f88de932c test(e2e): harden Transport export test — render sentinel + step-scoped Next (review fix) 2026-06-06 12:19:38 -04:00
Joseph Doherty 79586ca5ad test(e2e): row-scope API-key kebab dropdown selectors + visibility-gate items (review fix) 2026-06-06 12:16:50 -04:00
Joseph Doherty 57ca5d6321 test(e2e): Transport Export wizard reaches download summary for a zztest template 2026-06-06 12:13:55 -04:00
Joseph Doherty 73b213442f test(e2e): tighten API-key validation locator to div.text-danger.small (review precision fix) 2026-06-06 12:10:12 -04:00
Joseph Doherty 89231e3245 test(e2e): API-key enable/disable badge transition + delete-with-confirm removes row 2026-06-06 12:08:53 -04:00
Joseph Doherty 9fe3ac30c9 test(e2e): API-key create→token reveal + name/method validation edges 2026-06-06 12:06:09 -04:00
Joseph Doherty 84edf5a134 test(e2e): add ApiSurfaceFixture (inbound api-method for API-key form checkboxes) 2026-06-06 12:00:42 -04:00
Joseph Doherty fecac45d05 test(e2e): InstanceConfigure attribute-override + area reassignment + not-found edge 2026-06-06 11:58:45 -04:00
Joseph Doherty 3e4b0ca44c test(e2e): InstanceConfigure bindings round-trip (bulk assign → save → verify via instance get) 2026-06-06 11:55:23 -04:00
Joseph Doherty 8bd7656110 docs: sync Wave 1 plan with Task 0 review fixes (GetInstanceDocumentAsync, CreateApiKeyAsync) 2026-06-06 11:44:56 -04:00
Joseph Doherty 32240919cc test(e2e): address Task 0 review — rename GetInstanceDocumentAsync (ownership), add CreateApiKeyAsync (CLI emits prose not JSON) 2026-06-06 11:44:22 -04:00
Joseph Doherty e618137ce7 test(e2e): add InstanceConfigureFixture (template+attr+connection+area+instance on site-a)
Also extends AddAttributeAsync with an optional dataSourceReference parameter
so the fixture attribute appears in both _bindingDataSourceAttrs (bindings UI)
and _overrideAttrs (overrides UI) on the InstanceConfigure page.
2026-06-06 11:41:52 -04:00
Joseph Doherty a8a515ec8a test(e2e): add CliRunner helpers for data-connection, api-method, api-key teardown, instance read-back 2026-06-06 11:37:57 -04:00
Joseph Doherty c23e2bf227 feat(centralui): add data-test hooks to InstanceConfigure selects + error alert (test instrumentation) 2026-06-06 11:37:03 -04:00
Joseph Doherty 8e8bf44a29 docs: add Playwright coverage-fill Wave 1 plan (InstanceConfigure, API keys, Transport export) + tasks 2026-06-06 11:32:18 -04:00
Joseph Doherty 58bf59a42d docs: add Playwright coverage-fill design (Tier 1-3 + edge sweep, 4 waves) 2026-06-06 11:23:59 -04:00
Joseph Doherty 667d141f1a test(e2e): unify toast assertion + extract shared PlaywrightDbConnection (review cleanups) 2026-06-05 11:00:27 -04:00
Joseph Doherty 5546c32593 test(e2e): move deploy/disable preconditions inside try for guaranteed cleanup (review fix)
In Disable_Instance_ShowsOutcomeToast and Enable_Instance_ShowsOutcomeToast, the
precondition CLI calls (DeployInstanceAsync / DisableInstanceAsync) were between
CreateInstanceAsync and the try block. A throw there would skip the finally
DeleteInstanceAsync, leaking the instance. Moved those calls to be the first
statements inside try so cleanup always runs once the instance exists.
2026-06-05 10:52:00 -04:00
Joseph Doherty ad0bc33231 test(e2e): cover Topology Enable/Disable/Delete + fix toast assertion 2026-06-05 10:48:06 -04:00
Joseph Doherty fac0bcbb01 test(e2e): use web-first toast assertion in notification tests (review fix) 2026-06-05 10:47:06 -04:00
Joseph Doherty 1cbf260969 test(e2e): add wizard sentinel waits + tighten audit-link assertion (review fixes) 2026-06-05 10:46:57 -04:00
Joseph Doherty 4d55c0ac95 test(e2e): cover notification retry/discard + parked-messages query 2026-06-05 10:40:06 -04:00
Joseph Doherty 9cc5b7355e test(e2e): cover Transport Import apply round-trip 2026-06-05 10:38:42 -04:00
Joseph Doherty e358c231ce test(e2e): cover Topology Deploy action 2026-06-05 10:38:34 -04:00
Joseph Doherty 043914fd71 test(e2e): scope Site delete dropdown to .dropdown (review fix) 2026-06-05 10:34:07 -04:00
Joseph Doherty 917e5f30bf test(e2e): structural modal discriminator + simpler post-delete wait (review fixes) 2026-06-05 10:34:07 -04:00
Joseph Doherty 8e11f1f900 test(e2e): guard DeploymentFixture partial-init + teardown (review fixes) 2026-06-05 10:30:16 -04:00
Joseph Doherty 19c4412fd1 test(e2e): cover Template create/add-attribute/delete round-trip 2026-06-05 10:28:53 -04:00
Joseph Doherty 3998a6126f test(e2e): cover Site create/edit/delete round-trip
Adds CreateEditDelete_Site_RoundTrips [SkippableFact] to SiteCrudTests.
Exercises the full create → edit → delete UI flow against the live cluster,
with CliRunner best-effort teardown so no zztest-* sites leak on mid-test failure.
2026-06-05 10:28:40 -04:00
Joseph Doherty 271f70b1d2 test(e2e): standardize AuditLog tests on SkippableFact + skip summary log 2026-06-05 10:25:16 -04:00
Joseph Doherty 234ddb5201 test(e2e): add DeploymentFixture (ephemeral instance on site-a) 2026-06-05 10:25:06 -04:00
Joseph Doherty 3d9ef0a477 test(e2e): harden LDAP teardown + tighten nav/health selectors (review fixes) 2026-06-05 10:19:48 -04:00
Joseph Doherty 754f049a98 test(e2e): guard BundleExport comma + document DeleteAreaAsync (review fixes) 2026-06-05 10:11:17 -04:00
Joseph Doherty 12bf08f64a test(e2e): cover LDAP mapping create/edit/delete round-trip 2026-06-05 10:09:15 -04:00
Joseph Doherty 4f4b34ea89 test(e2e): assert Health KPI tiles resolve (singleton-hang guard)
Guards against the Akka singleton Ask hang regression: asserts all nine KPI
tiles on /monitoring/health resolve to numeric values and never show the
em-dash degrade placeholder (—). Covers Notification Outbox, Audit, and
Site Call tile groups. Selector disambiguation: Outbox tiles are div.card,
Site Call tiles are button.card — prevents strict-mode collisions on the
shared "Stuck" and "Parked" labels.
2026-06-05 10:08:32 -04:00
Joseph Doherty 2a25f2aaf8 test(e2e): assert destination renders, not just URL, in nav tests 2026-06-05 10:07:19 -04:00
Joseph Doherty 4a7c46f1db test(e2e): add CliRunner typed fixture helpers 2026-06-05 10:04:05 -04:00
Joseph Doherty bf78e3e7bf test(e2e): harden CliRunner timeout drain + skip-counter (review fixes) 2026-06-05 09:59:41 -04:00
Joseph Doherty 9e914299c8 test(e2e): add CliRunner + ClusterAvailability probe 2026-06-05 09:56:47 -04:00
Joseph Doherty 51e48fca91 test(e2e): reference CLI project so tests can shell out to it 2026-06-05 09:53:51 -04:00
Joseph Doherty b540015fbd docs(tests): implementation plan for Playwright coverage expansion
16 task-by-task steps: shared CliRunner + ClusterAvailability skip infra,
DeploymentFixture + deploy/enable/disable/delete suites, notification
retry/discard + parked-messages query, Transport Import round-trip, Site/
Template/LDAP CRUD round-trips, nav render hardening, Health KPI guard, and a
no-residue verification pass. Co-located .tasks.json for resumable execution.
2026-06-05 09:52:12 -04:00
Joseph Doherty cb3b3bf373 docs(tests): design for Playwright coverage expansion (7 audit recs)
Captures the 2026-06-05 coverage audit's gaps and the approved approach for
closing them: ephemeral CLI-provisioned fixtures with outcome-tolerant asserts
for the mutating suites (deploy lifecycle, retry/discard, transport import),
UI CRUD round-trips, nav render hardening, a Health KPI load test, and a
standardized skip-and-log policy. Next: writing-plans turns this into tasks.
2026-06-05 09:39:35 -04:00
Joseph Doherty d33617d65d fix(host): register ActorSystem as DI singleton so health-probe scopes don't dispose it (HOST-021)
Per-probe health-check child scopes were disposing the AddTransient-bridged
ActorSystem (IDisposable), terminating the live cluster node ~4s after boot and
leaving every singleton-proxy Ask to hang the full 30s QueryTimeout — the central
report pages (/notifications, /site-calls, /monitoring/health) loaded in ~30s.
Bridge it as a singleton via a new lazy AkkaHostedService.GetOrCreateActorSystem()
so child-scope disposal never touches it. Verified: 0 post-startup terminates,
healthy active/standby, report pages ~0.05s, Playwright 68 passed / 0 failed.
2026-06-05 08:26:09 -04:00
Joseph Doherty 0783547a2d chore(theme): bump ZB.MOM.WW.Theme 0.3.0 -> 0.3.1 (interactive-render nav fix) 2026-06-05 07:19:11 -04:00
Joseph Doherty 2515c9db2d chore(theme): consume ZB.MOM.WW.Theme 0.3.0 (nav/login kit fixes) 2026-06-05 04:44:47 -04:00
Joseph Doherty 35a4a5bfea docs(glauth): dev/test LDAP is now the shared GLAuth on 10.100.0.35
infra/ no longer runs scadabridge-ldap (retired); central nodes bind the shared
zb-shared-glauth on 10.100.0.35:3893 (dc=zb,dc=local). Source of truth:
scadaproj/infra/glauth/. test_infra_ldap.md banner-marked SUPERSEDED.
2026-06-04 16:38:08 -04:00
Joseph Doherty 5ddb17a089 feat(auth): seed SCADA-Viewers->Viewer LDAP-group role mapping
Completes the multi-role test user's 4th role. HasData row Id=5
(SCADA-Viewers->Viewer) + the SCADA-Viewers group in the (now-retired) local
glauth config. The live shared dir is scadaproj/infra/glauth/.
2026-06-04 16:38:08 -04:00
Joseph Doherty 244207c0db feat(auth): point dev clusters at shared GLAuth 10.100.0.35; retire local scadabridge-ldap
Both :9000 (docker) and :9100 (docker-env2) central nodes now bind the shared dev
GLAuth (scadaproj/infra/glauth/, dc=zb,dc=local) via the cn=serviceaccount search
account instead of the bundled scadabridge-ldap container (now commented out in
infra/docker-compose.yml, kept for rollback). Verified: multi-role -> all 4 roles
on both clusters with scadabridge-ldap stopped.
2026-06-04 15:58:42 -04:00
Joseph Doherty 0e2d9ed186 chore(theme): bump ZB.MOM.WW.Theme 0.2.0 -> 0.2.1 (desktop app-shell render fix) 2026-06-04 10:23:16 -04:00
Joseph Doherty 0c3837c778 docs(components): accuracy fixes from deep review (batch 4)
ManagementService (role table: queries any-auth, area mutations Designer;
audit contract exception), CLI (missing instance/api-key subcommands; server
JSON printed verbatim; bundle preview timeout), Transport (BundleFormatVersion
exact-match gate; dependency scan fields; three flushes), CentralUI
(/api/script-analysis endpoints; LoginLayout minimal; Health tile components),
TreeView (Topology no RevealNode; ContextMenu Site branch; InitiallyExpanded).
2026-06-03 16:39:29 -04:00
Joseph Doherty 9175b0c013 docs(components): accuracy fixes from deep review (batch 3)
NotificationService (Notify.Send returns string not NotificationId;
MaxConcurrentConnections unenforced; AddHttpClient), NotificationOutbox
(one Attempted row always, terminal row only on terminal status), SiteCallAudit
(direct dual-write, no Tell; KPI tiles consumed by CentralUI), HealthMonitoring
(CentralOfflineTimeout 180s = 6x ReportInterval; HealthReportSender gates on
IsActiveNode), SiteEventLogging (active-node purge seam not wired; runs on both
nodes), InboundAPI (whole System.Diagnostics namespace forbidden).
2026-06-03 16:37:15 -04:00
Joseph Doherty 25bae4e43b docs(components): accuracy fixes from deep review (batch 2)
TemplateEngine (alarm-script-ref ordering, native-alarm-sources not in
revision hash, composition cycle checks, 9-step pipeline), SiteRuntime
(alarm on-trigger scripts run with a restricted context; PreStart seeds
children from defaults before overrides arrive), DataConnectionLayer
(UnsubscribeAlarmsRequest stashed in Connecting), StoreAndForward (InFlight/
Delivered are dead enum values; notifications can park at 50 retries),
ExternalSystemGateway (CachedWrite returns void + enqueues directly; log levels).
2026-06-03 16:34:37 -04:00
Joseph Doherty c5fb02d640 docs(components): accuracy fixes from deep review (batch 1)
Commons (third-party dep, 7 namespaces, retired ApiKey, repo SaveChanges
carve-out), ConfigurationDatabase (5 persisted + 1 non-persisted computed col),
ClusterInfrastructure (abbreviated HOCON note, RemotingPort default),
Host (component matrix: CI/HealthMonitoring/ExternalSystemGateway have no
actors; DeadLetterMonitorActor runs on both roles), Security (Bearer not
X-API-Key; ApiKeyAdmin registered by Host), Communication (Task.Run/Sender).
2026-06-03 16:32:01 -04:00
Joseph Doherty 66f0f96328 docs(components): verification pass — fix cross-link targets, tag code fences, correct type names
- Fix 15 link-text/target mismatches (ConfigurationDatabase ×8 to Commons,
  NotificationOutbox ×4, ClusterInfrastructure case, HealthMonitoring,
  SiteCallAudit) caught by a link-text-vs-target consistency check.
- Tag 14 untagged code-fence openers (ASCII diagrams/trees, JSON, HTTP).
- Correct 4 type names to match source (ValidationService, HealthReportSender,
  CentralCommunicationActor, DebugSnapshotCommand set).
- Soften Traefik version prose per the style guide.
2026-06-03 16:09:06 -04:00
Joseph Doherty a26f4a5f81 docs(components): index + link from README 2026-06-03 15:59:20 -04:00
Joseph Doherty d14fc3f68f docs(components): reference docs batch 4/4 — ManagementService, CLI, Transport, CentralUI, TraefikProxy, TreeView 2026-06-03 15:57:32 -04:00
Joseph Doherty c1c8e35687 docs(components): reference docs batch 3/4 — NotificationService, NotificationOutbox, SiteCallAudit, HealthMonitoring, SiteEventLogging, InboundAPI 2026-06-03 15:52:33 -04:00
Joseph Doherty 8fb90ba400 docs(components): reference docs batch 2/4 — TemplateEngine, DeploymentManager, SiteRuntime, DataConnectionLayer, StoreAndForward, ExternalSystemGateway 2026-06-03 15:47:16 -04:00
Joseph Doherty b89611464b docs(components): reference docs batch 1/4 — Commons, ConfigurationDatabase, Communication, ClusterInfrastructure, Host, Security 2026-06-03 15:42:03 -04:00
Joseph Doherty b2770764c5 docs(components): AuditLog reference doc (pilot exemplar) 2026-06-03 15:34:30 -04:00
Joseph Doherty 0da5d3dd0b docs(components): scaffold reference-docs folder + link checker 2026-06-03 15:24:05 -04:00
Joseph Doherty 5e106df9e6 docs(plans): implementation plan for per-component reference docs
28-task plan: scaffold, AuditLog pilot (approval gate), 24-doc parallel
fan-out, index+README, verification pass. Co-located .tasks.json for resume.
2026-06-03 15:24:05 -04:00
Joseph Doherty e89cf2b278 docs(plans): design for per-component reference docs in docs/components/
Brainstormed design: generate 25 StyleGuide-conformant developer-reference
docs derived from src/ code (pilot AuditLog, then parallel fan-out, then
accuracy/conformance verification). Complements the requirements specs;
leaves src/, XML docs, and specs untouched.
2026-06-03 13:58:14 -04:00
Joseph Doherty 15752f8c2d fix(security): make auth cookie name configurable, override per env
The auth cookie name was hardcoded to ZB.MOM.WW.ScadaBridge.Auth. Because
browser cookies are scoped by host+path but NOT by port, two ScadaBridge
clusters on the same host (the local docker stack on localhost:9000 and
docker-env2 on localhost:9100) shared one cookie jar: signing into one
overwrote the other's cookie, and since the clusters use different JWT
signing keys + separate Data Protection key rings, the overwritten side
could no longer validate its cookie and the session died.

Add SecurityOptions.CookieName (default = canonical ZB.MOM.WW.ScadaBridge.Auth,
blank falls back to the default) applied via the SecurityOptions-bound cookie
PostConfigure. Override it to ...Auth.env2 in both docker-env2 Central nodes so
the two local clusters no longer collide; the primary cluster keeps the default
so its live sessions and production are unaffected. Adds 3 Security.Tests cases.
2026-06-03 13:11:29 -04:00
Joseph Doherty eabf270d71 docs: complete XML doc coverage (returns, summaries, inheritdoc)
Resolve all 622 issues flagged by the enhanced CommentChecker: add missing
<returns> tags (incl. the standard phrasing on non-generic Task methods),
add missing <summary> tags, and replace misused/redundant <inheritdoc/> on
members that override or implement nothing with real documentation.
Documentation-only — no behavior change; solution builds clean.
2026-06-03 11:39:32 -04:00
Joseph Doherty a050170414 chore(docker): supply DEV-ONLY ApiKeyPepper to local Central nodes
The Auth/Config normalization made ScadaBridge:InboundApi:ApiKeyPepper a hard
Central-only startup requirement (>=16 chars), but the local dev composes never
supplied it, so deploy.sh's freshly-built image crash-looped both Central nodes
on ConfigPreflight validation. Add a clearly-marked DEV-ONLY, insecure pepper
inline to each cluster's Central environment (distinct per environment). These
are NOT real secrets — production injects a true per-env secret out-of-band per
docs/operations/inbound-api-key-reissue.md; the inline values exist only so the
local docker / docker-env2 clusters start.
2026-06-03 05:30:38 -04:00
Joseph Doherty 9f18badf02 build(host): declare ZB.MOM.WW.Theme directly (not transitively via CentralUI)
Host/App.razor uses the kit's <ThemeHead/>/<ThemeScripts/>, but Host had no direct
PackageReference — it relied on CentralUI re-exporting the package transitively.
Add a versionless <PackageReference Include="ZB.MOM.WW.Theme"/> (version pinned by
central PM at Directory.Packages.props) so the declared dependency matches actual
usage and survives any future PrivateAssets/refactor on CentralUI. Additive only;
Host builds clean (0/0).
2026-06-03 04:52:00 -04:00
Joseph Doherty 837fb74ae5 chore(centralui): remove dead .sidebar shell CSS left by the theme cutover
The .sidebar/#sidebar-collapse/.nav-link/.nav-section-toggle block is orphaned —
the side rail is now the ZB.MOM.WW.Theme kit's .side-rail/.rail-link shell, and
no markup references these selectors. Kept the app-only #reconnect-modal and
.script-editor-modal rules (not provided by the kit). 95 lines removed; builds clean.
2026-06-03 04:37:23 -04:00
Joseph Doherty 58352a67cb fix(centralui): include AntiforgeryToken in LoginCard (match OtOpcUa + kit contract) 2026-06-03 03:39:47 -04:00
Joseph Doherty b9516e6721 feat(centralui): LoginCard sign-in
Replace hand-rolled Bootstrap card with the shared <LoginCard> from ZB.MOM.WW.Theme.
Update ComponentRenderingTests assertions to match LoginCard's rendered structure
(h1.login-title, div.panel.notice.login-error, "Sign in" button text).
2026-06-03 03:34:12 -04:00
Joseph Doherty 957203ec7b feat(centralui): MainLayout/NavMenu delegate to ZB.MOM.WW.Theme ThemeShell + kit nav 2026-06-03 03:31:10 -04:00
Joseph Doherty 6fb545d75b refactor(centralui): drop vendored theme.css/fonts/nav-state.js; keep app-only CSS in site.css 2026-06-03 03:25:04 -04:00
Joseph Doherty 6d75bdb372 feat(host): use ZB.MOM.WW.Theme ThemeHead + ThemeScripts 2026-06-03 03:23:03 -04:00
Joseph Doherty e1589497f1 build(centralui): reference ZB.MOM.WW.Theme 0.2.0 2026-06-03 03:21:44 -04:00
Joseph Doherty b3de8408fa feat(audit): ScadaBridge IAuditActorAccessor + wire audit Actor from Auth principal at authenticated emit sites (Phase 3) 2026-06-02 15:33:01 -04:00
Joseph Doherty bc0e5bfd37 docs(audit): ScadaBridge C7 review — correct 'six persisted' computed-col wording (5 persisted + IngestedAtUtc non-persisted) + stale perf iteration comment 2026-06-02 15:08:49 -04:00
Joseph Doherty 635461c0fd chore(audit): ScadaBridge C7 — perf re-baseline + CollapseAuditLogToCanonical projection test + index-test fix + dead-cref cleanup (Task 2.5)
Perf re-baseline (HotPathLatencyTests): empirical p95 on Apple M-series Release
build: 4KB DetailsJson slow path ≈14 µs, small-DetailsJson no-redactors ≈2 µs,
true no-op fast path ≈0 µs. Thresholds updated: 200 µs / 30 µs / 5 µs (≈15×
headroom for contested CI runners). Old thresholds (50 µs / 10 µs) were set for
the pre-C3 typed-field path; canonical JSON parse+rewrite is empirically faster.
Adds a third test (Filter_Apply_NoDetailsJson_FastPath) that asserts same-instance
return on the DetailsJson-null + within-cap fast path. Env-var overrides retained.

CollapseAuditLogToCanonicalMigrationTests (new): three MSSQL-gated [SkippableFact]
tests verifying Action/Category/Outcome projection, NULL Actor, DetailsJson codec
round-trip, and all five persisted computed columns (Kind/Status/SourceSiteId/
ExecutionId/ParentExecutionId) for ApiOutbound, InboundAuthFailure, and Failed-
status rows.

AddAuditLogTableMigrationTests: rename CreatesFiveNamedIndexes →
CreatesNineNamedIndexes; expand coverage from 5 original indexes to all 9 named
non-clustered indexes present after CollapseAuditLogToCanonical (adds
IX_AuditLog_Execution, IX_AuditLog_ParentExecution, IX_AuditLog_Node_Occurred,
UX_AuditLog_EventId).

Dead-cref cleanup: zero references to the deleted IAuditPayloadFilter /
DefaultAuditPayloadFilter / SafeDefaultAuditPayloadFilter types remain in any
.cs file (source or test). 26 occurrences across 13 files replaced with correct
references to IAuditRedactor / ScadaBridgeAuditRedactor / SafeDefaultAuditRedactor
or reworded as plain prose.

Residual sweep: no unused transitional code found beyond the acknowledged
"C3 transitional shim" comments on IngestedAtUtc stamping (active code, not dead).
2026-06-02 14:59:23 -04:00
Joseph Doherty 68a6bd1720 feat(audit)!: ScadaBridge C5 — collapse central dbo.AuditLog to 10 canonical cols + persisted computed cols; CollapseAuditLogToCanonical migration; repo writes canonical directly (Task 2.5) 2026-06-02 14:06:46 -04:00
Joseph Doherty 1737d15f04 fix(audit): ScadaBridge C4 review — enable PRAGMA foreign_keys + MarkForwarded state guard (no Reconciled demotion) + test (Task 2.5) 2026-06-02 13:23:36 -04:00
Joseph Doherty 946d3e2aef feat(audit): ScadaBridge C4 — site SQLite two-table (audit_event canonical + audit_forward_state sidecar), forwarding on sidecar, IsCachedKind drain split (Task 2.5) 2026-06-02 13:11:20 -04:00
Joseph Doherty c27b2c3d5f fix(audit): ScadaBridge C3 review — safe enum-parse (fallback) in SqliteAuditWriter.MapRow + AuditEventDtoMapper.FromDto (Task 2.5) 2026-06-02 12:55:07 -04:00
Joseph Doherty db707bb0de feat(audit)!: ScadaBridge C3 — swap to canonical ZB.MOM.WW.Audit.AuditEvent across seams/emitters/DTO/redactor wiring; transitional 24-col storage shim (Task 2.5) 2026-06-02 12:37:50 -04:00
Joseph Doherty 5aaf9e2923 fix(audit): ScadaBridge C2 review — over-redact scrubs all sensitive free-text fields + outer-catch never-leak test + marker alignment
I1 (security): OverRedact() in ScadaBridgeAuditRedactor now suppresses ErrorDetail,
ErrorMessage, and Extra (in addition to RequestSummary/ResponseSummary) to the
over-redacted marker in BOTH code paths (Deserialize+with path and the fallback
new-AuditDetails path). SafeDefaultAuditRedactor catch block aligned to match.

M3 (test): OuterCatch_OptionsThrows_NeverLeaks_AllSensitiveFieldsOverRedacted forces
the outer try/catch → OverRedact path via a ThrowingMonitor that throws from
CurrentValue (the first statement in the try block). Asserts (a) Apply does not
throw, and (b) all five sensitive free-text fields are suppressed to the
over-redacted marker with PayloadTruncated=true.

M1 (consistency): SafeDefaultAuditRedactor now uses AuditRedactionPrimitives
constants (RedactedMarker for line-format header values, OverRedactedEventMarker
for the catch block), eliminating the divergent [REDACTED]/[redacted by ...]
strings. AuditRedactionPrimitives gains OverRedactedEventMarker = RedactorErrorMarker.
SafeDefaultAuditRedactorTests updated from [REDACTED] → <redacted>.

M2 (comment): Added one-line note in TruncateField explaining why the char-count
(result.Length != value.Length) truncation check is sufficient given TruncateUtf8
only ever shortens.
2026-06-02 11:12:18 -04:00
Joseph Doherty adfb4d385c feat(audit): ScadaBridge C2 — ScadaBridgeAuditRedactor/SafeDefaultAuditRedactor : IAuditRedactor on canonical record (Task 2.5) 2026-06-02 11:00:36 -04:00
Joseph Doherty 3d77dc003c feat(audit): ScadaBridge C1 — AuditDetails codec (deterministic) + AuditOutcome projection + canonical field builders + ZB.MOM.WW.Audit ref (Task 2.5)
Additive foundation only — no existing type/interface/emitter changed.
Commons now references ZB.MOM.WW.Audit 0.1.0 (Gitea feed, central PM pin).
Adds four pure new types in Commons/Types/Audit/:
  AuditDetails (sealed record, 17 domain fields, declaration-order = JSON key order)
  AuditDetailsCodec (static; single cached JsonSerializerOptions: camelCase, no-indent,
    WhenWritingNull, UnsafeRelaxedJsonEscaping — byte-deterministic across calls)
  AuditOutcomeProjector (static; InboundAuthFailure→Denied first, then Delivered→Success,
    Failed/Parked/Discarded→Failure, all others→Success)
  AuditFieldBuilders (static; BuildAction="{channel}.{kind}", BuildCategory=channel.ToString())
56 new tests in Commons.Tests/Types/Audit/ covering codec round-trip, byte-determinism
(hand-pinned expected JSON string), null/empty sentinel, full projection table,
InboundAuthFailure-Denied precedence, and Action/Category builders. All pass.
2026-06-02 10:42:51 -04:00
Joseph Doherty 4118452e72 docs(auth): ScadaBridge Task 1.7 review — correct stale role-name prose in NavMenu comments (Admin/Design/Deployment/Audit→canonical) 2026-06-02 08:13:38 -04:00
Joseph Doherty b104760b3a feat(auth)!: ScadaBridge canonical roles + SoD collapse (Audit→Administrator, AuditReadOnly→Viewer) + config-DB migration (Task 1.7)
Standardize role string VALUES on the canonical vocabulary
(Administrator/Designer/Deployer/Viewer; Operator/Engineer unused here):
  Admin        -> Administrator
  Design       -> Designer
  Deployment   -> Deployer
  Audit        -> Administrator   (COLLAPSE; accepted privilege escalation)
  AuditReadOnly-> Viewer          (COLLAPSE; keeps audit-read, no export)

SoD: OperationalAuditRoles = { Administrator, Viewer },
     AuditExportRoles      = { Administrator }
so Viewer reads the audit log + nav but cannot bulk-export, while
Administrator does both + holds the full admin surface (the documented,
accepted auditor/admin SoD collapse).

Atomic move across every enforcement site:
- Roles constants; AuthorizationPolicies (RequireClaim values + SoD arrays +
  honest XML-doc); RoleMapper Deployer check.
- ManagementActor.GetRequiredRole switch + the hard-coded site-scope
  admin-bypass (now Roles.Administrator at all 6 sites). Site-scoping logic
  is otherwise unchanged.
- DebugStreamHub Administrator/Deployer gates (Deployer kept case-sensitive).
- CentralUI BrowseService/BindingTester Designer guards; LdapMappingForm
  dropdown now offers canonical values (incl. Viewer).
- Config-DB seed (LdapGroupMappings Id 1-4) + EF migration CanonicalizeRoles:
  Id-keyed UpdateData for seed rows + idempotent raw catch-all UPDATEs for
  operator-added rows. Down is lossy on the collapse (documented in-file).
  No pending model changes.

Tests reworked to the collapsed model across Security/CentralUI/
ManagementService/ConfigurationDatabase/Integration suites, incl. explicit
Viewer-reads-not-exports and former-Audit-now-Administrator-escalation cases.

CHANGELOG: BREAKING security note documenting the canonicalization + SoD
collapse.
2026-06-02 08:00:47 -04:00
Joseph Doherty 6ae605160c chore(auth): ScadaBridge unify dev LDAP base DN to dc=zb,dc=local (Task 1.6)
Replace dc=scadabridge,dc=local with dc=zb,dc=local in all dev/test LDAP
references — app config, docker test-cluster node configs (docker/ and
docker-env2/), GLAuth fixture, dev tooling, Host.Tests fixtures,
IntegrationTests factory, and operational test_infra docs. OU structure
(ou=SCADA-Admins,ou=users,etc.) preserved throughout. Email domains
(@scadabridge.local), hostnames, and container names are untouched.
Historical plan docs (2026-05-24-second-environment.md,
2026-05-31-folder-repo-rename-scadabridge-design.md) excluded as
point-in-time records. No synthetic dc=example,dc=com placeholders touched.
2026-06-02 06:54:14 -04:00
Joseph Doherty c185a567f5 fix(auth): ScadaBridge Task 1.5 review — use JwtTokenService.RoleClaimType constant in CentralUI tests (canonical spelling) 2026-06-02 06:29:16 -04:00
Joseph Doherty a0938f708b feat(auth): ScadaBridge full canonical claims (ZbClaimTypes role/scope) + ZbCookieDefaults, keep cookie name (Task 1.5) 2026-06-02 06:23:15 -04:00
Joseph Doherty afa55981d5 feat(auth)!: ScadaBridge retire SQL Server ApiKey entity + ApprovedApiKeyIds + legacy hashing; EF migration RetireInboundApiKeyStore; re-issue runbook + CHANGELOG (re-arch C5/E) — BREAKING: X-API-Key -> Bearer sbk_, keys re-issued 2026-06-02 05:39:59 -04:00
Joseph Doherty b13d7b3d28 fix(auth): C4 review polish — document backward-compat JSON tolerance, shared BundleJsonOptions, PreviewAsync legacy-bundle test, doc fix (review I-2/I-3/M-1/M-2; I-1 intentionally skipped) 2026-06-02 05:15:50 -04:00
Joseph Doherty 731cfd3bfc feat(auth): ScadaBridge TransportExport excludes inbound API keys (re-arch C4; methods-only, import ignores legacy key sections); keys re-issued per environment 2026-06-02 05:06:40 -04:00
Joseph Doherty d1191fddf9 fix(auth): C3 review — surface seam not-found (no silent success), partial-reconcile-failure guidance, create validation order, concurrent-edit reconciler test 2026-06-02 04:46:32 -04:00
Joseph Doherty 107e524914 feat(auth): ScadaBridge CentralUI pages onto IInboundApiKeyAdmin seam (re-arch C3; string keyId, method-scopes replace ApprovedApiKeyIds, token-once display, approved-keys<->scopes inversion) 2026-06-02 04:36:50 -04:00
Joseph Doherty 8219b8ee18 fix(auth): C2 review — not-found throws (no spurious audit) on update/delete/set-methods, reject empty methods (unusable-key/stealth-disable), richer set-methods response, token advisory to stderr 2026-06-02 04:21:28 -04:00
Joseph Doherty 6518e93424 feat(auth): ScadaBridge ManagementActor + CLI + Commons messages onto IInboundApiKeyAdmin seam (re-arch C2; int->string keyId, +Methods, +SetApiKeyMethods) 2026-06-02 04:11:44 -04:00
Joseph Doherty 7f7ea3f3c9 fix(auth): C1 review polish — guard name at seam, document seam contract (throws/O(n)), explicit cookie test (review #1/#2/#3/#5/#8) 2026-06-02 04:01:43 -04:00
Joseph Doherty 55099b19f6 fix(auth): move AddZbLdapAuth to Host composition root so component-lib AddSecurity() drops IConfiguration param (satisfy OptionsTests arch rule; fix pre-existing ac34dac red); behaviour-preserving 2026-06-02 03:50:16 -04:00
Joseph Doherty 7e25efa790 test(host): supply Central test ApiKeyPepper so StartupValidator preflight passes (fix pre-existing 1fcc4f5 red); lock pepper-required behavior
Commit 1fcc4f5 added a Central-only Require for ScadaBridge:InboundApi:ApiKeyPepper
(>=16 chars) to StartupValidator. That Require fires in Program.cs before WebApplicationFactory
can apply any WithWebHostBuilder config overlays, so it must be satisfied via environment
variables (which ARE in the pre-host AddEnvironmentVariables() pass).

Fix (test-only, no src/ changes):
- CentralDbTestEnvironment: add ScadaBridge__InboundApi__ApiKeyPepper env var (TestPepper
  constant, 23 chars) alongside the existing db connection string; restore on Dispose.
  Fixes HealthCheckTests, MetricsEndpointTests, and HostStartupTests.CentralRole_StartsWithoutError
  which all use CentralDbTestEnvironment.
- CentralActorPathTests.InitializeAsync: set the pepper env var before WebApplicationFactory
  is constructed (the class uses IAsyncLifetime directly, not CentralDbTestEnvironment).
- CentralCompositionRootTests ctor + Dispose: same env-var pattern; those tests already had
  the pepper in AddInMemoryCollection (DI-layer only, too late for pre-host validation).
- CentralAuditWiringTests ctor + Dispose: same env-var pattern for the same reason.
- StartupValidatorTests.ValidCentralConfig(): add pepper so the unit tests that call
  StartupValidator.Validate() directly with a Central config stop failing.
- Add guard tests: Central_MissingApiKeyPepper_FailsValidation,
  Central_ShortApiKeyPepper_FailsValidation, Site_ApiKeyPepper_NotRequired — these lock
  the production behavior introduced by 1fcc4f5.
2026-06-02 03:40:56 -04:00
Joseph Doherty d09def2be0 feat(auth): ScadaBridge re-pin Auth 0.1.3 + add IInboundApiKeyAdmin seam over library admin facade (re-arch C1, additive) 2026-06-02 03:32:25 -04:00
Joseph Doherty 1fcc4f5c2b fix(auth): ScadaBridge inbound auth review fixes — scope-before-DB, pinned 403 body, pepper fail-fast, log category 2026-06-02 02:50:10 -04:00
Joseph Doherty a94558c289 feat(auth): ScadaBridge inbound API — adopt ZB.MOM.WW.Auth.ApiKeys verifier + Bearer + scope=method (re-arch A+B); additive, old path retired later 2026-06-02 02:40:18 -04:00
Joseph Doherty 4db8c373af fix(auth): ScadaBridge 1.2 review fixes — secret-test repoint, checklist, Scope guard, 0.1.1 pin 2026-06-02 01:23:52 -04:00
Joseph Doherty ac34dac479 feat(auth): cut ScadaBridge over to ZB.MOM.WW.Auth.Ldap; nest+rename Ldap config; roles+sitescope via IGroupRoleMapper (Task 1.2/1.4) 2026-06-02 01:04:34 -04:00
Joseph Doherty 9230afa25f feat(auth): add IGroupRoleMapper<string> seam (Task 1.1) 2026-06-02 00:30:42 -04:00
Joseph Doherty aaad38958e build: add ZB.MOM.WW.Auth/Audit feed mapping + version pins
Maps ZB.MOM.WW.Auth, ZB.MOM.WW.Auth.*, ZB.MOM.WW.Audit to the gitea feed
and pins all 4 Auth packages + Audit at 0.1.0. PackageReferences added
during Phase 1/2 adoption.
2026-06-02 00:17:40 -04:00
Joseph Doherty 145d2668e2 fix: wire ValidateOnStart for ScadaBridge HealthMonitoring + Cluster options (fail-fast at startup) 2026-06-01 23:07:46 -04:00
Joseph Doherty 9668a4e84a refactor: ScadaBridge module options registration -> AddValidatedOptions; clarify De Morgan predicates 2026-06-01 22:49:41 -04:00
Joseph Doherty 6dbbc7ad04 refactor: ScadaBridge StartupValidator -> ConfigPreflight (byte-compatible) 2026-06-01 19:04:13 -04:00
Joseph Doherty aac59c9fae refactor: ScadaBridge validators onto OptionsValidatorBase (messages unchanged) 2026-06-01 18:56:04 -04:00
Joseph Doherty 9bca6aae61 build: add ZB.MOM.WW.Configuration feed mapping + version pin 2026-06-01 18:10:29 -04:00
Joseph Doherty 7d16f8f275 Merge feat/telemetry-followons: telemetry follow-ons for ScadaBridge
Site-node HTTP/1.1 /metrics listener (NodeOptions.MetricsPort=8084, avoids the
site RemotingPort collision; StartupValidator enforces distinctness). First
application instruments: ScadaBridgeTelemetry meter + deployments.applied,
store_and_forward.queue.depth, inbound_api.requests, site.connection.up.
Config-driven OTLP exporter opt-in (default Prometheus).
2026-06-01 17:17:39 -04:00
Joseph Doherty ccf43312e8 feat(scadabridge): config-driven OTLP exporter opt-in (default Prometheus) 2026-06-01 17:14:35 -04:00
Joseph Doherty a5f8651b0f feat(scadabridge): track scadabridge.site.connection.up over site-stream lifetime (balanced open/close) 2026-06-01 17:11:39 -04:00
Joseph Doherty 15a626390b fix(scadabridge): queue-depth seed uses Add (no lost concurrent enqueue) + clarify registration/discard comments 2026-06-01 17:07:03 -04:00
Joseph Doherty 782fb73015 feat(scadabridge): emit scadabridge.inbound_api.requests (by method) at inbound API entry 2026-06-01 17:03:10 -04:00
Joseph Doherty 547b685a42 feat(scadabridge): wire scadabridge.store_and_forward.queue.depth gauge to buffered count 2026-06-01 16:58:09 -04:00
Joseph Doherty 877f2e200b feat(scadabridge): emit scadabridge.deployments.applied on deployment success 2026-06-01 16:52:09 -04:00
Joseph Doherty c41cb41c7b fix(scadabridge): default MetricsPort to 8084 (avoid site RemotingPort collision) + validate port distinctness 2026-06-01 16:46:59 -04:00
Joseph Doherty fe25ac3e51 feat(scadabridge): add ScadaBridgeTelemetry meter + 4 instruments; register with OTel 2026-06-01 16:41:52 -04:00
Joseph Doherty bbc9f09268 feat(scadabridge): add HTTP/1.1 metrics listener on site nodes (NodeOptions.MetricsPort=8082) 2026-06-01 16:36:59 -04:00
Joseph Doherty 43f5886024 Merge feat/adopt-zb-telemetry: adopt ZB.MOM.WW.Telemetry across ScadaBridge
AddZbTelemetry (shared OTel Resource + standard instrumentation + /metrics) wired
into both Central and Site composition roots; kept LoggerConfigurationFactory
(min-level governance) and added the shared TraceContextEnricher for trace<->log
correlation. Behaviour-preserving (no AddZbSerilog; factory retained).
2026-06-01 16:05:49 -04:00
Joseph Doherty f743ffaad2 feat(scadabridge): add shared TraceContextEnricher to log pipeline (trace correlation) 2026-06-01 15:40:42 -04:00
Joseph Doherty b3070c0bda feat(scadabridge): wire AddZbTelemetry + /metrics in both composition roots 2026-06-01 15:36:55 -04:00
Joseph Doherty 20a31835cf build(scadabridge): reference ZB.MOM.WW.Telemetry packages from Gitea feed 2026-06-01 15:30:00 -04:00
Joseph Doherty 59dca0d5fd Merge feat/adopt-zb-health: adopt ZB.MOM.WW.Health shared probes (/healthz, canonical writer, ActorSystem DI bridge) 2026-06-01 14:07:00 -04:00
579 changed files with 31363 additions and 10016 deletions
+99
View File
@@ -0,0 +1,99 @@
# Changelog
All notable changes to ScadaBridge are documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/).
## [Unreleased]
### Changed — BREAKING: canonical role names + audit separation-of-duties collapse (Task 1.7)
Role string VALUES are standardized onto the canonical vocabulary
(`Administrator`/`Designer`/`Deployer`/`Viewer`; `Operator`/`Engineer` are unused
by ScadaBridge). The legacy ScadaBridge role names were renamed and two were
**collapsed**:
| Legacy role | Canonical role | Notes |
|-----------------|-----------------|-------|
| `Admin` | `Administrator` | rename |
| `Design` | `Designer` | rename |
| `Deployment` | `Deployer` | rename |
| `Audit` | `Administrator` | **COLLAPSE** |
| `AuditReadOnly` | `Viewer` | **COLLAPSE** |
- **SECURITY — privilege escalation (accepted).** The former `Audit` role
collapses into `Administrator`. This is a real escalation: a former audit-only
user now holds the **entire admin surface** (create/update/delete sites, manage
LDAP group→role mappings and API keys, preview/import transport bundles), not
just audit read+export. This loss of auditor/admin separation-of-duties is a
deliberate, accepted trade-off of the canonicalization.
- **SECURITY — half-SoD preserved.** The former `AuditReadOnly` role collapses
into `Viewer`, which **keeps audit READ** (Audit Log page, Configuration Audit
Log page, audit nav group) but **cannot bulk-export**. The audit policy sets are
now `OperationalAuditRoles = { Administrator, Viewer }` and
`AuditExportRoles = { Administrator }`, so a `Viewer` reads the audit log but the
Export-CSV button / `/api/audit/export` endpoint correctly refuses it.
- **Enforcement.** Every enforcement site moved together: the role-claim values,
the authorization policies (`RequireAdmin`/`RequireDesign`/`RequireDeployment`
policy *names* are unchanged; only the role *values* inside them changed), the
`ManagementActor.GetRequiredRole` switch, the hard-coded site-scope admin-bypass
(`Roles.Administrator` everywhere), the `DebugStreamHub` Administrator/Deployer
gates, and the CentralUI `BrowseService`/`BindingTester` Designer guards.
**Site-scoping logic is otherwise unchanged** — only the admin-bypass *value*
moved from `"Admin"` to `Roles.Administrator`.
- **Config-DB migration `CanonicalizeRoles`.** Updates the four seeded
`LdapGroupMappings` rows (Id 1-4) to the canonical role values and adds raw
idempotent catch-all `UPDATE`s for operator-added rows
(`Admin`/`Audit` → `Administrator`, `Design` → `Designer`, `Deployment` → `Deployer`,
`AuditReadOnly` → `Viewer`). The Down migration is **lossy** for the collapse: it
best-effort maps `Administrator` → `Admin` and `Viewer` → `AuditReadOnly` but cannot
recover the original `Audit`/`Admin` or `Viewer`/`AuditReadOnly` distinction.
- **Operator action.** Any LDAP group→role mappings created with the legacy role
strings are migrated automatically by `CanonicalizeRoles`. New mappings created
via the CentralUI LDAP-mappings form now offer the canonical role values
(including a `Viewer` option for audit-read-only delegation).
### Changed — BREAKING: inbound API authentication
Inbound API authentication has migrated off the SQL Server `X-API-Key` scheme and
onto the shared `ZB.MOM.WW.Auth.ApiKeys` library.
- **Credential format.** The inbound `POST /api/{methodName}` endpoint now
authenticates an `Authorization: Bearer sbk_<keyId>_<secret>` token instead of the
raw `X-API-Key: <key>` header. The secret is verified with a peppered, constant-time
HMAC compare inside the shared library verifier.
- **Storage.** Inbound API keys now live in the shared `ZB.MOM.WW.Auth.ApiKeys` SQLite
store, not the SQL Server configuration database. The deterministic-HMAC `ApiKey`
table is gone.
- **Authorization model.** A key's allowed methods are now its per-key **scopes**
(scope string == method name, ordinal/case-sensitive). The previous
`ApiMethod.ApprovedApiKeyIds` CSV that linked methods to key IDs has been removed.
- **Peppering.** Keys are peppered per environment via
`ScadaBridge:InboundApi:ApiKeyPepper` (≥ 16 characters, **different per environment**,
kept secret). The same configuration key now backs the library verifier's pepper
secret.
> **BREAKING — all existing inbound API keys are INVALIDATED and must be re-issued.**
> Old `X-API-Key` credentials and their stored HMAC hashes are not migrated and are
> not recoverable; the `ApiKeys` table is dropped. Operators must re-issue every
> inbound key as an `sbk_…` token and update every API client. See the runbook:
> [`docs/operations/inbound-api-key-reissue.md`](docs/operations/inbound-api-key-reissue.md).
### Removed
- The SQL Server `ApiKey` entity (`ZB.MOM.WW.ScadaBridge.Commons.Entities.InboundApi.ApiKey`),
its EF Core mapping, and its `IInboundApiRepository` key methods
(`GetApiKeyByIdAsync`, `GetAllApiKeysAsync`, `GetApiKeyByValueAsync`, `AddApiKeyAsync`,
`UpdateApiKeyAsync`, `DeleteApiKeyAsync`, `GetApprovedKeysForMethodAsync`).
- The `ApiMethod.ApprovedApiKeyIds` property, its EF mapping, and the CSV
parse/serialize helpers.
- The legacy hashing code: `ApiKeyHasher` / `IApiKeyHasher` and the in-repo inbound
`ApiKeyValidator` (superseded by the shared `IApiKeyVerifier`), plus their DI
registrations and tests.
### Migrations
- `RetireInboundApiKeyStore` — drops the `ApiKeys` table and the
`ApiMethods.ApprovedApiKeyIds` column. `Down` recreates both, but **dropped keys are
not recoverable**: rolling the migration back does not restore credentials. Rollback
means reverting the deployment, then re-issuing keys.
+4 -4
View File
@@ -13,7 +13,7 @@ When a change is requested, the default assumption is: update the design doc *an
- `docs/` — Design documentation: `docs/requirements/` (high-level + per-component specs), `docs/test_infra/` (test infrastructure), `docs/plans/` (design-decision and implementation-plan docs). The spec the code implements.
- `docker/` — 8-node cluster topology (2 central + 3 sites), `deploy.sh`, per-node `appsettings.*.json`. See [`docker/README.md`](docker/README.md) for setup, ports, and management commands. Rebuild + redeploy with `bash docker/deploy.sh`.
- `docker-env2/` — Minimal second cluster topology (2 central + 1 site × 2 nodes), runs concurrently with `docker/` on host ports 91XX. Built specifically for testing the Transport (#24) feature with two real environments. See [`docker-env2/README.md`](docker-env2/README.md). Rebuild + redeploy with `bash docker-env2/deploy.sh`.
- `infra/` — Docker Compose for local test services (LDAP, MS SQL, OPC UA, SMTP, REST API, Traefik).
- `infra/` — Docker Compose for local test services (MS SQL, OPC UA, SMTP, REST API, Traefik). **LDAP is no longer started here** — dev/test LDAP is the shared GLAuth on `10.100.0.35:3893` (source of truth: `scadaproj/infra/glauth/`).
- `deploy/` — Production/on-host deployment artifacts (e.g. `deploy/wonder-app-vd03/`: `appsettings.Central.json`, `appsettings.Site.json`, `install.ps1`/`uninstall.ps1`, `RUNBOOK.md`).
- `deployments/` — Deployment topology notes (`docker-cluster.md`, `docker-cluster-env2.md`, `README.md`).
- `code-reviews/` — Per-component code-review notes (one folder per component, plus `_template`).
@@ -29,7 +29,7 @@ When a change is requested, the default assumption is: update the design doc *an
- `README.md` — Master index with component table and architecture diagrams.
- `docs/requirements/HighLevelReqs.md` — Complete high-level requirements covering all functional areas.
- `docs/requirements/Component-*.md` — Individual component design documents (one per component) — the spec the code implements.
- `docs/test_infra/test_infra.md` — Master test infrastructure doc (OPC UA, LDAP, MS SQL, SMTP, REST API, Traefik).
- `docs/test_infra/test_infra.md` — Master test infrastructure doc (OPC UA, MS SQL, SMTP, REST API, Traefik). LDAP is the shared GLAuth on `10.100.0.35:3893` (not a local infra container; see `scadaproj/infra/glauth/`).
- `docs/plans/` — Design decision and implementation-plan documents from refinement sessions.
## Sister Projects
@@ -225,5 +225,5 @@ Related repos cloned as sibling directories under `~/Desktop/` — referenced fo
- **Test user**: `--username multi-role --password password` — has Admin, Design, and Deployment roles. The `admin` user only has the Admin role and cannot create templates, data connections, or deploy.
- **Config file**: `~/.scadabridge/config.json` — stores `managementUrl` and default format. See `docker/README.md` for a ready-to-use test config.
- **Rebuild cluster**: `bash docker/deploy.sh` — builds the `scadabridge:latest` image and recreates all containers. Run this after code changes to ManagementActor, Host, or any server-side component.
- **Infrastructure services**: `cd infra && docker compose up -d` — starts LDAP, MS SQL, OPC UA, SMTP, and REST API. These are separate from the cluster containers in `docker/`.
- **All test LDAP passwords**: `password` (see `infra/glauth/config.toml` for users and groups).
- **Infrastructure services**: `cd infra && docker compose up -d` — starts MS SQL, OPC UA, SMTP, and REST API. These are separate from the cluster containers in `docker/`. **LDAP is NOT started here** — it is the shared GLAuth on `10.100.0.35:3893` (dc=zb,dc=local); source of truth and config: `scadaproj/infra/glauth/`.
- **All test LDAP passwords**: `password` (see `scadaproj/infra/glauth/config.toml` for users and groups; canonical cross-app login: `multi-role`).
+9
View File
@@ -75,8 +75,17 @@
<PackageVersion Include="ZB.MOM.WW.Health" Version="0.1.0" />
<PackageVersion Include="ZB.MOM.WW.Health.Akka" Version="0.1.0" />
<PackageVersion Include="ZB.MOM.WW.Health.EntityFrameworkCore" Version="0.1.0" />
<PackageVersion Include="ZB.MOM.WW.Telemetry" Version="0.1.0" />
<PackageVersion Include="ZB.MOM.WW.Telemetry.Serilog" Version="0.1.0" />
<PackageVersion Include="ZB.MOM.WW.MxGateway.Client" Version="0.1.0" />
<PackageVersion Include="ZB.MOM.WW.MxGateway.Contracts" Version="0.1.0" />
<PackageVersion Include="ZB.MOM.WW.Configuration" Version="0.1.0" />
<PackageVersion Include="ZB.MOM.WW.Auth.Abstractions" Version="0.1.3" />
<PackageVersion Include="ZB.MOM.WW.Auth.Ldap" Version="0.1.3" />
<PackageVersion Include="ZB.MOM.WW.Auth.ApiKeys" Version="0.1.3" />
<PackageVersion Include="ZB.MOM.WW.Auth.AspNetCore" Version="0.1.3" />
<PackageVersion Include="ZB.MOM.WW.Audit" Version="0.1.0" />
<PackageVersion Include="ZB.MOM.WW.Theme" Version="0.3.1" />
</ItemGroup>
</Project>
+6 -2
View File
@@ -4,7 +4,7 @@ ScadaBridge is a centrally-managed, distributed SCADA configuration and deployme
## Overview
This repository is the full **implementation** project for ScadaBridge — the C#/.NET source (`src/`), tests (`tests/`), deployable Docker topology (`docker/`, `docker-env2/`, `infra/`), and the design documentation (`docs/`) that the code implements. This README is the master index: it links the per-component **design specs** (the spec the code in `src/` implements) and shows the system architecture. The solution file is `ZB.MOM.WW.ScadaBridge.slnx`.
This repository is the full **implementation** project for ScadaBridge — the C#/.NET source (`src/`), tests (`tests/`), deployable Docker topology (`docker/`, `docker-env2/`, `infra/`), and the design documentation (`docs/`) that the code implements. This README is the master index: it links the per-component **design specs** (the spec the code in `src/` implements), the per-component **reference docs** (how the shipped code works), and shows the system architecture. The solution file is `ZB.MOM.WW.ScadaBridge.slnx`.
### Technology Stack
@@ -32,7 +32,7 @@ This repository is the full **implementation** project for ScadaBridge — the C
|------|----------|
| `src/` | C#/.NET implementation — one project per component (`ZB.MOM.WW.ScadaBridge.<Component>`). Solution: `ZB.MOM.WW.ScadaBridge.slnx`. |
| `tests/` | Unit and integration test projects. |
| `docs/` | Design documentation — `docs/requirements/` (high-level + per-component specs, the spec the code implements), `docs/test_infra/`, `docs/plans/`. |
| `docs/` | Documentation — `docs/requirements/` (high-level + per-component specs, the spec the code implements), `docs/components/` (per-component developer reference — how the code works), `docs/test_infra/`, `docs/plans/`. |
| `docker/` | Primary 8-node cluster topology (2 central + 3 sites × 2 nodes + Traefik) + `deploy.sh`. |
| `docker-env2/` | Minimal second cluster (2 central + 1 site) for exercising Transport (#24) against a real second environment. |
| `infra/` | Local test services (MS SQL, OPC UA, SMTP, REST API, Traefik). LDAP is not started here — dev/test LDAP is the shared GLAuth on `10.100.0.35:3893`. |
@@ -103,6 +103,10 @@ Both stacks share the infrastructure services in [`infra/`](infra/) (MS SQL, LDA
**Shared UI sub-component** (not a top-level component): [TreeView](docs/requirements/Component-TreeView.md) — reusable hierarchical tree/grid Blazor component used by the Central UI (#9) for the templates folder hierarchy, data-connection browse, and tag pickers.
### Component Reference Documentation
The design documents above are the **specs** (what each component does and why). For developer **reference** docs that describe how the shipped code works — with real code examples, drawn from `src/` and written to the [Style Guide](StyleGuide.md) — see [docs/components/](docs/components/). One doc per component (plus the shared TreeView), indexed in [docs/components/README.md](docs/components/README.md).
### Reference Documentation
- [AkkaDotNet/](AkkaDotNet/) — Akka.NET reference notes covering actors, remoting, clustering, persistence, streams, serialization, hosting, testing, and best practices.
@@ -22,17 +22,20 @@
"MachineDataDb": "Server=scadabridge-mssql,1433;Database=ScadaBridgeMachineData2;User Id=scadabridge_app;Password=ScadaBridge_Dev1#;TrustServerCertificate=true"
},
"Security": {
"LdapServer": "scadabridge-ldap",
"LdapPort": 3893,
"LdapUseTls": false,
"AllowInsecureLdap": true,
"LdapSearchBase": "dc=scadabridge,dc=local",
"LdapServiceAccountDn": "cn=admin,dc=scadabridge,dc=local",
"LdapServiceAccountPassword": "password",
"Ldap": {
"Server": "10.100.0.35",
"Port": 3893,
"Transport": "None",
"AllowInsecure": true,
"SearchBase": "dc=zb,dc=local",
"ServiceAccountDn": "cn=serviceaccount,dc=zb,dc=local",
"ServiceAccountPassword": "serviceaccount123"
},
"JwtSigningKey": "scadabridge-env2-dev-jwt-signing-key-must-be-at-least-32-characters-long",
"JwtExpiryMinutes": 15,
"IdleTimeoutMinutes": 30,
"RequireHttpsCookie": false
"RequireHttpsCookie": false,
"CookieName": "ZB.MOM.WW.ScadaBridge.Auth.env2"
},
"Communication": {
"DeploymentTimeout": "00:02:00",
@@ -22,17 +22,20 @@
"MachineDataDb": "Server=scadabridge-mssql,1433;Database=ScadaBridgeMachineData2;User Id=scadabridge_app;Password=ScadaBridge_Dev1#;TrustServerCertificate=true"
},
"Security": {
"LdapServer": "scadabridge-ldap",
"LdapPort": 3893,
"LdapUseTls": false,
"AllowInsecureLdap": true,
"LdapSearchBase": "dc=scadabridge,dc=local",
"LdapServiceAccountDn": "cn=admin,dc=scadabridge,dc=local",
"LdapServiceAccountPassword": "password",
"Ldap": {
"Server": "10.100.0.35",
"Port": 3893,
"Transport": "None",
"AllowInsecure": true,
"SearchBase": "dc=zb,dc=local",
"ServiceAccountDn": "cn=serviceaccount,dc=zb,dc=local",
"ServiceAccountPassword": "serviceaccount123"
},
"JwtSigningKey": "scadabridge-env2-dev-jwt-signing-key-must-be-at-least-32-characters-long",
"JwtExpiryMinutes": 15,
"IdleTimeoutMinutes": 30,
"RequireHttpsCookie": false
"RequireHttpsCookie": false,
"CookieName": "ZB.MOM.WW.ScadaBridge.Auth.env2"
},
"Communication": {
"DeploymentTimeout": "00:02:00",
+12
View File
@@ -6,6 +6,12 @@ services:
SCADABRIDGE_CONFIG: Central
ASPNETCORE_ENVIRONMENT: Development
ASPNETCORE_URLS: "http://+:5000"
# DEV-ONLY local-cluster value — NOT a real secret. The Auth/Config normalization
# (2026-06-03) made ScadaBridge:InboundApi:ApiKeyPepper a hard Central startup
# requirement (>=16 chars, per-environment). Distinct from the docker/ cluster's
# pepper per the "different per environment" guidance; real deployments inject a
# true secret out-of-band, never from source control. Both Central nodes share it.
ScadaBridge__InboundApi__ApiKeyPepper: "dev-only-insecure-pepper-env2-cluster-0001"
ports:
- "9101:5000" # Web UI + Inbound API
- "9111:8081" # Akka remoting
@@ -23,6 +29,12 @@ services:
SCADABRIDGE_CONFIG: Central
ASPNETCORE_ENVIRONMENT: Development
ASPNETCORE_URLS: "http://+:5000"
# DEV-ONLY local-cluster value — NOT a real secret. The Auth/Config normalization
# (2026-06-03) made ScadaBridge:InboundApi:ApiKeyPepper a hard Central startup
# requirement (>=16 chars, per-environment). Distinct from the docker/ cluster's
# pepper per the "different per environment" guidance; real deployments inject a
# true secret out-of-band, never from source control. Both Central nodes share it.
ScadaBridge__InboundApi__ApiKeyPepper: "dev-only-insecure-pepper-env2-cluster-0001"
ports:
- "9102:5000" # Web UI + Inbound API
- "9112:8081" # Akka remoting
@@ -22,13 +22,15 @@
"MachineDataDb": "Server=scadabridge-mssql,1433;Database=ScadaBridgeMachineData;User Id=scadabridge_app;Password=ScadaBridge_Dev1#;TrustServerCertificate=true"
},
"Security": {
"LdapServer": "scadabridge-ldap",
"LdapPort": 3893,
"LdapUseTls": false,
"AllowInsecureLdap": true,
"LdapSearchBase": "dc=scadabridge,dc=local",
"LdapServiceAccountDn": "cn=admin,dc=scadabridge,dc=local",
"LdapServiceAccountPassword": "password",
"Ldap": {
"Server": "10.100.0.35",
"Port": 3893,
"Transport": "None",
"AllowInsecure": true,
"SearchBase": "dc=zb,dc=local",
"ServiceAccountDn": "cn=serviceaccount,dc=zb,dc=local",
"ServiceAccountPassword": "serviceaccount123"
},
"JwtSigningKey": "scadabridge-dev-jwt-signing-key-must-be-at-least-32-characters-long",
"JwtExpiryMinutes": 15,
"IdleTimeoutMinutes": 30,
@@ -22,13 +22,15 @@
"MachineDataDb": "Server=scadabridge-mssql,1433;Database=ScadaBridgeMachineData;User Id=scadabridge_app;Password=ScadaBridge_Dev1#;TrustServerCertificate=true"
},
"Security": {
"LdapServer": "scadabridge-ldap",
"LdapPort": 3893,
"LdapUseTls": false,
"AllowInsecureLdap": true,
"LdapSearchBase": "dc=scadabridge,dc=local",
"LdapServiceAccountDn": "cn=admin,dc=scadabridge,dc=local",
"LdapServiceAccountPassword": "password",
"Ldap": {
"Server": "10.100.0.35",
"Port": 3893,
"Transport": "None",
"AllowInsecure": true,
"SearchBase": "dc=zb,dc=local",
"ServiceAccountDn": "cn=serviceaccount,dc=zb,dc=local",
"ServiceAccountPassword": "serviceaccount123"
},
"JwtSigningKey": "scadabridge-dev-jwt-signing-key-must-be-at-least-32-characters-long",
"JwtExpiryMinutes": 15,
"IdleTimeoutMinutes": 30,
+12
View File
@@ -6,6 +6,12 @@ services:
SCADABRIDGE_CONFIG: Central
ASPNETCORE_ENVIRONMENT: Development
ASPNETCORE_URLS: "http://+:5000"
# DEV-ONLY local-cluster value — NOT a real secret. The Auth/Config normalization
# (2026-06-03) made ScadaBridge:InboundApi:ApiKeyPepper a hard Central startup
# requirement (>=16 chars, per-environment). Real deployments inject a true secret
# out-of-band (env/secret store), never from source control — see
# docs/operations/inbound-api-key-reissue.md. Both Central nodes share one pepper.
ScadaBridge__InboundApi__ApiKeyPepper: "dev-only-insecure-pepper-docker-cluster-0001"
ports:
- "9001:5000" # Web UI + Inbound API
- "9011:8081" # Akka remoting (host access for CLI/debugging)
@@ -23,6 +29,12 @@ services:
SCADABRIDGE_CONFIG: Central
ASPNETCORE_ENVIRONMENT: Development
ASPNETCORE_URLS: "http://+:5000"
# DEV-ONLY local-cluster value — NOT a real secret. The Auth/Config normalization
# (2026-06-03) made ScadaBridge:InboundApi:ApiKeyPepper a hard Central startup
# requirement (>=16 chars, per-environment). Real deployments inject a true secret
# out-of-band (env/secret store), never from source control — see
# docs/operations/inbound-api-key-reissue.md. Both Central nodes share one pepper.
ScadaBridge__InboundApi__ApiKeyPepper: "dev-only-insecure-pepper-docker-cluster-0001"
ports:
- "9002:5000" # Web UI + Inbound API
- "9012:8081" # Akka remoting
+2 -1
View File
@@ -6,7 +6,8 @@
"NodeHostname": "scadabridge-site-a-a",
"SiteId": "site-a",
"RemotingPort": 8082,
"GrpcPort": 8083
"GrpcPort": 8083,
"MetricsPort": 8084
},
"Cluster": {
"SeedNodes": [
+2 -1
View File
@@ -6,7 +6,8 @@
"NodeHostname": "scadabridge-site-a-b",
"SiteId": "site-a",
"RemotingPort": 8082,
"GrpcPort": 8083
"GrpcPort": 8083,
"MetricsPort": 8084
},
"Cluster": {
"SeedNodes": [
+2 -1
View File
@@ -6,7 +6,8 @@
"NodeHostname": "scadabridge-site-b-a",
"SiteId": "site-b",
"RemotingPort": 8082,
"GrpcPort": 8083
"GrpcPort": 8083,
"MetricsPort": 8084
},
"Cluster": {
"SeedNodes": [
+2 -1
View File
@@ -6,7 +6,8 @@
"NodeHostname": "scadabridge-site-b-b",
"SiteId": "site-b",
"RemotingPort": 8082,
"GrpcPort": 8083
"GrpcPort": 8083,
"MetricsPort": 8084
},
"Cluster": {
"SeedNodes": [
+2 -1
View File
@@ -6,7 +6,8 @@
"NodeHostname": "scadabridge-site-c-a",
"SiteId": "site-c",
"RemotingPort": 8082,
"GrpcPort": 8083
"GrpcPort": 8083,
"MetricsPort": 8084
},
"Cluster": {
"SeedNodes": [
+2 -1
View File
@@ -6,7 +6,8 @@
"NodeHostname": "scadabridge-site-c-b",
"SiteId": "site-c",
"RemotingPort": 8082,
"GrpcPort": 8083
"GrpcPort": 8083,
"MetricsPort": 8084
},
"Cluster": {
"SeedNodes": [
+244
View File
@@ -0,0 +1,244 @@
# Audit Log
The Audit Log component records every action a site or central script takes across a trust boundary — outbound API calls, outbound database writes, notification sends, and inbound API requests — into a central append-only `AuditLog` table, with a site SQLite hot-path, gRPC telemetry forwarding, and a reconciliation fallback.
## Overview
Audit Log (#23) is a layered subsystem that runs on both site and central nodes. It exists alongside the operational stores it complements — `Notifications` (Notification Outbox, #21) and `SiteCalls` (Site Call Audit, #22) — rather than replacing them. The operational tables answer "what is the current state of this notification / cached call?"; the `AuditLog` answers "what happened, in what order, who did it, and what crossed the boundary?".
The component code lives in `src/ZB.MOM.WW.ScadaBridge.AuditLog/`, split by role:
- `Site/` — the script-thread write path: `SqliteAuditWriter`, the `FallbackAuditWriter` chain, and the `Site/Telemetry/` drain that pushes rows to central.
- `Central/` — the central-node ingest singleton (`AuditLogIngestActor`), the direct-write path (`CentralAuditWriter`), the reconciliation puller (`SiteAuditReconciliationActor`), and retention maintenance.
- `Configuration/`, `Redaction/`, `Payload/` — options, the redactor, and the truncation/redaction primitives.
The same DI entry point, `ServiceCollectionExtensions.AddAuditLog`, registers the writer chain on every host; central nodes additionally call `AddAuditLogCentralMaintenance`, and site nodes call `AddAuditLogHealthMetricsBridge`. Because `AddAuditLog` runs on both site and central composition roots, it never registers a hosted service that would resolve a central-only dependency on a site — central-only registrations are split into their own helper.
## Key Concepts
### Script trust boundary
The audited scope is the script trust boundary, not framework traffic. The four channels are modelled by the `AuditChannel` enum (`ApiOutbound`, `DbOutbound`, `Notification`, `ApiInbound`), and the specific action by `AuditKind` (for example `ApiCall`, `DbWriteCached`, `NotifySend`, `InboundRequest`). Every row is built through `ScadaBridgeAuditEventFactory.Create`, which maps the domain vocabulary onto the canonical record: `Channel`/`Kind`/`Status` become `Action`/`Category`/`Outcome` plus a `DetailsJson` extension bag carrying every other domain field.
### Canonical `AuditEvent` and `DetailsJson`
The transport type is the canonical `ZB.MOM.WW.Audit.AuditEvent` record — ten fields: `EventId`, `OccurredAtUtc`, `Actor`, `Action`, `Outcome`, `Category`, `Target`, `SourceNode`, `CorrelationId`, `DetailsJson`. ScadaBridge domain fields (`ExecutionId`, `ParentExecutionId`, `SourceSiteId`, `RequestSummary`, `IngestedAtUtc`, and so on) ride inside `DetailsJson` as an `AuditDetails` record, serialized by `AuditDetailsCodec`. `AuditRowProjection.Decompose` / `Recompose` move between the canonical record and the domain view.
### `ExecutionId` vs `CorrelationId`
`CorrelationId` is the canonical top-level field and carries the per-operation lifecycle id — for cached calls it is the `TrackedOperationId`, and the cached-telemetry drain reads it back out to join the audit row to its operational tracking row (see `SiteAuditTelemetryActor.OnCachedDrainAsync`). `ExecutionId` is a `DetailsJson` field: the per-run correlation value shared by every row a single script execution or inbound request emits. `ParentExecutionId` (also in `DetailsJson`) is the cross-execution spawn pointer that bridges, for example, an inbound API request to the site script it routes to.
### One row per lifecycle event
Each lifecycle event is one row. A synchronous call produces a single row; a cached call produces several (`Submitted`, `Forwarded`, `Attempted`, then a terminal `Delivered`/`Parked`/`Discarded`). Idempotency is on `EventId`, so the same row arriving twice — from a telemetry retry and from a reconciliation pull — collapses to a no-op everywhere it is written.
## Architecture
### Site write path
`SqliteAuditWriter` is the site hot-path store: a singleton holding one owned `SqliteConnection` behind a write lock, fed by a bounded `Channel<T>` that a background task drains in batches, so script threads never block on disk I/O. It writes two tables — the append-only canonical `audit_event` and a mutable `audit_forward_state` sidecar that tracks the forwarding lifecycle (from `SqliteAuditWriter.InitializeSchema`):
```sql
-- Site-local store for canonical audit rows: one row per lifecycle event.
-- The ten columns mirror the ten fields of the canonical AuditEvent record.
-- EventId is the primary key, so the same event cannot be stored twice.
CREATE TABLE IF NOT EXISTS audit_event (
EventId TEXT NOT NULL,
OccurredAtUtc TEXT NOT NULL,
Actor TEXT NOT NULL,
Action TEXT NOT NULL,
Outcome TEXT NOT NULL,
-- The remaining columns are optional (nullable).
Category TEXT NULL,
Target TEXT NULL,
SourceNode TEXT NULL,
CorrelationId TEXT NULL,
-- JSON extension bag carrying the ScadaBridge domain fields (AuditDetails).
DetailsJson TEXT NULL,
PRIMARY KEY (EventId)
);
```
The sidecar carries `ForwardState` (`Pending`/`Forwarded`/`Reconciled`, per the `AuditForwardState` enum), a duplicated `OccurredAtUtc` for the drain range scan, and a precomputed `IsCachedKind` flag so the cached/non-cached drain split is an integer predicate, not a `DetailsJson` parse on the read hot-path. The site store is ephemeral (roughly 7-day retention, recreated per deployment), so a schema change is an in-place reset rather than a migration.
`SqliteAuditWriter` also implements `ISiteAuditQueue`, the read/mark surface the drain and the reconciliation pull handler consume. The same singleton instance is bound to both `ISiteAuditQueue` and the hot-path `IAuditWriter` so the drain observes exactly the rows the script threads wrote.
### Site fallback chain
The script-facing `IAuditWriter` is `FallbackAuditWriter`, not the raw `SqliteAuditWriter`. It redacts once up front, attempts the primary write, and on any primary failure stashes the (already redacted) row in a drop-oldest `RingBufferFallback` and returns success — a primary outage must never reach the calling script:
```csharp
public async Task WriteAsync(AuditEvent evt, CancellationToken ct = default)
{
ArgumentNullException.ThrowIfNull(evt);
var filtered = _redactor.Apply(evt);
try
{
await _primary.WriteAsync(filtered, ct).ConfigureAwait(false);
}
catch (Exception ex)
{
_failureCounter.Increment();
_logger.LogWarning(ex,
"Primary audit writer threw; routing EventId {EventId} to drop-oldest ring.",
filtered.EventId);
_ring.TryEnqueue(filtered);
return;
}
if (_ring.Count > 0)
{
await TryDrainRingAsync(ct).ConfigureAwait(false);
}
}
```
On the next successful primary write the ring drains back through the primary in FIFO order.
### Telemetry forward and central ingest
`SiteAuditTelemetryActor` drains the local queue and pushes to central over two parallel transports, each self-ticking on its own cadence (`BusyIntervalSeconds` while rows flow, `IdleIntervalSeconds` when empty):
- `IngestAuditEvents` for single-row lifecycle events (sync `ApiCall`/`DbWrite`, `NotifySend`, `InboundRequest`).
- `IngestCachedTelemetry` for cached-call rows, each joined to its `IOperationTrackingStore` snapshot so central can write `AuditLog` and `SiteCalls` together.
On the central node, `AuditLogIngestActor` is a cluster singleton. It opens a fresh DI scope per message because `IAuditLogRepository` is a scoped EF Core service, stamps `IngestedAtUtc`, and inserts idempotently — one bad row never sinks the batch:
```csharp
private async Task IngestWithRepositoryAsync(
IAuditLogRepository repository,
IAuditRedactor? redactor,
ICentralAuditWriteFailureCounter? failureCounter,
IngestAuditEventsCommand cmd,
DateTime nowUtc,
List<Guid> accepted)
{
foreach (var evt in cmd.Events)
{
try
{
var safeRedactor = redactor ?? SafeDefaultAuditRedactor.Instance;
var filtered = safeRedactor.Apply(evt);
var ingested = AuditRowProjection.WithIngestedAtUtc(filtered, nowUtc);
await repository.InsertIfNotExistsAsync(ingested).ConfigureAwait(false);
accepted.Add(evt.EventId);
}
catch (Exception ex)
{
try { failureCounter?.Increment(); }
catch { /* counter must never throw — defence in depth */ }
_logger.LogError(ex,
"Failed to persist audit event {EventId} during batch ingest; row will be retried by the site.",
evt.EventId);
}
}
}
```
The cached path, `OnCachedTelemetryAsync`, wraps each entry in its own MS SQL transaction and writes the `AuditLog` row and the `SiteCalls` row together, so the audit and operational mirrors never drift mid-row.
### Central direct-write
Events that originate on central — Notification Outbox dispatch and Inbound API — never go through site telemetry. They call `ICentralAuditWriter`, implemented by `CentralAuditWriter`, which redacts, stamps `SourceNode` from `INodeIdentityProvider` when the caller has not, opens a per-call scope, and inserts idempotently. Like every audit path, it swallows and logs failures rather than propagating them.
### Reconciliation and retention
`SiteAuditReconciliationActor` is a central singleton that, on a timer, pulls from each site the rows at or after a per-site cursor and ingests them idempotently — the self-healing fallback for telemetry the push path missed. `AuditLogPurgeActor` drives the daily partition-switch purge against the central table, and `AuditLogPartitionMaintenanceService` rolls the monthly partition function forward so inserts never land in an unbounded tail partition.
## Usage
Rows are written through one of two DI seams, never constructed ad hoc. Site boundary code resolves the hot-path `IAuditWriter` — the `FallbackAuditWriter` shown above — and writes without blocking on disk and without ever throwing an audit failure back at the script. Central-originated events (Notification Outbox dispatch, Inbound API) resolve `ICentralAuditWriter` instead; its `CentralAuditWriter` implementation redacts, stamps `SourceNode`, opens a per-call EF Core scope, and inserts idempotently, swallowing failures the same way:
```csharp
public async Task WriteAsync(AuditEvent evt, CancellationToken ct = default)
{
if (evt is null)
{
_logger.LogWarning("CentralAuditWriter.WriteAsync received null event; ignoring.");
return;
}
try
{
var filtered = _redactor.Apply(evt);
if (filtered.SourceNode is null && _nodeIdentity?.NodeName is { } nodeName)
{
filtered = filtered with { SourceNode = nodeName };
}
await using var scope = _services.CreateAsyncScope();
var repo = scope.ServiceProvider.GetRequiredService<IAuditLogRepository>();
var stamped = AuditRowProjection.WithIngestedAtUtc(filtered, DateTime.UtcNow);
await repo.InsertIfNotExistsAsync(stamped, ct).ConfigureAwait(false);
}
catch (Exception ex)
{
try { _failureCounter.Increment(); }
catch { /* counter must never throw — defence in depth */ }
_logger.LogWarning(ex,
"CentralAuditWriter failed for EventId {EventId} (Action={Action}, Outcome={Outcome})",
evt.EventId, evt.Action, evt.Outcome);
}
}
```
The two writer seams are intentionally distinct DI bindings: `IAuditWriter` is the site/boundary hot-path, `ICentralAuditWriter` is the central direct-write path. Keeping them separate stops a site composition root from accidentally resolving the central writer, which depends on a scoped `IAuditLogRepository` only registered by the Configuration Database.
## Configuration
The top-level options class is `AuditLogOptions`, bound from the `AuditLog` section and validated on startup by `AuditLogOptionsValidator`. The writer and telemetry collaborators bind from nested sections; the constant section names live on `ServiceCollectionExtensions`.
| Section | Key | Default | Description |
|---------|-----|---------|-------------|
| `AuditLog` | `DefaultCapBytes` | `8192` | Payload-summary cap in bytes. Must be `> 0`. |
| `AuditLog` | `ErrorCapBytes` | `65536` | Cap on error rows. Must be `>=` `DefaultCapBytes`. |
| `AuditLog` | `InboundMaxBytes` | `1048576` | Per-body ceiling for `ApiInbound` summaries. Range `[8192, 16777216]`. |
| `AuditLog` | `HeaderRedactList` | `Authorization`, `X-Api-Key`, `Cookie`, `Set-Cookie` | HTTP headers redacted before persistence. |
| `AuditLog` | `GlobalBodyRedactors` | empty | Body-content redactor regex patterns applied globally. |
| `AuditLog` | `PerTargetOverrides` | empty | Per-target overrides keyed by target name (`CapBytes`, `AdditionalBodyRedactors`, `RedactSqlParamsMatching`). |
| `AuditLog` | `RetentionDays` | `365` | Central retention window. Range `[30, 3650]`. |
| `AuditLog:SiteWriter` | `DatabasePath` | `auditlog.db` | Site SQLite file path. |
| `AuditLog:SiteWriter` | `ChannelCapacity` | `4096` | Bounded write-queue capacity. |
| `AuditLog:SiteWriter` | `BatchSize` | `256` | Max events per write transaction. |
| `AuditLog:SiteTelemetry` | `BatchSize` | `256` | Max rows per gRPC drain batch. |
| `AuditLog:SiteTelemetry` | `BusyIntervalSeconds` | `5` | Drain delay while rows are flowing. |
| `AuditLog:SiteTelemetry` | `IdleIntervalSeconds` | `30` | Drain delay when the queue is empty. |
| `AuditLog:PartitionMaintenance` | `IntervalSeconds` | `86400` | Partition roll-forward cadence. |
| `AuditLog:PartitionMaintenance` | `LookaheadMonths` | `1` | Future months `pf_AuditLog_Month` must always cover. |
`PerTargetRedactionOverride` is additive: per-target body redactors append to the global list, and `RedactSqlParamsMatching` is an opt-in case-insensitive regex applied only to `DbOutbound` rows (SQL parameter values are captured verbatim by default). Header redaction always runs — when no redactor is wired, the paths fall back to `SafeDefaultAuditRedactor`, which scrubs the default sensitive headers regardless.
`SiteAuditReconciliationOptions` exposes `ReconciliationIntervalSeconds` (default `300`) and `StalledAfterNonDrainingCycles` (default `2`); `AuditLogPurgeOptions` exposes `IntervalHours` (default `24`), while the purge window itself is sourced from `AuditLogOptions.RetentionDays` so retention is tuned from one place.
## Dependencies & Interactions
- [Commons (#16)](./Commons.md) — owns the canonical `AuditEvent` shape consumed here (via `ZB.MOM.WW.Audit`), the `IAuditWriter` / `ICentralAuditWriter` / `ISiteAuditQueue` interfaces, the `AuditChannel` / `AuditKind` / `AuditForwardState` enums, the `AuditDetails` / `AuditRowProjection` / `ScadaBridgeAuditEventFactory` projection types, and the ingest/pull message contracts.
- [Configuration Database (#17)](./ConfigurationDatabase.md) — registers the scoped `IAuditLogRepository` (the central `dbo.AuditLog` table, partition boundaries, and `InsertIfNotExistsAsync` idempotency). Central hosts must call `AddConfigurationDatabase` for the ingest, direct-write, reconciliation, and purge paths to resolve their repository.
- [CentralSite Communication (#5)](./Communication.md) — supplies the gRPC transport: `IngestAuditEvents` / `IngestCachedTelemetry` push and the `PullAuditEvents` reconciliation pull, plus the DTO mappers.
- [Site Call Audit (#22)](./SiteCallAudit.md) — shares the combined cached-telemetry packet. `AuditLogIngestActor.OnCachedTelemetryAsync` writes the `AuditLog` row and the `SiteCalls` upsert in one transaction; sites remain the source of truth for cached-call status.
- [Notification Outbox (#21)](./NotificationOutbox.md) — a central direct-write caller of `ICentralAuditWriter` (dispatch lifecycle rows), alongside the Inbound API.
- [Health Monitoring (#11)](./HealthMonitoring.md) — `AddAuditLogHealthMetricsBridge` replaces the NoOp failure counters with bridges that surface `SiteAuditWriteFailures` and `AuditRedactionFailure` on the site health report; `SiteAuditBacklogReporter` polls the writer backlog. On central, `AuditCentralHealthSnapshot` exposes `CentralAuditWriteFailures`, `AuditRedactionFailure`, and per-site `SiteAuditTelemetryStalled`.
- Design spec: [Component-AuditLog.md](../requirements/Component-AuditLog.md).
## Troubleshooting
### Telemetry loss self-heals
If the push path misses rows (a gRPC blip, a central restart, a site briefly offline), the site keeps those rows `Pending` and `SiteAuditReconciliationActor` re-pulls them on its next tick. Idempotency on `EventId` makes duplicates from both paths a no-op, so no operator action is required. The reconciliation cursor is in-memory; a singleton restart resets it to `DateTime.MinValue`, which re-pulls everything the site still holds — conservative but correct.
### A row repeatedly fails to ingest
`SiteAuditReconciliationActor` tracks per-`EventId` insert failures. While a row keeps failing below `MaxPermanentInsertAttempts` (5), the site cursor is held back so the next tick retries it. At the threshold the actor logs `Critical`, permanently abandons that one row, and advances the cursor so a single broken row cannot block all further progress for the site. A `Critical` log line naming an abandoned `EventId` is the signal to investigate that row's payload.
### A site shows as stalled
When `StalledAfterNonDrainingCycles` consecutive reconciliation cycles (two by default) each return rows and report `MoreAvailable=true`, the backlog is not draining and the actor latches the site as stalled, publishing `SiteAuditTelemetryStalledChanged` on the EventStream (surfaced as `SiteAuditTelemetryStalled` on the central health snapshot). Only transitions are published, so a stalled site does not flood the health surface.
### Audit writes never abort the action
Every write path is best-effort by contract. A primary SQLite failure routes to the ring buffer; an ingest or direct-write failure is swallowed, logged, and counted on the health surface. The audited action's own success/failure path is authoritative — a missing audit row never means the action failed. The site retention purge enforces the matching invariant: a row is not dropped until it has reached `Forwarded` or `Reconciled`.
## Related Documentation
- [Audit Log design specification](../requirements/Component-AuditLog.md)
- [Site Call Audit](./SiteCallAudit.md)
- [Notification Outbox](./NotificationOutbox.md)
- [Configuration Database](./ConfigurationDatabase.md)
- [CentralSite Communication](./Communication.md)
- [Commons](./Commons.md)
- [Health Monitoring](./HealthMonitoring.md)
+250
View File
@@ -0,0 +1,250 @@
# CLI
The CLI is a standalone command-line tool that provides scripting and automation access to all ScadaBridge administrative operations. It connects to the central cluster's HTTP Management API and dispatches commands to the `ManagementActor` via `POST /management`. Authentication and role resolution happen server-side; the CLI sends LDAP credentials as HTTP Basic Auth on every request.
## Overview
The CLI component lives in `src/ZB.MOM.WW.ScadaBridge.CLI/` and builds to a self-contained `scadabridge` executable (or `scadabridge.exe` on Windows). It is not part of the Host binary — it deploys on any machine with HTTP access to a central node.
The tool is built on `System.CommandLine` and exposes a hierarchical command tree organized by management domain (`template`, `instance`, `site`, `data-connection`, and so on). Every command follows the same pattern: resolve the management URL and credentials, construct a command object whose type maps to a `ManagementCommandRegistry` entry, serialize it into the `{ command, payload }` JSON envelope, and `POST` it to `<managementUrl>/management`. The server response — JSON success body or `{ error, code }` error envelope — is printed to stdout or stderr respectively.
The CLI is the preferred automation interface. When setting up system state (sites, templates, data connections, deployments, security) without the Central UI, use `scadabridge` commands rather than direct database manipulation.
## Key Concepts
### Connection to the active node
The CLI connects through Traefik, the reverse proxy fronting the central cluster. Traefik routes each request to the active central node based on the `/health/active` probe, so the CLI does not need to know which node (`central-a` or `central-b`) is currently primary. Pointing `--url` (or `managementUrl`) at the Traefik address (default `http://localhost:9000` in Docker) provides automatic HA failover without any CLI-side configuration change.
Direct-to-node access (`http://localhost:9001` for `central-a`, `http://localhost:9002` for `central-b`) bypasses Traefik and is useful for diagnostics but not for production automation.
### Command dispatch
Most commands serialize to a named management command and reach the `ManagementActor` through the `POST /management` endpoint. The `audit query` and `audit export` commands are the exception — they call plain REST endpoints (`GET /api/audit/query`, `GET /api/audit/export`) introduced by Audit Log (#23) and therefore use `SendGetAsync`/`SendGetStreamAsync` on `ManagementHttpClient` rather than `SendCommandAsync`. The streaming `audit export` path uses `HttpCompletionOption.ResponseHeadersRead` to avoid buffering large payloads in memory.
### Authentication and roles
The CLI encodes `--username` and `--password` as an HTTP Basic Auth header on every request. The server performs the LDAP bind, group lookup, and role resolution. The CLI never contacts LDAP directly and never caches credentials between invocations.
Role enforcement is applied by the `ManagementActor`. Operations require the appropriate role:
| Role | Covers |
|------|--------|
| `Admin` | Security settings, site management, SMTP config, audit-config queries |
| `Design` | Templates, shared scripts, external systems, DB connections, API methods, notification lists |
| `Deployment` | Instance lifecycle (deploy, enable, disable, delete), data connection bindings, bundle import/export |
| `OperationalAudit` | Reading the Audit Log (`audit query`) |
| `AuditExport` | Exporting the Audit Log (`audit export`) |
A request lacking the required role exits with code `2` (authorization failure). A bad-credential response (HTTP 401) exits with code `1`.
### Output formats
Every command accepts `--format json` (default) or `--format table`. For JSON output, successful server responses are written to stdout verbatim — the server controls the JSON shape and no re-serialization is applied. Locally-constructed output (errors, `debug stream` events) is serialized by `OutputFormatter` with indentation and camelCase. Table output renders a padded plain-text table derived from the response JSON — arrays become rows, single objects become a two-column `Property / Value` table. Errors are always written as `{ "error": "...", "code": "..." }` to stderr, regardless of format.
## Architecture
`Program.cs` builds the `RootCommand` tree, attaches the four global options (`--url`, `--username`, `--password`, `--format`), and registers each command group:
```csharp
rootCommand.Add(TemplateCommands.Build(urlOption, formatOption, usernameOption, passwordOption));
rootCommand.Add(InstanceCommands.Build(urlOption, formatOption, usernameOption, passwordOption));
rootCommand.Add(SiteCommands.Build(urlOption, formatOption, usernameOption, passwordOption));
rootCommand.Add(DeployCommands.Build(urlOption, formatOption, usernameOption, passwordOption));
rootCommand.Add(DataConnectionCommands.Build(urlOption, formatOption, usernameOption, passwordOption));
rootCommand.Add(ExternalSystemCommands.Build(urlOption, formatOption, usernameOption, passwordOption));
rootCommand.Add(NotificationCommands.Build(urlOption, formatOption, usernameOption, passwordOption));
rootCommand.Add(SecurityCommands.Build(urlOption, formatOption, usernameOption, passwordOption));
rootCommand.Add(AuditLogCommands.Build(urlOption, formatOption, usernameOption, passwordOption));
rootCommand.Add(AuditCommands.Build(urlOption, formatOption, usernameOption, passwordOption));
rootCommand.Add(HealthCommands.Build(urlOption, formatOption, usernameOption, passwordOption));
rootCommand.Add(DebugCommands.Build(urlOption, formatOption, usernameOption, passwordOption));
rootCommand.Add(SharedScriptCommands.Build(urlOption, formatOption, usernameOption, passwordOption));
rootCommand.Add(DbConnectionCommands.Build(urlOption, formatOption, usernameOption, passwordOption));
rootCommand.Add(ApiMethodCommands.Build(urlOption, formatOption, usernameOption, passwordOption));
rootCommand.Add(BundleCommands.Build(urlOption, formatOption, usernameOption, passwordOption));
```
Every command's action delegates to `CommandHelpers.ExecuteCommandAsync`, which owns the URL/credential resolution, `ManagementHttpClient` lifetime, and exit-code mapping. This consolidation means the authorization-failure exit code (`2`) is enforced uniformly — including for the `bundle` group, which uses `ExecuteCommandAsync` with a longer 5-minute timeout and a per-command `onSuccess` handler rather than a separate HTTP path.
`CliConfig.Load()` is called at the start of every invocation. It merges `~/.scadabridge/config.json`, environment variables, and any defaults. A malformed or unreadable config file emits a warning to stderr and falls through to the environment variable / command-line precedence chain without crashing.
## Usage
### Running the CLI
```bash
# Build
dotnet build src/ZB.MOM.WW.ScadaBridge.CLI
# Minimal invocation (URL and credentials required)
scadabridge --url http://localhost:9000 --username multi-role --password password template list
# Using Traefik (HA; routes to the active node automatically)
scadabridge --url http://localhost:9000 --username multi-role --password password site list
# Table output
scadabridge --url http://localhost:9000 --username multi-role --password password \
--format table instance list --site-id 1
```
It is safer to supply credentials via environment variables than on the command line, where they appear in process listings and shell history:
```bash
export SCADABRIDGE_MANAGEMENT_URL=http://localhost:9000
export SCADABRIDGE_USERNAME=multi-role
export SCADABRIDGE_PASSWORD=password
scadabridge template create --name "PumpStation" --description "Standard pump station"
scadabridge template attribute add --template-id 3 --name Speed --data-type Float --default-value 0
scadabridge instance create --name "PS-01" --template-id 3 --site-id 1
scadabridge instance deploy --id 7
```
### Command groups
| Group | Subcommands | Role required |
|-------|-------------|---------------|
| `template` | `list`, `get`, `create`, `update`, `delete`, `validate`; `attribute add/update/delete`; `alarm add/update/delete`; `script add/update/delete`; `composition add/delete`; `native-alarm-source add/list/remove` | `Design` |
| `instance` | `list`, `get`, `create`, `set-bindings`, `set-overrides`, `alarm-override set/delete/list`, `native-alarm-source set/clear`, `set-area`, `diff`, `deploy`, `enable`, `disable`, `delete` | `Deployment` |
| `site` | `list`, `get`, `create`, `delete`, `deploy-artifacts`; `area list/create/update/delete` | `Admin` |
| `deploy` | `instance`, `artifacts`, `status` | `Deployment` |
| `data-connection` | `list`, `get`, `create`, `update`, `delete` | `Design` / `Deployment` |
| `external-system` | `list`, `get`, `create`, `update`, `delete` | `Design` |
| `notification` | `list`, `get`, `create`, `update`, `delete`; `smtp list/update` | `Design` / `Admin` |
| `security` | `api-key list/create/update/delete/set-methods`; `role-mapping list/create/update/delete`; `scope-rule list/add/delete` | `Admin` |
| `shared-script` | `list`, `get`, `create`, `update`, `delete` | `Design` |
| `db-connection` | `list`, `get`, `create`, `update`, `delete` | `Design` |
| `api-method` | `list`, `get`, `create`, `update`, `delete` | `Design` |
| `bundle` | `export`, `preview`, `import` | `Deployment` |
| `audit` | `query`, `export`, `verify-chain` | `OperationalAudit` / `AuditExport` |
| `audit-config` | `query` (config-change audit trail; was `audit-log` pre-M8) | `Admin` |
| `health` | `summary`, `site`, `event-log`, `parked-messages` | `Deployment` |
| `debug` | `snapshot`, `stream` | `Deployment` |
### Selected examples
```bash
# Query the operational Audit Log for failed API outbound events in the last 24 hours
scadabridge audit query --since 24h --channel ApiOutbound --errors-only --format table
# Export a full audit window to CSV
scadabridge audit export --since 2026-05-01T00:00:00Z --until 2026-06-01T00:00:00Z \
--format csv --output /tmp/audit-may-2026.csv
# Export a Transport bundle for selected templates (with transitive dependencies)
scadabridge bundle export --output /tmp/pump-station.scadabundle \
--templates "PumpStation,BaseModule" --include-dependencies
# Preview a bundle diff before importing
scadabridge bundle preview --input /tmp/pump-station.scadabundle
# Import with overwrite conflict policy
scadabridge bundle import --input /tmp/pump-station.scadabundle --on-conflict overwrite
# Stream live attribute and alarm changes for a running instance
scadabridge debug stream --id 7
# Query deployment records for a specific instance
scadabridge deploy status --instance-id 7 --page-size 20
```
### Exit codes
| Code | Meaning |
|------|---------|
| `0` | Success |
| `1` | Command error (connection failure, validation error, server error) |
| `2` | Authorization failure (insufficient role; HTTP 403 or `FORBIDDEN`/`UNAUTHORIZED` error code) |
## Configuration
`~/.scadabridge/config.json` is loaded on every invocation. A malformed or unreadable file emits a warning to stderr; it does not abort the invocation.
```json
{
"managementUrl": "http://localhost:9000",
"defaultFormat": "json"
}
```
| Key | Default | Description |
|-----|---------|-------------|
| `managementUrl` | — | Base URL for the Management API. Overridden by `SCADABRIDGE_MANAGEMENT_URL` env var, then by `--url`. |
| `defaultFormat` | `json` | Default output format when `--format` is not supplied. Overridden by `SCADABRIDGE_FORMAT` env var, then by `--format`. |
Credentials are intentionally never stored in the config file — they are sourced from environment variables or supplied per-invocation on the command line. Storing them in the file would persist them to disk in plaintext.
### Environment variables
| Variable | Overrides | Description |
|----------|-----------|-------------|
| `SCADABRIDGE_MANAGEMENT_URL` | `managementUrl` in config file | Management API base URL |
| `SCADABRIDGE_FORMAT` | `defaultFormat` in config file | Default output format |
| `SCADABRIDGE_USERNAME` | — | LDAP username; overridden by `--username` |
| `SCADABRIDGE_PASSWORD` | — | LDAP password; overridden by `--password`. Preferred over `--password` to avoid leaking credentials into process listings and shell history. |
### URL precedence
The management URL is resolved in this order: `--url` flag → `SCADABRIDGE_MANAGEMENT_URL` env var → `managementUrl` in config file. If none is set, the command exits with code `1` and a `NO_URL` error.
## Dependencies & Interactions
- [Management Service (#18)](./ManagementService.md) — the server-side counterpart. Every CLI command (except `audit query`/`audit export`) translates to a named management command dispatched through `POST /management` to the `ManagementActor`. Role enforcement and LDAP authentication are applied there. The `ManagementCommandRegistry` in Commons maps command types to their names; both sides must stay in sync.
- [Traefik Proxy (#20)](./TraefikProxy.md) — the recommended connection target. Pointing `--url` at the Traefik address ensures requests reach the active central node without per-command failover logic in the CLI. The `debug stream` command's WebSocket connection (SignalR `/hubs/debug-stream`) also traverses Traefik, which proxies the WebSocket upgrade natively.
- [Audit Log (#23)](./AuditLog.md) — the `audit` command group targets the `GET /api/audit/query` and `GET /api/audit/export` REST endpoints exposed by the Audit Log component, bypassing the management command envelope. The `audit-config` group (formerly `audit-log`) targets the configuration-change audit trail (`IAuditService`) via the standard management envelope.
- [Security & Auth (#10)](./Security.md) — the server resolves LDAP credentials and maps group memberships to ScadaBridge roles (`Admin`, `Design`, `Deployment`, `OperationalAudit`, `AuditExport`). The CLI does not interact with LDAP directly.
- [Commons (#16)](./Commons.md) — owns the management command record types and the `ManagementCommandRegistry` that maps each type to its wire name. The CLI project references Commons for these contracts.
- [Transport (#24)](./Transport.md) — the `bundle` command group drives the Transport feature: `bundle export` requests a base64-encoded bundle from the server and streams it to a local `.scadabundle` file; `bundle preview` uploads a file and returns the diff manifest; `bundle import` uploads a file and applies it with a configurable conflict policy.
- Design spec: [Component-CLI.md](../requirements/Component-CLI.md).
## Troubleshooting
### No management URL
```json
{"error":"No management URL specified. Use --url, set SCADABRIDGE_MANAGEMENT_URL, or add 'managementUrl' to ~/.scadabridge/config.json.","code":"NO_URL"}
```
The URL is not set via `--url`, `SCADABRIDGE_MANAGEMENT_URL`, or the config file. Set one of these before running any command.
### Connection failed
```json
{"error":"Connection failed: Connection refused (localhost:9000)","code":"CONNECTION_FAILED"}
```
The central node or Traefik is not reachable at the configured URL. Verify the cluster is running and the URL matches the Traefik port (default `9000` in Docker) or the node's direct port (`9001`/`9002`).
### Authorization failure (exit 2)
The server returned HTTP 403 or an error code of `FORBIDDEN`/`UNAUTHORIZED`. The authenticated user's LDAP groups do not map to a role with permission for the requested operation. Use `security role-mapping list` (requires `Admin`) to inspect role mappings. The `multi-role` test user (`password`) holds `Admin`, `Design`, and `Deployment` roles.
### Malformed config file warning
```text
warning: ignoring malformed or unreadable /home/user/.scadabridge/config.json: ...
```
`CliConfig.Load()` caught a `JsonException`, `IOException`, or `UnauthorizedAccessException` reading the config file. The invocation continues using environment variables and command-line options. Fix or recreate the config file.
### `audit-log` deprecation warning
```text
Warning: 'audit-log' is deprecated and will be removed in a future release. Use 'audit-config' instead.
```
The `audit-log` command group was renamed to `audit-config` in M8 of Audit Log (#23). The old name still works but emits this warning to stderr. Migrate any scripts from `scadabridge audit-log ...` to `scadabridge audit-config ...`.
### Bundle timeout
`bundle export`, `bundle preview`, and `bundle import` all use a 5-minute per-command timeout (compared to the 30-second default). If a bundle operation times out, the server-side operation may still be running. Retry with a smaller selection or check the central node logs.
## Related Documentation
- [CLI design specification](../requirements/Component-CLI.md)
- [Management Service](./ManagementService.md)
- [Traefik Proxy](./TraefikProxy.md)
- [Audit Log](./AuditLog.md)
- [Security](./Security.md)
- [Commons](./Commons.md)
+223
View File
@@ -0,0 +1,223 @@
# Central UI
The Central UI is a Blazor Server web application that hosts every management, configuration, deployment, monitoring, and audit workflow for the ScadaBridge system. It runs on the central cluster only; sites have no user interface.
## Overview
Central UI (#9) is built on ASP.NET Core + Blazor Server with Bootstrap CSS. All UI logic executes server-side; updates reach the browser through Blazor's built-in SignalR circuit. No third-party component frameworks are used — tables, grids, forms, and custom controls are implemented directly as Blazor + Bootstrap components.
The component code lives in `src/ZB.MOM.WW.ScadaBridge.CentralUI/`, split into:
- `Auth/` — cookie authentication state provider, login/logout/ping Minimal API endpoints, site-scope service.
- `Audit/` — CSV export Minimal API endpoint.
- `Components/Layout/` — `MainLayout`, `LoginLayout`, and `NavMenu` (the policy-gated rail navigation).
- `Components/Pages/` — pages, grouped by nav section: `Admin/`, `Audit/`, `Deployment/`, `Design/`, `Monitoring/`, `Notifications/`, `SiteCalls/`.
- `Components/Shared/` — reusable non-page components (`DataTable`, `MonacoEditor`, `ToastNotification`, `SessionExpiry`, dialog infrastructure, and others).
- `Components/Health/` — KPI tile components: `SiteCallKpiTiles` and `AuditKpiTiles`. Notification Outbox KPIs are rendered inline in `Health.razor`, not as a separate tile component.
- `Components/Audit/` — the `AuditFilterBar`, `AuditResultsGrid`, `AuditDrilldownDrawer`, and execution-tree components used by the Audit Log page.
- `ScriptAnalysis/` — Roslyn-backed script analysis service and Minimal API endpoint group (`/api/script-analysis`) used by the Monaco editor, exposing seven POST endpoints: `/diagnostics`, `/completions`, `/hover`, `/signature-help`, `/format`, `/inlay-hints`, and `/run`.
- `Services/` — scoped UI services: `AuditLogQueryService`, `AuditLogExportService`, `BrowseService`, and `BindingTester`.
The single DI entry point is `ServiceCollectionExtensions.AddCentralUI`, registered only by the central-role Host composition root. `EndpointExtensions.MapCentralUI<TApp>` wires the Minimal API endpoints and the Razor component routes onto the ASP.NET Core pipeline.
## Key Concepts
### Authentication and session model
Authentication is a standard LDAP bind via `ILdapAuthService` (from Security, #10). On success, `POST /auth/login` maps the user's LDAP groups to ScadaBridge roles via `IGroupRoleMapper<string>`, constructs a `ClaimsIdentity` carrying role claims and site-scope `SiteId` claims, and calls `context.SignInAsync` to write an HttpOnly/Secure ASP.NET Core cookie with `IsPersistent = true` and `SlidingExpiration = true`. No fixed `ExpiresUtc` is stamped — the idle timeout is owned by the cookie middleware configured in the Security component.
`POST /auth/token` offers the same LDAP flow and returns a JWT bearer token for the CLI.
Because Blazor Server's `HttpContext` is only valid during the initial HTTP request that establishes the SignalR circuit, `CookieAuthenticationStateProvider` snapshots the authenticated `ClaimsPrincipal` once at construction time and serves that snapshot for the entire circuit lifetime:
```csharp
public CookieAuthenticationStateProvider(IHttpContextAccessor httpContextAccessor)
{
var user = httpContextAccessor.HttpContext?.User
?? new ClaimsPrincipal(new ClaimsIdentity());
_circuitAuthState = Task.FromResult(new AuthenticationState(user));
}
public override Task<AuthenticationState> GetAuthenticationStateAsync()
=> _circuitAuthState;
```
Reading `IHttpContextAccessor.HttpContext` on each `GetAuthenticationStateAsync` call would return `null` (or a stale context) once the initial HTTP request has completed, causing `<AuthorizeView>` re-renders later in a long-lived circuit to see an unauthenticated principal.
### Session expiry detection
Because `CookieAuthenticationStateProvider` serves a frozen principal, the circuit can never observe a server-side cookie expiry by polling Blazor's auth state. `SessionExpiry` (a headless component rendered in `MainLayout`) polls `GET /auth/ping` via a JavaScript `fetch` call every minute. The ping endpoint returns `200` while the cookie is still valid and `401` once it lapses. A `401` redirects the browser to `/login` with a full-page navigation. The cookie middleware re-validates (and slides) the cookie on every ping, so the poll itself does not artificially extend a genuinely idle session beyond the configured timeout.
### Site-scoped authorization
`SiteScopeService` (scoped, registered in `AddCentralUI`) reads `SiteId` claims from the circuit principal. Deployment users whose LDAP mapping carries site-scope rules carry one integer `SiteId` claim per permitted site; Admin and Design users carry none (system-wide). Pages that enumerate sites or issue cross-site commands call `SiteScopeService.FilterSitesAsync` and `IsSiteAllowedAsync` to enforce the grant before any operation.
### Repository access pattern
CentralUI pages read and write the configuration database directly through `ICentralUiRepository` and other scoped EF Core repositories — there is no Akka round-trip for design-time CRUD. Operations that must reach site actors (deployment commands, debug stream subscriptions, OPC UA browse, KPI queries) go through `CommunicationService` (from Communication, #5).
## Architecture
### Layout and navigation
`MainLayout` wraps every authenticated page in `ThemeShell` (from the shared `ZB.MOM.WW.Theme` package), which provides the brand bar, responsive hamburger, and side-rail chassis. `NavMenu` fills the rail's `<Nav>` slot with policy-gated navigation groups:
| Nav group | Policy gate | Key pages |
|-----------|-------------|-----------|
| Dashboard | Authenticated | `/` |
| Admin | `RequireAdmin` | LDAP Mappings, Sites, API Keys, Import Bundle |
| Design | `RequireDesign` | Templates, Shared Scripts, Connections, External Systems, Export Bundle |
| Deployment | `RequireDeployment` | Topology, Deployments, Debug View |
| Notifications | Mixed (per item) | SMTP Configuration (Admin), Notification Lists (Design), Notification Report / KPIs (Deployment) |
| Site Calls | `RequireDeployment` | Site Calls |
| Monitoring | Health: all; Event Logs + Parked: Deployment | Health Dashboard, Event Logs, Parked Messages |
| Audit | `OperationalAudit` | Audit Log, Configuration Audit Log |
`LoginLayout` is a minimal layout used by `/login` — it renders only `@Body` with no nav sidebar, session-expiry watchdog, or dialog host. `SessionExpiry` is rendered exclusively in `MainLayout`; it self-guards against redirect loops by checking whether the current URL is already the login page (`IsOnLoginPage`) and skipping polling if so.
`DialogHost` — a single instance rendered in `MainLayout` — is the rendering target for all modal dialogs raised via the `IDialogService` / `DialogService` scoped service. Pages call `Dialog.ConfirmAsync` or `Dialog.PromptAsync` and await the result without managing modal state themselves.
### Real-time update channels
Three distinct mechanisms push live data to the browser:
**Deployment status (SignalR push):** The `Deployments` page subscribes to `IDeploymentStatusNotifier.StatusChanged` in `OnInitializedAsync`. `DeploymentManager` raises this event on every status write; the handler marshals to the Blazor renderer via `InvokeAsync` and calls `StateHasChanged`, pushing the re-render over the existing SignalR circuit. No polling timer is needed.
```razor
protected override async Task OnInitializedAsync()
{
await LoadDataAsync();
DeploymentStatusNotifier.StatusChanged += OnDeploymentStatusChanged;
}
```
**Debug view (gRPC streaming + SignalR relay):** `DebugStreamService` (from Communication, #5) creates a `DebugStreamBridgeActor` per session. The bridge actor opens a gRPC server-streaming subscription to the site's `SiteStreamGrpcServer` for the selected instance, then requests an initial `DebugViewSnapshot` via ClusterClient. Ongoing `AttributeValueChanged` and `AlarmStateChanged` events flow via the gRPC stream to the bridge actor, which delivers them to the `DebugView` page via callbacks that call `InvokeAsync(StateHasChanged)`. A pulsing "Live" badge in the status strip indicates active streaming. Streams are subscribe-on-demand and stop when the page closes.
**Health dashboard and KPI tiles (10-second polling timer):** The `Health` page initializes a `System.Threading.Timer` at 10-second intervals. Each tick calls `ICentralHealthAggregator.GetAllSiteStates()` for in-memory site state and issues three async KPI queries:
```csharp
private async Task RefreshNow()
{
_siteStates = HealthAggregator.GetAllSiteStates();
await LoadOutboxKpis();
await LoadSiteCallKpis();
await LoadAuditKpis();
}
```
KPI queries go through `CommunicationService` (Notification Outbox and Site Calls KPIs via actor Ask) and `IAuditLogQueryService` (Audit KPIs via `IAuditLogRepository`). A transient fault on any one KPI group degrades only that group's tiles to em dashes — the rest of the dashboard continues to render.
### Script analysis endpoint
`ScriptAnalysisService` compiles user script fragments as Roslyn C# Scripting globals against `SandboxScriptHost` (template/shared scripts) or `InboundScriptHost` (Inbound API methods). It surfaces diagnostics and completions in the shape Monaco's provider APIs expect. Diagnostics are cached by code hash via `IMemoryCache` (200-entry limit) to short-circuit repeated requests for the same content. `ScriptAnalysisEndpoints` registers an endpoint group under `/api/script-analysis` with seven POST endpoints (`/diagnostics`, `/completions`, `/hover`, `/signature-help`, `/format`, `/inlay-hints`, `/run`); each is called by the Monaco JS providers in `monaco-init.js` on a 500 ms debounce (diagnostics) or on editor events (hover, completions, etc.).
The sandbox enforces the script trust model: a `SemanticModel` check raises a diagnostic for any use of forbidden APIs (`System.IO`, `Process`, `Thread`, reflection, raw networking).
### Audit Log query service
`AuditLogQueryService` (registered with an explicit factory to resolve the `IServiceScopeFactory` constructor) opens a fresh DI scope — and therefore a fresh `ScadaBridgeDbContext` — per operation. This prevents the page's query-string auto-load from racing the filter bar's site enumeration on the shared circuit-scoped context:
```csharp
await using var scope = _scopeFactory!.CreateAsyncScope();
var repository = scope.ServiceProvider.GetRequiredService<IAuditLogRepository>();
var result = await repository.QueryAsync(filter, effective, ct);
return result.Select(AuditEventView.From).ToList();
```
`GetKpiSnapshotAsync` aggregates audit volume and error rate over a trailing one-hour window from the repository, then sums `SiteAuditBacklog.PendingCount` across all site states from `ICentralHealthAggregator` to fill the backlog tile.
`GetDistinctSourceNodesAsync` caches the result for 60 seconds behind an in-memory lock to keep filter bar rendering cheap during failover events when node membership changes.
### CSV export endpoint
`GET /api/centralui/audit/export` is gated on the `AuditExport` policy (a superset of `OperationalAudit`). The endpoint streams directly to `Response.Body` via `IAuditLogExportService.ExportAsync` — no buffering through the Blazor circuit. The default row cap is 100,000; a `maxRows=` query-string override is accepted. The `Content-Disposition: attachment` header and `Cache-Control: no-store` are stamped before the first body write.
## Usage
### Registration (Host)
The central-role Host calls `AddCentralUI` during composition to register all Blazor and UI services, then `MapCentralUI<App>` on the endpoint builder to wire routes:
```csharp
// In Host's central-role composition root:
services.AddCentralUI();
// In the endpoint pipeline:
app.MapCentralUI<App>();
```
`MapCentralUI` registers `MapAuthEndpoints`, `MapScriptAnalysisEndpoints`, `MapAuditExportEndpoints`, and `MapRazorComponents<TApp>` with interactive server render mode.
### Accessing the UI
The UI is fronted by the Traefik reverse proxy (component #20), which routes to the active central node. In the Docker topology the management port is `9000` (Traefik), with direct access at `9001` (central-a) and `9002` (central-b).
On central failover the SignalR circuit is interrupted. Blazor's built-in reconnection logic re-establishes the circuit on the new active node. Because authentication state is in the cookie (not server memory), the user's session survives failover without re-login — provided the new node shares the same ASP.NET Data Protection keys (stored in the configuration database).
Active debug view streams and in-progress deployment status subscriptions are lost on failover and must be re-opened by the user.
### Audit Log page deep links
Any page that wants to pre-filter the Audit Log passes query-string parameters to `/audit/log`:
| Parameter | Example | Effect |
|-----------|---------|--------|
| `correlationId` | `?correlationId=<guid>` | Pins to a single operation (notification, cached call, inbound request) |
| `executionId` | `?executionId=<guid>` | Shows all rows for one script execution |
| `channel` | `?channel=ApiOutbound` | Pre-selects channel filter |
| `actor` | `?actor=my-key` | Pre-fills actor search |
The same keys are accepted by `GET /api/centralui/audit/export` for filtered CSV export.
## Dependencies & Interactions
- [Security (#10)](./Security.md) — `ILdapAuthService` for LDAP bind at login; `IGroupRoleMapper<string>` for LDAP-group → role mapping; `JwtTokenService` for bearer-token generation (`/auth/token`) and claim-type constants; `AuthorizationPolicies` for every `[Authorize(Policy = …)]` attribute. The Security component owns the cookie middleware configuration (sliding window, idle timeout).
- [Configuration Database (#17)](./ConfigurationDatabase.md) — all central CRUD (templates, instances, sites, external systems, notifications, audit log, etc.) via scoped EF Core repositories (`ICentralUiRepository`, `IAuditLogRepository`, `ISiteRepository`, `ITemplateEngineRepository`, and others). The Data Protection keys that make sessions portable across failover are also stored here.
- [Communication (#5)](./Communication.md) — `CommunicationService` for cross-site commands (deploy, disable, enable, delete, browse OPC UA nodes, read tag values); `DebugStreamService` for gRPC-backed debug stream sessions; KPI request/response messages for Notification Outbox and Site Calls KPIs.
- [Deployment Manager (#2)](./DeploymentManager.md) — `IDeploymentStatusNotifier` for real-time deployment status push; deployment command routing.
- [Template Engine (#1)](./TemplateEngine.md) — `TemplateService`, `TemplateFolderService`, `ValidationService` for template authoring, on-demand validation, and diff calculation.
- [Health Monitoring (#11)](./HealthMonitoring.md) — `ICentralHealthAggregator` for in-memory site health state on the Health Dashboard and audit backlog KPI tile.
- [Audit Log (#23)](./AuditLog.md) — `IAuditLogRepository` (via `AuditLogQueryService`) for the Audit Log page query/drilldown/export; `IAuditLogQueryService.GetKpiSnapshotAsync` for the three Audit KPI tiles on the Health Dashboard.
- [Notification Outbox (#21)](./NotificationOutbox.md) — Notification Report page queries and Retry/Discard actions; Notification KPI tiles on the Health Dashboard.
- [Site Call Audit (#22)](./SiteCallAudit.md) — Site Calls page queries and Retry/Discard relay; Site Call KPI tiles on the Health Dashboard.
- [Transport (#24)](./Transport.md) — `IBundleExporter` and `IBundleImporter` for the Export Bundle and Import Bundle multi-step wizards.
- [TreeView](./TreeView.md) — the template folder tree sidebar on `/design/templates` uses the shared `TreeView` component; see that document for the reusable tree implementation.
- Design spec: [Component-CentralUI.md](../requirements/Component-CentralUI.md).
## Troubleshooting
### Circuit reconnects but user sees "Not Authorized"
This indicates the shared ASP.NET Data Protection keys are not available to the node that the circuit reconnected to. The keys are stored in the configuration database; verify the central nodes share the same key ring and that the configuration database is reachable on both nodes.
### Health Dashboard KPI tiles show "—"
Each KPI group degrades independently. A "—" on the Notification Outbox tiles means `CommunicationService.GetNotificationKpisAsync` threw or the actor returned `Success = false`; the inline error message below the tiles carries the detail. Audit KPI tiles degrade if `IAuditLogQueryService.GetKpiSnapshotAsync` fails (it reads through `IAuditLogRepository`) — check the central SQL connection.
### Debug view connects but shows no events
The debug stream bridge actor opens a gRPC subscription to the site's `SiteStreamGrpcServer`. If the gRPC port is unreachable (e.g., `GrpcNodeAAddress` or `GrpcNodeBAddress` misconfigured on the site entity), the bridge actor logs the failure and the stream never starts. Check the site's gRPC address configuration on the Sites admin page and verify port 8083 is accessible from the central nodes.
### Session expires unexpectedly
The idle timeout is a sliding window — any authenticated HTTP request (including the `/auth/ping` from `SessionExpiry`) slides the expiry. If sessions lapse before the configured idle timeout, confirm that the Blazor circuit is producing at least one HTTP request per timeout window (the `SessionExpiry` poll interval is one minute). If the cookie middleware's `ExpireTimeSpan` is misconfigured, check the Security component's `AddCookie` call.
### Script editor shows no diagnostics
`ScriptAnalysisService` runs Roslyn in-process and caches diagnostics by code hash in `IMemoryCache` (200-entry size limit). If cache entries are evicted under memory pressure, the next keystroke simply triggers a fresh analysis. If diagnostics never appear, check that `ScriptAnalysisEndpoints` is registered (it is mapped via `MapCentralUI`), and that the Monaco JS module's network requests to `/api/script-analysis/diagnostics` (and related endpoints) are completing successfully.
## Related Documentation
- [Central UI design specification](../requirements/Component-CentralUI.md)
- [Security](./Security.md)
- [Configuration Database](./ConfigurationDatabase.md)
- [Communication](./Communication.md)
- [Deployment Manager](./DeploymentManager.md)
- [Template Engine](./TemplateEngine.md)
- [Health Monitoring](./HealthMonitoring.md)
- [Audit Log](./AuditLog.md)
- [Notification Outbox](./NotificationOutbox.md)
- [Site Call Audit](./SiteCallAudit.md)
- [Transport](./Transport.md)
- [TreeView](./TreeView.md)
+295
View File
@@ -0,0 +1,295 @@
# Cluster Infrastructure
The Cluster Infrastructure component manages Akka.NET cluster formation, active/standby failover, split-brain resolution, and the singleton hosting that all other ScadaBridge components depend on. Every site and central cluster is a two-node active/standby pair governed by the same configuration contract and bootstrap logic.
## Overview
Cluster Infrastructure (#13) is a **design responsibility** spanning two projects rather than a single buildable project:
- **`src/ZB.MOM.WW.ScadaBridge.ClusterInfrastructure/`** owns the cluster configuration contract: `ClusterOptions` (seed nodes, failure-detection timings, split-brain settings), `ClusterOptionsValidator`, and the `AddClusterInfrastructure` DI extension that registers the validator. It does not start an actor system.
- **`src/ZB.MOM.WW.ScadaBridge.Host/`** owns the cluster bootstrap and runtime wiring: `AkkaHostedService` builds the Akka HOCON from `ClusterOptions` and `NodeOptions`, starts the `ActorSystem`, wires `CoordinatedShutdown`, and creates all role-specific actors including the cluster singletons.
This split is deliberate. The Host is the single deployable binary and the only project that performs Akka.NET bootstrap, so all cluster bring-up lives there. `ClusterInfrastructure` is the portable configuration contract that the Host consumes — it can be referenced by tests and other components without pulling in the Host.
Both central and site clusters run this same topology: two nodes, one active (cluster leader), one standby, with automatic failover and no manual intervention required for dual-node recovery.
## Key Concepts
### Active/standby via cluster leadership
Akka.NET cluster leadership determines which node is "active". The cluster leader is the oldest node in the cluster, as tracked by the keep-oldest split-brain resolver. `ActiveNodeGate` (in the Host) exposes `IsActiveNode` by checking whether `cluster.SelfMember.Status == MemberStatus.Up` and `cluster.State.Leader == cluster.SelfAddress`. Cluster singletons — which run on the oldest `Up` member — automatically migrate to the surviving node on failover.
### Configuration contract vs. bootstrap split
`ClusterOptions` holds the cluster-wide formation and failure-detection settings. Node-identity settings — remoting hostname/port, role (`Central` or `Site`), site identifier, gRPC port — live in `NodeOptions` (`ScadaBridge:Node` section), owned by the Host. This split prevents the configuration contract from acquiring a hard dependency on Host-specific concerns.
### Singleton hosting
Cluster Infrastructure provides the hosting platform; each singleton is owned and created by the component responsible for it. The Host's `RegisterCentralActors` and `RegisterSiteActorsAsync` methods wire every singleton via `ClusterSingletonManager` and a companion `ClusterSingletonProxy` so other actors can address it through a stable path regardless of which node currently hosts it.
## Architecture
### HOCON assembly
`AkkaHostedService.BuildHocon` constructs the Akka HOCON document from the bound options at startup. All interpolated values pass through `QuoteHocon` (string escaping) and `DurationHocon` (millisecond rendering) so the document is never corrupted by hostnames or timing values containing special characters or sub-second precision.
The snippet below is abbreviated to highlight the cluster stanzas. The full method also emits three additional stanzas: `akka.extensions` (registers `DistributedPubSubExtensionProvider`), `akka.remote.dot-netty.tcp` (binds `NodeOptions.NodeHostname` and `NodeOptions.RemotingPort`), and `akka.remote.transport-failure-detector` (heartbeat interval and acceptable-heartbeat-pause from `CommunicationOptions.TransportHeartbeatInterval` / `TransportFailureThreshold`).
```csharp
// Abbreviated — see AkkaHostedService.BuildHocon for the full method.
public static string BuildHocon(
NodeOptions nodeOptions,
ClusterOptions clusterOptions,
IEnumerable<string> roles,
TimeSpan transportHeartbeat,
TimeSpan transportFailure)
{
var seedNodesStr = string.Join(",",
clusterOptions.SeedNodes.Select(QuoteHocon));
var rolesStr = string.Join(",", roles.Select(QuoteHocon));
return $@"
audit-telemetry-dispatcher {{
type = ForkJoinDispatcher
throughput = 100
dedicated-thread-pool {{
thread-count = 2
}}
}}
akka {{
// akka.extensions, akka.remote.dot-netty.tcp, and
// akka.remote.transport-failure-detector also emitted here (see full method).
actor {{
provider = cluster
}}
cluster {{
seed-nodes = [{seedNodesStr}]
roles = [{rolesStr}]
min-nr-of-members = {clusterOptions.MinNrOfMembers}
split-brain-resolver {{
active-strategy = {QuoteHocon(clusterOptions.SplitBrainResolverStrategy)}
stable-after = {DurationHocon(clusterOptions.StableAfter)}
keep-oldest {{
down-if-alone = {(clusterOptions.DownIfAlone ? "on" : "off")}
}}
}}
failure-detector {{
heartbeat-interval = {DurationHocon(clusterOptions.HeartbeatInterval)}
acceptable-heartbeat-pause = {DurationHocon(clusterOptions.FailureDetectionThreshold)}
}}
run-coordinated-shutdown-when-down = on
}}
coordinated-shutdown {{
run-by-clr-shutdown-hook = on
}}
}}";
}
```
The HOCON also defines the `audit-telemetry-dispatcher` (a two-thread `ForkJoinDispatcher`) so `SiteAuditTelemetryActor`'s SQLite reads and gRPC pushes never contend with the default dispatcher used by hot-path actors.
### Split-brain resolution
The keep-oldest strategy is the only strategy `ClusterOptionsValidator` permits for ScadaBridge's two-node clusters. Quorum strategies (`keep-majority`, `static-quorum`) cannot distinguish a crash from a partition with two nodes — both sides would be below quorum and both would shut down. Keep-oldest with `down-if-alone = on` ensures at most one node runs the cluster at any time:
- On a network partition, the older node stays active; the younger node downs itself.
- If the oldest node finds itself alone (no reachable members), it downs itself rather than running in isolation. Without `down-if-alone`, the oldest node could run as a single-node cluster while the younger node forms its own — producing two live clusters with divergent singleton state.
### Failure detection and failover timeline
Detection uses two independent Akka heartbeat channels:
- **Cluster failure detector** (`akka.cluster.failure-detector`): monitors membership, triggers `Unreachable` events that the split-brain resolver acts on.
- **Transport failure detector** (`akka.remote.transport-failure-detector`): monitors the underlying TCP transport between nodes; configured separately from `CommunicationOptions.TransportHeartbeatInterval` / `TransportFailureThreshold`.
With the defaults in `ClusterOptions`, the total failover budget is approximately 25 seconds:
| Phase | Duration | Source |
|-------|----------|--------|
| Failure detection (`acceptable-heartbeat-pause`) | 10 s | `ClusterOptions.FailureDetectionThreshold` |
| Split-brain stable-after | 15 s | `ClusterOptions.StableAfter` |
| Singleton restart | < 1 s | Actor `PreStart` |
### Graceful shutdown and singleton handover
When a node is stopped cleanly, `CoordinatedShutdown` runs before the CLR exits (`run-by-clr-shutdown-hook = on`). The cluster-leave phase signals Akka to migrate singletons before the actor system terminates, so handover happens in seconds rather than waiting for the full failure-detection timeout. `SiteCallAuditActor` has an explicit graceful-stop task registered on `PhaseClusterLeave` with a 10-second timeout to drain any in-flight EF Core upsert before handover begins:
```csharp
siteCallAuditShutdown.AddTask(
Akka.Actor.CoordinatedShutdown.PhaseClusterLeave,
"drain-site-call-audit-singleton",
async () =>
{
try
{
await siteCallAuditSingletonManager.GracefulStop(TimeSpan.FromSeconds(10));
}
catch (Exception ex)
{
_logger.LogWarning(ex,
"SiteCallAudit singleton did not drain within the graceful-stop "
+ "timeout; falling through to PoisonPill handover");
}
return Akka.Done.Instance;
});
```
### Cluster roles and singleton scoping
Each node carries one or more cluster roles set in the HOCON `roles` list. Site nodes carry both a base `"Site"` role and a site-specific role (`"site-{SiteId}"`, e.g. `"site-site-a"`). Singletons on site clusters are scoped to the site-specific role so each site's singleton runs on exactly one node of that site's cluster, not on any other site's nodes. Central singletons use no role scope — all central nodes share the `"Central"` role.
### Dual-node recovery
Because both nodes are configured as seed nodes, whichever node starts first after a simultaneous failure forms a new cluster; the second joins when it comes up. No startup ordering dependency exists, and no manual intervention is required. The keep-oldest resolver handles the "both starting fresh" case naturally — there is no pre-existing cluster to conflict with.
### Cluster singletons hosted
The Host wires the following singletons. Cluster Infrastructure provides the `ClusterSingletonManager` / `ClusterSingletonProxy` pattern; each singleton's behaviour is documented in the owning component.
**Central singletons (active central node, no role scope):**
| Singleton name | Actor class | Owner |
|----------------|-------------|-------|
| `notification-outbox` | `NotificationOutboxActor` | Notification Outbox (#21) |
| `audit-log-ingest` | `AuditLogIngestActor` | Audit Log (#23) |
| `site-call-audit` | `SiteCallAuditActor` | Site Call Audit (#22) |
**Site singletons (active site node, scoped to `"site-{SiteId}"` role):**
| Singleton name | Actor class | Owner |
|----------------|-------------|-------|
| `deployment-manager` | `DeploymentManagerActor` | Site Runtime (#3) |
| `event-log-handler` | `EventLogHandlerActor` | Site Event Logging (#12) |
`SiteAuditTelemetryActor` (Audit Log #23) is **not** a singleton — it runs on every site node and reads node-local SQLite. It is created directly with `ActorOf` and bound to the `audit-telemetry-dispatcher`.
## Usage
### Registering the configuration contract
Every host calls `AddClusterInfrastructure` to register `ClusterOptionsValidator`:
```csharp
services.AddClusterInfrastructure();
```
This registers `ClusterOptionsValidator` as an `IValidateOptions<ClusterOptions>` singleton. Because the Host binds `ClusterOptions` with `ValidateOnStart`, a misconfigured `ScadaBridge:Cluster` section (wrong strategy, `MinNrOfMembers != 1`, `DownIfAlone = false`, fewer than two seed nodes) throws an `OptionsValidationException` at startup rather than booting into a broken cluster.
### Checking active-node status
Components that must run only on the active node resolve `IActiveNodeGate` (registered by the Host's Central composition root):
```csharp
public bool IsActiveNode
{
get
{
var system = _akkaService.ActorSystem;
if (system == null) return false;
var cluster = Cluster.Get(system);
var self = cluster.SelfMember;
if (self.Status != MemberStatus.Up) return false;
var leader = cluster.State.Leader;
return leader != null && leader == self.Address;
}
}
```
This returns `false` while the actor system is warming up — the safe-by-default answer matching the standby case. The Inbound API uses this gate to return HTTP 503 on standby nodes.
## Configuration
`ClusterOptions` is bound from `ScadaBridge:Cluster`. `NodeOptions` is bound from `ScadaBridge:Node`.
### `ScadaBridge:Cluster`
| Key | Type | Default | Description |
|-----|------|---------|-------------|
| `SeedNodes` | `List<string>` | (required) | Akka seed-node URIs. Must contain at least 2 entries; both nodes list both themselves and their partner. |
| `SplitBrainResolverStrategy` | `string` | `"keep-oldest"` | Must be `"keep-oldest"`. Quorum strategies are rejected by `ClusterOptionsValidator`. |
| `StableAfter` | `TimeSpan` | `00:00:15` | Cluster must be stable for this duration before the resolver acts to down unreachable nodes. |
| `HeartbeatInterval` | `TimeSpan` | `00:00:02` | Cluster failure-detector heartbeat frequency. Must be less than `FailureDetectionThreshold`. |
| `FailureDetectionThreshold` | `TimeSpan` | `00:00:10` | `acceptable-heartbeat-pause` for the cluster failure detector. |
| `MinNrOfMembers` | `int` | `1` | Must be `1`. A value of `2` blocks the cluster singleton after failover. |
| `DownIfAlone` | `bool` | `true` | Must be `true`. See split-brain resolution above. |
### `ScadaBridge:Node`
| Key | Type | Default | Description |
|-----|------|---------|-------------|
| `Role` | `string` | (required) | `"Central"` or `"Site"`. |
| `NodeHostname` | `string` | (required) | Hostname this node advertises to the Akka cluster remoting layer. |
| `NodeName` | `string` | `""` | Semantic label stamped on audit rows (`SourceNode`). Conventional values: `node-a`/`node-b` for sites, `central-a`/`central-b` for central. |
| `SiteId` | `string?` | (required for Site) | Site identifier; appended to the site-specific cluster role (`site-{SiteId}`). |
| `RemotingPort` | `int` | `8081` | Akka.NET TCP remoting port. Code default is `8081`; the site deployment overrides this to `8082` via `appsettings.Site.json`. |
| `GrpcPort` | `int` | `8083` | Kestrel HTTP/2 port for `SiteStreamGrpcServer` (site nodes only). Must differ from `RemotingPort`. |
| `MetricsPort` | `int` | `8084` | Kestrel HTTP/1.1 port for the Prometheus `/metrics` scrape endpoint (site nodes only). Must differ from `RemotingPort` and `GrpcPort`. |
### Representative docker configuration (central node A)
```json
{
"ScadaBridge": {
"Node": {
"Role": "Central",
"NodeName": "central-a",
"NodeHostname": "scadabridge-central-a",
"RemotingPort": 8081
},
"Cluster": {
"SeedNodes": [
"akka.tcp://scadabridge@scadabridge-central-a:8081",
"akka.tcp://scadabridge@scadabridge-central-b:8081"
],
"SplitBrainResolverStrategy": "keep-oldest",
"StableAfter": "00:00:15",
"HeartbeatInterval": "00:00:02",
"FailureDetectionThreshold": "00:00:10",
"MinNrOfMembers": 1
}
}
}
```
`DownIfAlone` is not present in the docker files because its default value of `true` is correct and `ClusterOptionsValidator` rejects `false`.
## Dependencies & Interactions
- [Host (#15)](./Host.md) — owns the Akka.NET bootstrap. `AkkaHostedService` consumes `ClusterOptions` and `NodeOptions`, assembles the HOCON, starts the `ActorSystem`, creates all role-specific actors, and wires `CoordinatedShutdown`. The `ClusterInfrastructure` project has no compile-time dependency on the Host; the dependency runs the other way — the Host references `ClusterInfrastructure` and consumes its configuration contract.
- [Commons (#16)](./Commons.md) — provides `INodeIdentityProvider` (implemented by `NodeIdentityProvider` in the Host), which supplies the `NodeName` label that audit writers stamp on the `SourceNode` column. Also provides `IClusterNodeProvider` (implemented by `AkkaClusterNodeProvider` in the Host), which the Health Monitoring component uses to report per-node up/down status.
- [Health Monitoring (#11)](./HealthMonitoring.md) — uses `IClusterNodeProvider` to list cluster members and determine whether the local node is primary; uses `IActiveNodeGate` (central only) to gate active-node-only health paths. The active/standby distinction reported to central health originates here.
- [Site Runtime (#3)](./SiteRuntime.md) — the Deployment Manager singleton is the most operationally critical singleton this infrastructure hosts. It re-creates the full Instance Actor hierarchy from local SQLite on failover. Staggered Instance Actor startup after failover is Site Runtime's responsibility; this component provides the singleton placement guarantee.
- [Notification Outbox (#21)](./NotificationOutbox.md), [Site Call Audit (#22)](./SiteCallAudit.md), [Audit Log (#23)](./AuditLog.md) — each hosts one or more central singletons wired by `RegisterCentralActors`. Cluster Infrastructure provides the `ClusterSingletonManager`/`ClusterSingletonProxy` boilerplate and the graceful-shutdown hooks; the business logic lives in the owning component.
- [CentralSite Communication (#5)](./Communication.md) — `CentralCommunicationActor` and `SiteCommunicationActor` are created and registered with `ClusterClientReceptionist` inside the same `AkkaHostedService` startup, making them addressable by remote `ClusterClient` instances. The transport-level heartbeat (`TransportHeartbeatInterval`, `TransportFailureThreshold`) is configured separately from the cluster failure-detector and comes from `CommunicationOptions`.
- [Inbound API (#14)](./InboundAPI.md) — resolves `IActiveNodeGate` to return HTTP 503 on standby central nodes. The gate returns `false` until the actor system has started, this node's cluster member status is `Up`, and this node is the cluster leader.
- Design spec: [Component-ClusterInfrastructure.md](../requirements/Component-ClusterInfrastructure.md).
## Troubleshooting
### Node fails to join cluster on startup
`ClusterOptionsValidator` rejects fewer than two seed nodes, a non-`keep-oldest` strategy, `MinNrOfMembers != 1`, or `DownIfAlone = false` at startup with an `OptionsValidationException`. Check that both seed-node URIs reference the Akka remoting port, not the gRPC port (8083) or metrics port (8084) — on site nodes, `StartupValidator` explicitly rejects seed entries whose port matches `GrpcPort`.
### Singleton not starting after failover
If the surviving node is `Up` but singletons do not start, `MinNrOfMembers` is the first thing to check. A value of `2` keeps the surviving node waiting for a second member indefinitely. The validator enforces `1`, but a manually patched `appsettings.json` that bypasses the validator could produce this.
### Two live clusters (split-brain)
If `DownIfAlone = false` were accepted (the validator rejects it), the oldest node could run alone while the younger forms its own cluster, producing two live clusters with divergent singleton state and dual MS SQL writers on central. `ClusterOptionsValidator` makes this configuration impossible to boot.
### Graceful shutdown takes longer than expected
If a clean node stop takes up to 25 seconds instead of a few seconds, `CoordinatedShutdown` may not be running — check that `run-by-clr-shutdown-hook = on` is present in the assembled HOCON (it is emitted unconditionally by `BuildHocon`) and that the Windows Service stop signal actually reaches the process rather than the process being killed outright. A `SIGKILL` / `TerminateProcess` bypasses `CoordinatedShutdown` entirely; the surviving node then has to wait the full failure-detection window.
## Related Documentation
- [Cluster Infrastructure design specification](../requirements/Component-ClusterInfrastructure.md)
- [Host](./Host.md)
- [Site Runtime](./SiteRuntime.md)
- [Health Monitoring](./HealthMonitoring.md)
- [CentralSite Communication](./Communication.md)
- [Notification Outbox](./NotificationOutbox.md)
- [Site Call Audit](./SiteCallAudit.md)
- [Audit Log](./AuditLog.md)
- [Commons](./Commons.md)
+328
View File
@@ -0,0 +1,328 @@
# Commons
Commons is the foundational shared library that all other ScadaBridge components depend on — it defines the POCO entity classes, repository interfaces, service interfaces, message contracts, shared enums, and utility types that the system builds on top of.
## Overview
Commons (#16) is not a runtime component. It has no actors, no hosted services, and no DI registrations of its own. Its single role is to hold the shared type vocabulary — entity shapes, interface contracts, and message definitions — so that every component agrees on the same types without depending on each other.
The project enforces minimal dependencies by design: it references the `ZB.MOM.WW.Audit` package (for the canonical `AuditEvent` type) and the core .NET SDK. It must not reference Akka.NET, ASP.NET Core, Entity Framework Core, or any persistence or framework library, because it is referenced by all other projects and a framework dependency here becomes a transitive constraint on everything.
Source lives in `src/ZB.MOM.WW.ScadaBridge.Commons/`, organized into seven top-level namespaces: `Types/`, `Interfaces/`, `Entities/`, `Messages/`, `Observability/`, `Serialization/`, and `Validators/`.
## Key Concepts
### Persistence-ignorant entity classes
All configuration database entity classes live in `Entities/` as plain C# classes with no EF attributes, no EF base classes, and no persistence-framework annotations. Navigation properties (for example `Template.Attributes`) are plain `ICollection<T>` — EF Fluent API configuration is the Configuration Database component's job, not Commons'. The entities may include constructors that enforce required fields:
```csharp
// Entities/Templates/Template.cs
public class Template
{
public int Id { get; set; }
public string Name { get; set; }
public string? Description { get; set; }
public int? ParentTemplateId { get; set; }
public int? FolderId { get; set; }
public ICollection<TemplateAttribute> Attributes { get; set; } = new List<TemplateAttribute>();
public ICollection<TemplateAlarm> Alarms { get; set; } = new List<TemplateAlarm>();
public ICollection<TemplateScript> Scripts { get; set; } = new List<TemplateScript>();
public ICollection<TemplateComposition> Compositions { get; set; } = new List<TemplateComposition>();
public ICollection<TemplateNativeAlarmSource> NativeAlarmSources { get; set; } = new List<TemplateNativeAlarmSource>();
public bool IsDerived { get; set; }
public int? OwnerCompositionId { get; set; }
public Template(string name)
{
Name = name ?? throw new ArgumentNullException(nameof(name));
}
}
```
### Repository interfaces
Commons defines one repository interface per consuming component. Implementations live entirely in the Configuration Database component. Each interface accepts and returns the POCO entity classes from Commons. Most repository interfaces expose `SaveChangesAsync()` to support the unit-of-work pattern without requiring a dependency on EF Core; the append-only audit repositories (`IAuditLogRepository`, `ISiteCallAuditRepository`) do not — they use upsert/insert-only operations that do not require an explicit save step.
### Message contracts and additive-only evolution
Messages in `Messages/` are `record` types or immutable classes. Because sites and central may temporarily run different software versions, the rule is additive-only: new fields may be added with defaults; existing fields must not be removed or have their types changed. Contracts that cross the site→central gRPC boundary — `CachedCallTelemetry`, `AuditTelemetryEnvelope`, `NotificationSubmit`, and the pull reconciliation messages — are the most version-sensitive and have this rule explicitly called out in their XML docs.
### Pure-helper carve-out
Commons may contain stateless, side-effect-free helper types that transform or validate the data types it already defines. Anything that would require I/O, shared mutable state across calls beyond a self-contained instance, or knowledge of another component is excluded. Current examples: `Result<T>`, `ScriptParameters`, `ValueFormatter`, `DynamicJsonElement`, `StaleTagMonitor`, `OpcUaEndpointConfigSerializer`, and `OpcUaEndpointConfigValidator`.
## Architecture
### Namespace and folder structure
```text
ZB.MOM.WW.ScadaBridge.Commons/
├── Types/ # Enums/, Alarms/, Audit/, DataConnections/,
│ # Flattening/, InboundApi/, Notifications/,
│ # Transport/, Scripts/ + top-level utility types
├── Interfaces/ # Protocol/, Repositories/, Services/, Transport/,
│ # Security/
├── Entities/ # Templates/, Instances/, Sites/, ExternalSystems/,
│ # Notifications/, InboundApi/, Security/,
│ # Deployment/, Scripts/, Audit/
├── Messages/ # Deployment/, Lifecycle/, Health/, Communication/,
│ # Streaming/, DebugView/, ScriptExecution/,
│ # Artifacts/, DataConnection/, Instance/,
│ # Integration/, Notification/, InboundApi/,
│ # RemoteQuery/, Audit/, Management/
├── Observability/ # ScadaBridgeTelemetry (meter + instrument definitions)
├── Serialization/ # OpcUaEndpointConfigSerializer, MxGatewayEndpointConfigSerializer
└── Validators/ # OpcUaEndpointConfigValidator, MxGatewayEndpointConfigValidator
```
Namespaces mirror folders: `ZB.MOM.WW.ScadaBridge.Commons.Entities.Templates`, `ZB.MOM.WW.ScadaBridge.Commons.Interfaces.Repositories`, and so on.
### Entity classes by domain area
| Folder | Classes |
|---|---|
| `Entities/Templates/` | `Template`, `TemplateAttribute`, `TemplateAlarm`, `TemplateNativeAlarmSource`, `TemplateScript`, `TemplateComposition`, `TemplateFolder` |
| `Entities/Instances/` | `Instance`, `InstanceAttributeOverride`, `InstanceConnectionBinding`, `InstanceAlarmOverride`, `InstanceNativeAlarmSourceOverride`, `Area` |
| `Entities/Sites/` | `Site`, `DataConnection` |
| `Entities/ExternalSystems/` | `ExternalSystemDefinition`, `ExternalSystemMethod`, `DatabaseConnectionDefinition` |
| `Entities/Notifications/` | `NotificationList`, `NotificationRecipient`, `SmtpConfiguration`, `Notification` |
| `Entities/InboundApi/` | `ApiMethod` |
| `Entities/Security/` | `LdapGroupMapping`, `SiteScopeRule` |
| `Entities/Deployment/` | `DeploymentRecord`, `SystemArtifactDeploymentRecord`, `DeployedConfigSnapshot` |
| `Entities/Scripts/` | `SharedScript` |
| `Entities/Audit/` | `AuditLogEntry` (config-change audit), `SiteCall` (SiteCalls operational mirror) |
The `Instance` entity illustrates the typical POCO shape — required fields enforced by a constructor, navigation collections as plain `ICollection<T>`, and no persistence annotations:
```csharp
// Entities/Instances/Instance.cs
public class Instance
{
public int Id { get; set; }
public int TemplateId { get; set; }
public int SiteId { get; set; }
public int? AreaId { get; set; }
public string UniqueName { get; set; }
public InstanceState State { get; set; }
public ICollection<InstanceAttributeOverride> AttributeOverrides { get; set; } = new List<InstanceAttributeOverride>();
public ICollection<InstanceAlarmOverride> AlarmOverrides { get; set; } = new List<InstanceAlarmOverride>();
public ICollection<InstanceConnectionBinding> ConnectionBindings { get; set; } = new List<InstanceConnectionBinding>();
public ICollection<InstanceNativeAlarmSourceOverride> NativeAlarmSourceOverrides { get; set; } = new List<InstanceNativeAlarmSourceOverride>();
public Instance(string uniqueName)
{
UniqueName = uniqueName ?? throw new ArgumentNullException(nameof(uniqueName));
}
}
```
### Repository interfaces by consuming component
| Interface | Consuming component | Scope |
|---|---|---|
| `ITemplateEngineRepository` | Template Engine | Templates, attributes, alarms, native alarm sources, scripts, compositions, folders, instances, overrides, connection bindings, areas, shared scripts |
| `IDeploymentManagerRepository` | Deployment Manager | Deployment records, snapshots, system artifact deployments |
| `ISecurityRepository` | Security & Auth | LDAP group mappings, site scope rules |
| `IInboundApiRepository` | Inbound API | API keys, API method definitions |
| `IExternalSystemRepository` | External System Gateway | External system definitions, methods, database connection definitions |
| `INotificationRepository` | Notification Service | Notification lists, recipients, SMTP configuration |
| `INotificationOutboxRepository` | Notification Outbox | `Notifications` table: ingest, due-row polling, status transitions, KPI queries, bulk purge |
| `ISiteCallAuditRepository` | Site Call Audit | `SiteCalls` table: ingest, upsert-on-newer-status, KPI queries, bulk purge |
| `IAuditLogRepository` | Audit Log | `AuditLog` table: idempotent ingest, keyset-paged query, partition switch-out, KPI snapshots, execution tree walk |
| `ISiteRepository` | Central UI, Site Runtime | Sites, data connections, site assignments |
| `ICentralUiRepository` | Central UI | Read-spanning queries for display |
`IAuditLogRepository` enforces the append-only contract at the API level — it exposes no Update and no single-row Delete. Bulk purge is `SwitchOutPartitionAsync` only. Ingest is idempotent on `EventId`:
```csharp
// Interfaces/Repositories/IAuditLogRepository.cs
public interface IAuditLogRepository
{
Task InsertIfNotExistsAsync(AuditEvent evt, CancellationToken ct = default);
Task<IReadOnlyList<AuditEvent>> QueryAsync(
AuditLogQueryFilter filter,
AuditLogPaging paging,
CancellationToken ct = default);
Task<long> SwitchOutPartitionAsync(DateTime monthBoundary, CancellationToken ct = default);
Task<IReadOnlyList<DateTime>> GetPartitionBoundariesOlderThanAsync(
DateTime threshold,
CancellationToken ct = default);
Task<AuditLogKpiSnapshot> GetKpiSnapshotAsync(
TimeSpan window,
DateTime? nowUtc = null,
CancellationToken ct = default);
Task<IReadOnlyList<ExecutionTreeNode>> GetExecutionTreeAsync(
Guid executionId,
CancellationToken ct = default);
Task<IReadOnlyList<string>> GetDistinctSourceNodesAsync(CancellationToken ct = default);
}
```
### Cross-cutting service interfaces
`Interfaces/Services/` holds service interfaces for cross-cutting concerns that multiple components consume but do not implement in Commons itself.
| Interface | Purpose | Implemented by |
|---|---|---|
| `IAuditService` | Configuration-change audit log entry (`LogAsync`). Central components call this through the UoW. | Configuration Database |
| `IAuditWriter` | Site hot-path audit write (`WriteAsync`). Best-effort; must never throw back at the caller. | Audit Log |
| `ICentralAuditWriter` | Central direct-write for central-originated audit rows; insert-if-not-exists on `EventId`. | Audit Log |
| `ISiteAuditQueue` | Hands off site audit rows to the gRPC telemetry forwarder. | Audit Log |
| `ICachedCallLifecycleObserver` / `ICachedCallTelemetryForwarder` | Bridge between the S&F Engine's lifecycle transitions and the `CachedCallTelemetry` packet. | Audit Log |
| `INodeIdentityProvider` | Resolves the current node's `SourceNode` label (`node-a`, `central-b`, etc.). | Host |
| `IOperationTrackingStore` | Site-local SQLite tracking status store for `Tracking.Status(id)`. | Site Runtime |
| `IPartitionMaintenance` | Central partition-switch / retention purge hook. | Audit Log |
| `IDatabaseGateway` | Script-facing ADO.NET database access via named connections. | External System Gateway |
| `IExternalSystemClient` | Script-facing `ExternalSystem.Call()` / `CachedCall()` invocation. | External System Gateway |
| `IInstanceLocator` | Resolves instance unique name to site identifier for `Route.To()`. | Management Service |
| `IAuditActorAccessor` | Resolves the authenticated principal's actor string for audit rows (inbound API middleware). | Security & Auth |
Transport bundle interfaces (`IBundleExporter`, `IBundleImporter`, `IBundleSessionStore`, `IAuditCorrelationContext`) live in `Interfaces/Transport/` and are defined in Commons so the Configuration Database and Central UI can depend on the abstraction without taking a Transport component dependency.
### Key shared types
**`Result<T>`** is the system-wide discriminated result type. A failed result always carries a non-blank error message; callers pattern-match via `Match`:
```csharp
// Types/Result.cs
public sealed class Result<T>
{
public bool IsSuccess { get; }
public bool IsFailure => !IsSuccess;
public T Value => IsSuccess ? _value! : throw new InvalidOperationException("...");
public string Error => IsFailure ? _error! : throw new InvalidOperationException("...");
public static Result<T> Success(T value) => new(value);
public static Result<T> Failure(string error) => new(error);
public TResult Match<TResult>(Func<T, TResult> onSuccess, Func<string, TResult> onFailure) =>
IsSuccess ? onSuccess(_value!) : onFailure(_error!);
}
```
**`TrackedOperationId`** is the strongly-typed GUID that identifies a cached outbound operation end-to-end — it is the idempotency key on every `AuditLog` row for that lifecycle and the primary key on the central `SiteCalls` row:
```csharp
// Types/TrackedOperationId.cs
public readonly record struct TrackedOperationId(Guid Value)
{
public static TrackedOperationId New() => new(Guid.NewGuid());
public static TrackedOperationId Parse(string s) => new(Guid.Parse(s));
public static bool TryParse(string? s, out TrackedOperationId result) { ... }
public override string ToString() => Value.ToString("D");
}
```
**`AlarmConditionState`** is the unified, read-only alarm condition model shared by computed and native alarms. Computed alarms populate it from `State` + `Priority`; native alarms mirror it from the OPC UA or MxAccess source:
```csharp
// Types/Alarms/AlarmConditionState.cs
public record AlarmConditionState(
bool Active,
bool Acknowledged,
bool? Confirmed, // null when the condition is not confirmable
AlarmShelveState Shelve,
bool Suppressed,
int Severity); // 0–1000 unified scale
```
**`ScadaBridgeAuditEventFactory`** is the single construction point for a canonical `AuditEvent`. Every audit emit site builds its row through `Create` so the domain-vocabulary-to-canonical-field mapping (`Channel`/`Kind`/`Status` → `Action`/`Category`/`Outcome`; all other ScadaBridge domain fields → `DetailsJson`) is applied identically everywhere with no per-site drift.
### Protocol abstraction
`Interfaces/Protocol/` defines the Data Connection Layer's protocol-neutral interfaces.
`IDataConnection` is the base interface for reading, writing, and subscribing to device data regardless of protocol. `IBrowsableDataConnection` is an optional capability interface for address-space browsing. `IAlarmSubscribableConnection` is an optional capability interface for connections that can mirror native alarms — implementations expose `SubscribeAlarmsAsync` and `UnsubscribeAlarmsAsync`, delivering transitions as protocol-neutral `NativeAlarmTransition` records via `AlarmTransitionCallback`. The `DataConnectionActor` consumes these via capability checks (runtime `is` cast), keeping protocol knowledge out of the core actor logic.
### Message contracts
`Messages/` organizes contracts by concern rather than by sender/receiver:
| Folder | Key types |
|---|---|
| `Deployment/` | `DeployInstanceCommand`, `DeploymentStatusResponse`, `FlattenedConfigurationSnapshot` |
| `Lifecycle/` | `DisableInstanceCommand`, `EnableInstanceCommand`, `DeleteInstanceCommand`, `InstanceLifecycleResponse` |
| `Health/` | `SiteHealthReport`, `HeartbeatMessage`, `NodeStatus`, `TagQualityCounts` |
| `Streaming/` | `AttributeValueChanged`, `AlarmStateChanged` (additively enriched for both computed and native alarms) |
| `Integration/` | `CachedCallTelemetry`, `AuditTelemetryEnvelope`, `PullAuditEventsRequest/Response` |
| `Notification/` | `NotificationSubmit`, `NotificationSubmitAck`, `NotificationStatusQuery/Response` |
| `Audit/` | `IngestAuditEventsCommand/Reply`, `IngestCachedTelemetryCommand/Reply`, `UpsertSiteCallCommand/Reply` |
| `RemoteQuery/` | Event log queries, parked-message queries, `ParkedOperationRelayMessages` |
| `Management/` | All HTTP Management API commands per domain area, `ManagementEnvelope`, `TransportCommands` |
`CachedCallTelemetry` carries one combined packet per lifecycle event so central can write the `AuditLog` row and the `SiteCalls` upsert in a single MS SQL transaction:
```csharp
// Messages/Integration/CachedCallTelemetry.cs
public sealed record CachedCallTelemetry(
AuditEvent Audit,
SiteCallOperational Operational);
```
`AlarmStateChanged` demonstrates the additive-only evolution rule in practice — the original positional constructor still compiles; native alarm fields are `init` properties with safe defaults, so existing computed-alarm emitters need no change:
```csharp
// Messages/Streaming/AlarmStateChanged.cs
public record AlarmStateChanged(
string InstanceUniqueName,
string AlarmName,
AlarmState State,
int Priority,
DateTimeOffset Timestamp) : ISiteStreamEvent
{
public AlarmLevel Level { get; init; } = AlarmLevel.None;
public AlarmKind Kind { get; init; } = AlarmKind.Computed;
// Condition uses a private backing field so the getter can return a
// computed default (AlarmConditionStateFactory.ForComputed(State, Priority))
// when no explicit value has been set via the init accessor.
private AlarmConditionState? _condition;
public AlarmConditionState Condition
{
get => _condition ?? AlarmConditionStateFactory.ForComputed(State, Priority);
init => _condition = value;
}
public string SourceReference { get; init; } = string.Empty;
// ... additional native-alarm fields with empty defaults
}
```
### Observability
`Observability/ScadaBridgeTelemetry` defines the singleton `Meter` named `ZB.MOM.WW.ScadaBridge` and the application-wide instrument definitions. Components call the static emit helpers (`RecordDeploymentApplied`, `SiteConnectionOpened`, etc.) rather than creating their own meters. Instruments are no-ops until an OTel listener attaches, so uninstrumented hosts pay no overhead.
## Usage
Commons is consumed through direct project references — all other components in the solution reference it. There is nothing to register or configure; the types are available as soon as the project reference is in place.
When adding a new entity class: add the POCO to the appropriate `Entities/<DomainArea>/` subfolder with no EF attributes, then add the corresponding repository method signature to the relevant interface in `Interfaces/Repositories/`. The Configuration Database component owns the EF mapping and the implementation.
When adding a new message contract: add an immutable `record` to the appropriate `Messages/<Concern>/` subfolder. If the message will cross the site→central version-skew boundary, apply the additive-only rule immediately — use `init` properties with defaults for any fields beyond the initial set so older receivers can safely ignore unknown fields.
## Dependencies & Interactions
- **Minimal dependencies** — Commons references the core .NET SDK and the `ZB.MOM.WW.Audit` package (for the canonical `AuditEvent` type). It does not reference Akka.NET, ASP.NET Core, Entity Framework Core, or any other third-party library.
- [Configuration Database (#17)](./ConfigurationDatabase.md) — implements every repository interface defined here (`ITemplateEngineRepository`, `IAuditLogRepository`, etc.) via EF Core Fluent API; maps the POCO entity classes to the underlying MS SQL schema.
- **All other components** — reference Commons for shared types, entity classes, interface contracts, and message definitions. The dependency graph is strictly one-way: Commons knows nothing about its consumers.
- [Audit Log (#23)](./AuditLog.md) — implements `IAuditWriter`, `ICentralAuditWriter`, `ISiteAuditQueue`, `ICachedCallLifecycleObserver`, and `ICachedCallTelemetryForwarder`; consumes `ScadaBridgeAuditEventFactory`, `AuditDetailsCodec`, `AuditRowProjection`, and the audit message contracts defined here.
- [Site Call Audit (#22)](./SiteCallAudit.md) — consumes `ISiteCallAuditRepository` and the `CachedCallTelemetry` / `UpsertSiteCallCommand` message types.
- [Notification Outbox (#21)](./NotificationOutbox.md) — consumes `INotificationOutboxRepository` and the `NotificationSubmit` / `NotificationSubmitAck` contracts.
- [Transport (#24)](./Transport.md) — its interfaces (`IBundleExporter`, `IBundleImporter`, `IBundleSessionStore`, `IAuditCorrelationContext`) and value objects (`BundleManifest`, `ImportPreview`, etc.) are defined in Commons so Configuration Database and Central UI can depend on the abstraction without a Transport project reference.
- Design spec: [Component-Commons.md](../requirements/Component-Commons.md).
## Related Documentation
- [Commons design specification](../requirements/Component-Commons.md)
- [Configuration Database](./ConfigurationDatabase.md)
- [Audit Log](./AuditLog.md)
- [Site Call Audit](./SiteCallAudit.md)
- [Notification Outbox](./NotificationOutbox.md)
- [Transport](./Transport.md)
- [Data Connection Layer](./DataConnectionLayer.md)
- [Health Monitoring](./HealthMonitoring.md)
+277
View File
@@ -0,0 +1,277 @@
# Central–Site Communication
The Central–Site Communication component is the transport layer that connects the central cluster to every site cluster. It provides two independent transports — Akka.NET `ClusterClient` for command/control and gRPC server-streaming for real-time data — wired together through a pair of actors that each cluster registers with the `ClusterClientReceptionist`.
## Overview
Communication (#5) runs on every node in every cluster. The component code lives in `src/ZB.MOM.WW.ScadaBridge.Communication/`, organised as follows:
- `Actors/` — `CentralCommunicationActor`, `SiteCommunicationActor`, `DebugStreamBridgeActor`, `StreamRelayActor`.
- `Grpc/` — `SiteStreamGrpcServer`, `SiteStreamGrpcClient`, `SiteStreamGrpcClientFactory`, `ISiteStreamSubscriber`, and the proto DTO mappers.
- `Protos/` — `sitestream.proto` (proto source; generated C# is vendored in `SiteStreamGrpc/`).
- `CommunicationService.cs` — typed Ask-pattern façade used by callers on the central side.
- `DebugStreamService.cs` — session manager for debug stream bridge actors.
- `CommunicationOptions.cs` — configuration options class.
- `ServiceCollectionExtensions.cs` — DI registration (`AddCommunication`).
DI registration is called from the Host composition root via `AddCommunication`. The actors themselves are created inside `AkkaHostedService.RegisterCentralActors` / `RegisterSiteActors` because they must be created within the actor system, not by the DI container.
## Key Concepts
### Two transports, two concerns
| Transport | Direction | Purpose |
|-----------|-----------|---------|
| Akka.NET `ClusterClient` | bidirectional (command/control) | Deployments, lifecycle, subscribe/unsubscribe handshake, snapshots, heartbeats, health reports, telemetry, notifications |
| gRPC server-streaming (`SiteStreamService`) | site → central | Real-time attribute value and alarm state changes |
The transports are independent. A gRPC stream interruption does not affect in-flight `ClusterClient` commands, and vice versa.
### Hub-and-spoke topology
Sites do not communicate with each other. All inter-cluster traffic flows through central. Central maintains one `ClusterClient` per site; each site maintains a single `ClusterClient` pointed at both central nodes.
### `SiteEnvelope` routing
Central-side callers wrap outbound messages in a `SiteEnvelope(SiteId, Message)`. `CentralCommunicationActor` resolves the site's `ClusterClient` by `SiteId` and forwards the inner message to `/user/site-communication` on the site:
```csharp
// CommunicationService.cs — deployment pattern
public async Task<DeploymentStatusResponse> DeployInstanceAsync(
string siteId, DeployInstanceCommand command, CancellationToken cancellationToken = default)
{
var envelope = new SiteEnvelope(siteId, command);
return await GetActor().Ask<DeploymentStatusResponse>(
envelope, _options.DeploymentTimeout, cancellationToken);
}
```
`CentralCommunicationActor.HandleSiteEnvelope` extracts the inner message and routes it via the cached `ClusterClient`:
```csharp
private void HandleSiteEnvelope(SiteEnvelope envelope)
{
if (!_siteClients.TryGetValue(envelope.SiteId, out var entry))
{
_log.Warning("No ClusterClient for site {0}, cannot route message {1}",
envelope.SiteId, envelope.Message.GetType().Name);
return; // caller's Ask times out — no central buffering
}
entry.Client.Tell(
new ClusterClient.Send("/user/site-communication", envelope.Message),
Sender);
}
```
### No central buffering
If a site is unreachable when a command arrives, the caller's Ask times out. Central never queues command/control messages on behalf of a site. This is deliberate: it keeps the central coordinator stateless with respect to site availability and pushes retry responsibility to the operator or to the Store-and-Forward Engine for messages that tolerate it.
## Architecture
### Central-side: `CentralCommunicationActor`
`CentralCommunicationActor` is a `ReceiveActor` created at `/user/central-communication` and registered with `ClusterClientReceptionist` so the site's `ClusterClient` can locate it. It owns:
- A `Dictionary<string, (IActorRef Client, ImmutableHashSet<string> ContactAddresses)>` keyed by site identifier — one `ClusterClient` per site.
- A `RefreshSiteAddresses` periodic timer (60-second cadence, starting immediately). Each tick fires `LoadSiteAddressesFromDb`, which reads every `Site` row from the database, parses `NodeAAddress` and `NodeBAddress` into Akka receptionist paths (`{addr}/system/receptionist`), and pipes a `SiteAddressCacheLoaded` message back to Self. `HandleSiteAddressCacheLoaded` creates, updates, or stops `ClusterClient` actors based on the diff.
- Proxy references to `NotificationOutboxActor` and `AuditLogIngestActor` cluster singletons, injected post-construction via `RegisterNotificationOutbox` / `RegisterAuditIngest` messages from the Host. Messages that arrive before the proxy is registered are answered with a non-accepted ack (notifications) or an empty reply (audit), so the site retries without data loss.
- Fanout of `SiteHealthReport` to the peer central node via `DistributedPubSub`, keyed on the `site-health-replica` topic, so both central nodes' aggregators stay in sync regardless of which central node the site's `ClusterClient` load-balanced the report to.
`ISiteClientFactory` / `DefaultSiteClientFactory` abstract `ClusterClient` construction for testability.
### Site-side: `SiteCommunicationActor`
`SiteCommunicationActor` is a `ReceiveActor` created at `/user/site-communication` and registered with `ClusterClientReceptionist`. It owns:
- An `IActorRef? _centralClient` — the site's outbound `ClusterClient` to central. Injected post-construction via `RegisterCentralClient`.
- A `Timers`-based heartbeat (default 5-second interval, first tick after 1 second). Each tick sends a `HeartbeatMessage` with `IsActive` stamped from the Akka `Cluster` leader check — the node is active when its `MemberStatus` is `Up` and it holds cluster leadership.
- Dispatch to local handlers for every inbound command pattern. Handlers for event-log, parked-message, integration, and artifact patterns are registered post-construction via `RegisterLocalHandler`; unregistered patterns receive an inline error reply so the central Ask does not stall.
Site-to-central messages (health reports, audit batches, notification submissions) are sent via:
```csharp
_centralClient.Tell(
new ClusterClient.Send("/user/central-communication", msg), Sender);
```
For request/response messages, the original `Sender` is forwarded as the `ClusterClient.Send` sender so any reply from central routes straight back to the waiting Ask on the site, not through `SiteCommunicationActor`. For fire-and-forget messages (e.g. `SiteHealthReport`), `Self` is used as the sender instead, because no reply is expected.
### Address loading and the 60-second refresh
`CentralCommunicationActor` calls `ISiteRepository.GetAllSitesAsync` inside a background `Task.Run` (to avoid blocking the actor thread on a database round-trip) and pipes the result as `SiteAddressCacheLoaded`. The actor-lifecycle `CancellationTokenSource` is threaded into the repository call so a slow MS SQL query is cancelled when the actor stops.
A malformed address for one site does not abort the refresh loop — the actor catches the parse failure, logs a warning, skips that site, and processes the rest. The refresh also runs immediately on startup (`TimeSpan.Zero` initial delay) so the cache is populated before the first command arrives.
`CommunicationService.RefreshSiteAddresses()` triggers an on-demand refresh when a site record is added, edited, or deleted from the Central UI or CLI.
### gRPC real-time data transport
Real-time attribute value and alarm state changes are delivered over `SiteStreamService`, a gRPC server-streaming service defined in `sitestream.proto`.
**Site-side** — `SiteStreamGrpcServer` (Kestrel HTTP/2, port 8083):
- Implements `SiteStreamService.SiteStreamServiceBase`.
- For each `SubscribeInstance` call, creates a `StreamRelayActor` (named `stream-relay-{correlationId}-{seq}`) and subscribes it to `ISiteStreamSubscriber` (implemented by `SiteStreamManager` in the Site Runtime project — `SiteStreamGrpcServer` holds only the interface so it does not reference `SiteRuntime` directly).
- Bridges events via a `BoundedChannel<SiteStreamEvent>` (capacity 1000, `DropOldest`) from the actor thread to the async gRPC write loop.
- Enforces a `GrpcMaxConcurrentStreams` limit (default 100) and a `GrpcMaxStreamLifetime` session timeout (default 4 hours) to evict zombie streams.
- Validates `correlation_id` against `ActorPath.IsValidPathElement` before use in an actor name, rejecting invalid values with `StatusCode.InvalidArgument`.
- During `CoordinatedShutdown`, `CancelAllStreams()` flips `_shuttingDown`, refuses new subscriptions with `StatusCode.Unavailable`, and cancels all active `CancellationTokenSource`s.
`StreamRelayActor` is a lightweight `ReceiveActor` that converts `AttributeValueChanged` and `AlarmStateChanged` domain events to proto `SiteStreamEvent` messages and writes them to the channel writer:
```csharp
// StreamRelayActor.cs
private void HandleAttributeValueChanged(AttributeValueChanged msg)
{
var protoEvent = new SiteStreamEvent
{
CorrelationId = _correlationId,
AttributeChanged = new AttributeValueUpdate
{
InstanceUniqueName = msg.InstanceUniqueName,
AttributePath = msg.AttributePath,
AttributeName = msg.AttributeName,
Value = ValueFormatter.FormatDisplayValue(msg.Value),
Quality = MapQuality(msg.Quality),
Timestamp = Timestamp.FromDateTimeOffset(msg.Timestamp)
}
};
WriteToChannel(protoEvent);
}
```
**Central-side** — `SiteStreamGrpcClient` / `SiteStreamGrpcClientFactory`:
- `SiteStreamGrpcClientFactory` (singleton) caches one `SiteStreamGrpcClient` per site identifier. On `GetOrCreate`, it compares the cached client's `Endpoint` to the requested endpoint and atomically replaces a stale client (different endpoint — NodeA→NodeB failover flip, or an edited address) with a fresh one.
- `SiteStreamGrpcClient` opens a `GrpcChannel` with HTTP/2 keepalive (`KeepAlivePingDelay` default 15 s, `KeepAlivePingTimeout` default 10 s, `KeepAlivePingPolicy.Always`). `SubscribeAsync` is a plain `async Task` that calls `SubscribeInstance` and reads the response stream with `await foreach`, invoking `onEvent` for each received event and `onError` on any non-cancellation exception. The caller (`DebugStreamBridgeActor.OpenGrpcStream`) launches it inside a `Task.Run` so the long-running stream loop runs off the actor thread.
### Debug stream session lifecycle
`DebugStreamService` manages one `DebugStreamBridgeActor` per active debug session. On `StartStreamAsync`, it resolves the instance's site and gRPC addresses, creates the bridge actor, and holds the session in a `ConcurrentDictionary`.
`DebugStreamBridgeActor` (one per session, short-lived, no persistence):
1. In `PreStart`, sends `SubscribeDebugViewRequest` to `CentralCommunicationActor` (ClusterClient, for the initial snapshot).
2. On receiving `DebugViewSnapshot`, fires `onEvent(snapshot)` and calls `OpenGrpcStream`.
3. `OpenGrpcStream` calls `_grpcFactory.GetOrCreate(siteId, endpoint)` and launches `client.SubscribeAsync(...)` as a background task. Domain events are marshalled back to the actor via `Self.Tell` for thread safety.
4. On a gRPC error, flips to the other node endpoint and retries (first retry immediate, subsequent retries with `ReconnectDelay` default 5 s). The retry budget (`MaxRetries = 3`) is recovered only after `StabilityWindow` (default 60 s) of uninterrupted connection — a stream that delivers one event then immediately fails does not count as stable.
5. On `StopDebugStream`, cancels the gRPC subscription and sends `UnsubscribeDebugViewRequest` to the site via ClusterClient.
### Proto definition summary
```proto
// Protos/sitestream.proto
service SiteStreamService {
rpc SubscribeInstance(InstanceStreamRequest) returns (stream SiteStreamEvent);
rpc IngestAuditEvents(AuditEventBatch) returns (IngestAck);
rpc IngestCachedTelemetry(CachedTelemetryBatch) returns (IngestAck);
rpc PullAuditEvents(PullAuditEventsRequest) returns (PullAuditEventsResponse);
}
```
`SubscribeInstance` carries the real-time data stream. The other three RPCs (`IngestAuditEvents`, `IngestCachedTelemetry`, `PullAuditEvents`) serve the Audit Log component's gRPC telemetry push and reconciliation pull paths — `SiteStreamGrpcServer` hosts them on the same port because sites already listen there.
`SiteStreamEvent` uses a `oneof event { AttributeValueUpdate, AlarmStateUpdate }` discriminator. `AlarmStateUpdate` carries the full native alarm condition (fields 8–21) alongside the base computed-alarm fields (1–7), added additively so old clients ignoring unknown fields continue to work.
## Usage
Central callers interact through `CommunicationService`, which wraps each command pattern in a typed `Ask` with a per-pattern timeout:
| Pattern | Method | Timeout |
|---------|--------|---------|
| Instance deployment | `DeployInstanceAsync` | 120 s |
| Instance lifecycle | `DisableInstanceAsync`, `EnableInstanceAsync`, `DeleteInstanceAsync` | 30 s |
| Artifact deployment | `DeployArtifactsAsync` | 60 s |
| Integration routing | `RouteIntegrationCallAsync` | 30 s |
| Debug snapshot | `RequestDebugSnapshotAsync` | 30 s |
| Remote queries | `QueryEventLogsAsync`, `QueryParkedMessagesAsync`, etc. | 30 s |
| OPC UA tag browse | `BrowseNodeAsync` | 30 s |
| Notification outbox (central-local) | `QueryNotificationOutboxAsync`, `RetryNotificationAsync`, etc. | 30 s |
| Site Call Audit (central-local) | `QuerySiteCallsAsync`, `RetrySiteCallAsync`, etc. | 30 s |
Notification Outbox and Site Call Audit actors are central-local singletons — their `CommunicationService` methods Ask the proxy directly without wrapping in `SiteEnvelope`.
For real-time streaming, callers use `DebugStreamService.StartStreamAsync`, which creates a `DebugStreamBridgeActor` and returns a session handle. Ongoing events arrive via the `onEvent` callback; session teardown is via `StopStreamAsync`.
## Configuration
All options are bound from the `Communication` section via `CommunicationOptions`:
| Key | Default | Description |
|-----|---------|-------------|
| `DeploymentTimeout` | `00:02:00` | Ask timeout for instance deployment commands. |
| `LifecycleTimeout` | `00:00:30` | Ask timeout for lifecycle commands (disable, enable, delete). |
| `ArtifactDeploymentTimeout` | `00:01:00` | Ask timeout for system-wide artifact deployment. |
| `QueryTimeout` | `00:00:30` | Ask timeout for remote queries and management commands. |
| `IntegrationTimeout` | `00:00:30` | Ask timeout for integration routing and Inbound API routing. |
| `DebugViewTimeout` | `00:00:10` | Ask timeout for debug subscribe/unsubscribe handshake. |
| `NotificationForwardTimeout` | `00:00:30` | Ask timeout for notification submission forwarding. |
| `CentralContactPoints` | `[]` | Site-side: Akka addresses of central nodes, e.g. `akka.tcp://scadabridge@central-a:8081`. |
| `GrpcKeepAlivePingDelay` | `00:00:15` | HTTP/2 keepalive PING interval on `SiteStreamGrpcClient`. |
| `GrpcKeepAlivePingTimeout` | `00:00:10` | HTTP/2 keepalive PING timeout. |
| `GrpcMaxStreamLifetime` | `04:00:00` | Per-stream session timeout; forces reconnect of zombie streams. |
| `GrpcMaxConcurrentStreams` | `100` | Max concurrent `SubscribeInstance` streams per site node. |
| `TransportHeartbeatInterval` | `00:00:05` | `SiteCommunicationActor` heartbeat cadence. |
| `TransportFailureThreshold` | `00:00:15` | Akka remoting failure-detection threshold. |
Three layers of dead-client detection protect the gRPC stream path:
| Layer | Detects | Timeline |
|-------|---------|----------|
| TCP RST | Clean process death, connection close | ~15 s |
| gRPC keepalive PING | Network partition, silent crash | ~25 s |
| Session timeout (`GrpcMaxStreamLifetime`) | Zombie streams with misconfigured keepalive | 4 h |
## Dependencies & Interactions
- [Commons (#16)](./Commons.md) — owns all message contracts used by this component: `DeployInstanceCommand`, `SiteEnvelope`, `HeartbeatMessage`, `SiteHealthReport`, `SiteHealthReportReplica`, `RegisterNotificationOutbox`, `RegisterAuditIngest`, `IngestAuditEventsCommand`, `IngestCachedTelemetryCommand`, and all other request/response records. Commons does not hold an Akka package reference, so `RegisterAuditIngest` (which carries an `IActorRef`) lives in this project.
- [Cluster Infrastructure (#13)](./ClusterInfrastructure.md) — provides `ClusterClientReceptionist` registration and the active/standby leader model that `SiteCommunicationActor`'s `IsActive` check and `CentralCommunicationActor`'s `DistributedPubSub` fanout both depend on.
- [Configuration Database (#17)](./ConfigurationDatabase.md) — provides `ISiteRepository.GetAllSitesAsync` for address loading; site records carry `NodeAAddress`, `NodeBAddress`, `GrpcNodeAAddress`, `GrpcNodeBAddress`.
- [Deployment Manager (#2)](./DeploymentManager.md) — the primary consumer of command/control patterns 1–3. `CommunicationService` is injected into the Deployment Manager actor to send deployments, lifecycle commands, and artifact deployments to sites.
- [Site Runtime (#3)](./SiteRuntime.md) — `SiteCommunicationActor` forwards inbound commands to the `DeploymentManager` singleton proxy. `SiteStreamManager` (in Site Runtime) implements `ISiteStreamSubscriber` so `SiteStreamGrpcServer` can subscribe relay actors to instance event feeds without referencing Site Runtime directly.
- [Health Monitoring (#11)](./HealthMonitoring.md) — `CentralCommunicationActor` calls `ICentralHealthAggregator.MarkHeartbeat` and `ProcessReport` for every inbound heartbeat and health report. `DistributedPubSub` fanout keeps both central nodes' aggregators in sync.
- [Audit Log (#23)](./AuditLog.md) — `SiteStreamGrpcServer` hosts `IngestAuditEvents`, `IngestCachedTelemetry`, and `PullAuditEvents` RPCs. `CentralCommunicationActor` routes `IngestAuditEventsCommand` / `IngestCachedTelemetryCommand` ClusterClient messages to the `AuditLogIngestActor` proxy.
- [Notification Outbox (#21)](./NotificationOutbox.md) — `CentralCommunicationActor` routes `NotificationSubmit` / `NotificationStatusQuery` messages from sites to the `NotificationOutboxActor` proxy. `CommunicationService` Asks the proxy directly for central-UI outbox management calls.
- [Site Call Audit (#22)](./SiteCallAudit.md) — `CommunicationService` Asks the `SiteCallAuditActor` proxy directly for query and relay operations. `SiteCallAuditActor` issues `RetryParkedOperation` / `DiscardParkedOperation` relay commands to sites via `SiteEnvelope`; `SiteCommunicationActor` dispatches them to `_parkedMessageHandler`.
- [Store-and-Forward Engine (#6)](./StoreAndForward.md) — the site S&F Engine drives `NotificationSubmit` forwarding and cached-call telemetry emission through `SiteCommunicationActor`. Parked-message queries and retry/discard relay commands flow back the other way.
- [Management Service (#18)](./ManagementService.md) — `ManagementActor` is registered with `ClusterClientReceptionist` at `/user/management` on central; the CLI connects via its own separate `ClusterClient`. This is a distinct `ClusterClient` usage from the inter-cluster hub-and-spoke connections managed by this component.
- Design spec: [Component-Communication.md](../requirements/Component-Communication.md).
## Troubleshooting
### A site's commands fail immediately
Check that `NodeAAddress` and `NodeBAddress` are populated in the site configuration — if both are empty, `CentralCommunicationActor` logs a warning and skips that site on every refresh, so no `ClusterClient` is created and all commands time out. `CommunicationService.RefreshSiteAddresses()` triggers an on-demand refresh after an address is added.
### Commands are timing out but the site is reachable
A single malformed address string for one site can silently prevent `ClusterClient` creation for that site while other sites are unaffected. Check the logs for a `Warning` line from `HandleSiteAddressCacheLoaded` naming the offending site. The actor parse-guard catches the `ActorPath.Parse` exception per-site so the rest of the refresh proceeds.
A `Warning` at the `Status.Failure` handler in `CentralCommunicationActor` means `LoadSiteAddressesFromDb` itself threw (typically a SQL connection error); the cache is left stale until the next successful refresh.
### gRPC debug stream drops immediately after opening
`SiteStreamGrpcServer` rejects `correlation_id` values that contain characters invalid in Akka actor names (`/`, whitespace, etc.) with `StatusCode.InvalidArgument`. Verify that the calling `DebugStreamBridgeActor` generates a safe correlation ID.
After a site node failover, the `DebugStreamBridgeActor` attempts to reconnect to the other node endpoint (`_useNodeA` flips on each error). If both nodes are unreachable, the actor exhausts its 3-retry budget and calls `onTerminated`. The engineer must restart the debug session.
### Heartbeats arrive but health reports do not
`SiteCommunicationActor` sends heartbeats and health reports via separate paths. Health reports are sent only when the site's `HealthReportSender` publishes them (every 30 s by default). If heartbeats arrive but reports do not, the health-report sender on the site may have faulted — check site-side logs for errors in `HealthReportSender`.
## Related Documentation
- [Central–Site Communication design specification](../requirements/Component-Communication.md)
- [Commons](./Commons.md)
- [Cluster Infrastructure](./ClusterInfrastructure.md)
- [Configuration Database](./ConfigurationDatabase.md)
- [Deployment Manager](./DeploymentManager.md)
- [Site Runtime](./SiteRuntime.md)
- [Health Monitoring](./HealthMonitoring.md)
- [Audit Log](./AuditLog.md)
- [Notification Outbox](./NotificationOutbox.md)
- [Site Call Audit](./SiteCallAudit.md)
- [Store-and-Forward Engine](./StoreAndForward.md)
- [Management Service](./ManagementService.md)
+256
View File
@@ -0,0 +1,256 @@
# Configuration Database
The Configuration Database component is the exclusive EF Core data-access layer for the central MS SQL configuration store. It owns the `ScadaBridgeDbContext`, every `IEntityTypeConfiguration<T>` Fluent mapping, all repository implementations, the `IAuditService` and `IAuditCorrelationContext` implementations, the `AuditLogPartitionMaintenance` service, and the EF Core migration history. No other component references EF Core or touches the configuration database directly.
## Overview
The component lives in `src/ZB.MOM.WW.ScadaBridge.ConfigurationDatabase/` and is central-only — site clusters never load it. Its responsibilities break down into four areas:
- **DbContext + Fluent mappings** — `ScadaBridgeDbContext` maps ~30 Commons POCO entity types to SQL Server using `IEntityTypeConfiguration<T>` classes in `Configurations/`, registered wholesale via `modelBuilder.ApplyConfigurationsFromAssembly(...)`.
- **Repository implementations** — eleven scoped repositories implement the interfaces declared in Commons, covering every domain area from template authoring to audit log ingest.
- **Config-change audit** — `AuditService` implements `IAuditService`, staging an `AuditLogEntry` into the change tracker so it commits atomically with the entity change; `AuditCorrelationContext` threads a `BundleImportId` through `AsyncLocal<T>` so bundle-import audit rows are correlated without cross-contaminating concurrent import sessions.
- **Partition maintenance** — `AuditLogPartitionMaintenance` implements `IPartitionMaintenance`, rolling `pf_AuditLog_Month` forward by issuing `ALTER PARTITION FUNCTION … SPLIT RANGE` for each missing future monthly boundary.
The single DI entry point is `ServiceCollectionExtensions.AddConfigurationDatabase(string connectionString)`.
## Key Concepts
### Persistence-ignorant Commons entities
POCO entity classes and repository interfaces are declared in Commons and are entirely free of EF Core attributes. All EF knowledge — column types, max-lengths, indexes, value converters, relationships — lives in the `Configurations/` classes here. Consuming components depend on Commons types only; they never reference this project or EF Core directly.
### Secret-column encryption
Three columns carry authentication secrets: `SmtpConfiguration.Credentials`, `ExternalSystemDefinition.AuthConfiguration`, and `DatabaseConnectionDefinition.ConnectionString`. Each uses `EncryptedStringConverter`, an EF `ValueConverter<string?, string?>` that wraps ASP.NET Data Protection. The protector is purpose-scoped to `"ZB.MOM.WW.ScadaBridge.ConfigurationDatabase.EncryptedColumn"` and its key ring is persisted to the database itself (via `IDataProtectionKeyContext`), so both central nodes share one key ring and can read each other's writes.
`ScadaBridgeDbContext` exposes two constructors: the `(DbContextOptions)` single-argument form used by design-time EF tooling, and the `(DbContextOptions, IDataProtectionProvider)` form used at runtime. The runtime form encrypts; the design-time form substitutes a `SchemaOnlyDataProtector` that produces the same column schema but throws `InvalidOperationException` on any actual read or write, preventing silent encryption with a throwaway key. `AddConfigurationDatabase` always registers the runtime overload:
```csharp
// ServiceCollectionExtensions.AddConfigurationDatabase (excerpt)
services.AddScoped(serviceProvider =>
{
var options = serviceProvider.GetRequiredService<DbContextOptions<ScadaBridgeDbContext>>();
var protectionProvider = serviceProvider.GetRequiredService<IDataProtectionProvider>();
return new ScadaBridgeDbContext(options, protectionProvider);
});
services.AddDataProtection()
.PersistKeysToDbContext<ScadaBridgeDbContext>();
```
A `SecretAwareModelCacheKeyFactory` folds `HasSecretEncryptionProvider` into the EF model cache key so a provider-bearing and a schema-only context never share a cached model.
### Append-only AuditLog and DB-role enforcement
The central `dbo.AuditLog` table has two dedicated SQL Server roles:
| Role | Grants |
|------|--------|
| `scadabridge_audit_writer` | `INSERT`, `SELECT` on `AuditLog` only — no `UPDATE`, no `DELETE` |
| `scadabridge_audit_purger` | `ALTER ON SCHEMA::dbo` (required for `SPLIT RANGE` and partition switch-out) |
Row-level `DELETE` on `AuditLog` is not granted even to the purge role; retention is always a partition switch, never a row delete.
## Architecture
### DbContext
`ScadaBridgeDbContext` exposes one `DbSet<T>` per mapped entity — templates, instances, sites, data connections, external systems, notifications, shared scripts, security mappings, deployment records, API methods, `AuditLogEntry`, `AuditLogRow` (the `dbo.AuditLog` persistence shape), `SiteCall`, and `DataProtectionKey`. `OnModelCreating` delegates all mapping to the `Configurations/` assembly scan, then applies secret-column encryption and strips computed-column SQL for non-SQL-Server providers (so integration tests using SQLite can still call `EnsureCreated`).
### Fluent API entity configurations
Each entity has its own `IEntityTypeConfiguration<T>` in `Configurations/`. Representative examples:
**`AuditLogEntityTypeConfiguration`** maps `AuditLogRow` to `dbo.AuditLog`. The table carries ten writable canonical columns plus five persisted computed columns derived from `DetailsJson` via `JSON_VALUE … PERSISTED` (`Kind`, `Status`, `SourceSiteId`, `ExecutionId`, `ParentExecutionId`) and one additional non-persisted computed column `IngestedAtUtc` (SWITCHOFFSET-based; SQL Server forbids PERSISTED on a non-deterministic expression). EF is configured with `ValueGeneratedOnAddOrUpdate()` and no write for the computed columns; the repository writes only the ten canonical columns and lets SQL Server derive the rest:
```csharp
// AuditLogEntityTypeConfiguration (excerpt)
builder.Property(e => e.Kind)
.HasConversion<string>()
.HasMaxLength(32)
.IsUnicode(false)
.HasComputedColumnSql("JSON_VALUE(DetailsJson,'$.kind')", stored: true)
.ValueGeneratedOnAddOrUpdate()
.IsRequired();
builder.Property(e => e.ExecutionId)
.HasComputedColumnSql(
"CAST(JSON_VALUE(DetailsJson,'$.executionId') AS uniqueidentifier)", stored: true)
.ValueGeneratedOnAddOrUpdate();
// Composite PK includes OccurredAtUtc for partition alignment
builder.HasKey(e => new { e.EventId, e.OccurredAtUtc });
builder.HasIndex(e => e.EventId).IsUnique()
.HasDatabaseName("UX_AuditLog_EventId");
```
**`TemplateConfiguration`** (representative of the domain-area configs) sets up the self-referencing parent FK, folder FK, cascade-delete relationships to attributes/alarms/scripts/compositions/native alarm sources, and the filtered unique index that enforces name uniqueness only on non-derived (base) templates.
**`SiteCallEntityTypeConfiguration`** maps `SiteCall` to `dbo.SiteCalls` with a `TrackedOperationId` PK stored as `varchar(36)` (GUID in `"D"` format) so the column shape matches the wire format and the site SQLite store — one consistent format for operational debugging.
### Repository implementations
All eleven repositories follow the same shape: they take `ScadaBridgeDbContext` by constructor injection, work with Commons POCO types, and never commit — callers invoke `SaveChangesAsync()` to commit the unit of work.
**`AuditLogRepository`** is the most specialized. Its `InsertIfNotExistsAsync` bypasses the change tracker and issues raw interpolated SQL because the computed columns must not appear in the INSERT column list:
```csharp
// AuditLogRepository.InsertIfNotExistsAsync (excerpt)
await _context.Database.ExecuteSqlInterpolatedAsync(
$@"IF NOT EXISTS (SELECT 1 FROM dbo.AuditLog WHERE EventId = {evt.EventId})
INSERT INTO dbo.AuditLog
(EventId, OccurredAtUtc, Actor, Action, Outcome, Category, Target, SourceNode, CorrelationId, DetailsJson)
VALUES
({evt.EventId}, {occurred}, {actor}, {evt.Action}, {outcome}, {category}, {evt.Target}, {evt.SourceNode}, {evt.CorrelationId}, {evt.DetailsJson});",
ct);
```
`FormattableString` interpolation parameterises every value so there is no injection surface. SQL error numbers `2601` and `2627` (unique-index violation) are swallowed as no-ops because the IF NOT EXISTS check has a race window; both the check-loser and the retrying telemetry path are semantically correct duplicates.
`QueryAsync` builds LINQ predicates over `AuditLogRow` using `AsNoTracking()`, translating filter dimensions (`Channels`, `Kinds`, `Statuses`, `SourceSiteIds`, `SourceNodes`, `ExecutionId`, `ParentExecutionId`, time range) to server-side SQL IN/equality predicates and using keyset pagination on `(OccurredAtUtc DESC, EventId DESC)`.
`GetExecutionTreeAsync` walks the `ParentExecutionId` graph in two phases: a loop climbs to the root (bounded at 32 levels), then a recursive CTE descends the full tree and LEFT JOINs back to `AuditLog` so stub nodes (purged or row-less executions) still appear with `RowCount = 0`.
`SwitchOutPartitionAsync` executes a drop-and-rebuild dance — dropping `UX_AuditLog_EventId`, creating a byte-identical staging table (including the computed-column definitions), switching the target partition to staging, dropping staging, and rebuilding the unique index — all inside a single `BEGIN TRY / BEGIN CATCH` block that guarantees the index is present whether the switch succeeds or rolls back.
### IAuditService — config-change audit
`AuditService` implements `IAuditService`, called by consuming components after each successful entity mutation. It constructs an `AuditLogEntry` with `Timestamp = DateTimeOffset.UtcNow`, serialises `afterState` to JSON tolerating reference cycles and capping depth at 32 to avoid unbounded payloads, stamps `BundleImportId` from the active `IAuditCorrelationContext`, and adds the entry to the change tracker only — the caller's `SaveChangesAsync()` commits the entry and the entity change atomically:
```csharp
// AuditService.LogAsync (excerpt)
var entry = new AuditLogEntry(user, action, entityType, entityId, entityName)
{
Timestamp = DateTimeOffset.UtcNow,
AfterStateJson = afterState != null ? SerializeAfterState(afterState) : null,
BundleImportId = _correlationContext.BundleImportId
};
await _context.AuditLogEntries.AddAsync(entry, cancellationToken);
```
`AuditCorrelationContext` backs `BundleImportId` with `AsyncLocal<Guid?>` so each logical call chain — each distinct bundle import invocation — carries its own value even when two imports share a DI scope. It is registered as scoped (to participate in the DI graph) but its in-memory state is per-call-context.
### Partition maintenance
`AuditLogPartitionMaintenance` implements `IPartitionMaintenance`. On each tick (driven by the `AuditLogPartitionMaintenanceService` hosted service in the Audit Log component) it reads the current max boundary from `sys.partition_range_values`, then issues `ALTER PARTITION SCHEME … NEXT USED` followed by `ALTER PARTITION FUNCTION … SPLIT RANGE` for each missing month up to the lookahead horizon. The NEXT USED re-issue before every SPLIT is required because SQL Server consumes the flag after the first split. A SPLIT failure propagates (rather than being swallowed) so a failed month blocks subsequent months and the next tick retries from the same boundary — no partition holes.
## Usage
### Registration
The Host calls `AddConfigurationDatabase` once, passing the `ScadaBridge:Database:ConfigurationDb` connection string:
```csharp
// Host composition root (excerpt)
services.AddConfigurationDatabase(
configuration["ScadaBridge:Database:ConfigurationDb"]!);
```
This registers `ScadaBridgeDbContext` as scoped (with the runtime encryption overload), all eleven repository interfaces bound to their implementations, `IAuditCorrelationContext` → `AuditCorrelationContext`, `IAuditService` → `AuditService`, `IInstanceLocator` → `InstanceLocator`, `IPartitionMaintenance` → `AuditLogPartitionMaintenance`, and the Data Protection key ring persisted to the database.
The obsolete zero-argument overload throws `InvalidOperationException` at startup (marked `error: true` on the `[Obsolete]` attribute) so a misconfigured host fails fast with a clear message rather than silently producing an empty DI registration.
### Consuming a repository
Consuming components resolve the Commons interface through DI and never reference this project:
```csharp
// Example: TemplateEngineRepository usage pattern
public class SomeManagementHandler
{
private readonly ITemplateEngineRepository _repo;
private readonly IAuditService _audit;
public async Task CreateTemplateAsync(Template template, string user, CancellationToken ct)
{
await _repo.AddTemplateAsync(template, ct);
await _audit.LogAsync(user, "Create", "Template",
template.Id.ToString(), template.Name, template, ct);
await _repo.SaveChangesAsync(ct); // single transaction
}
}
```
Repository `Add`/`Update`/`Delete` calls only stage changes on the change tracker. `SaveChangesAsync` on the context (exposed via the repository or accessed directly) is the unit-of-work commit.
### Migration management
`MigrationHelper.ApplyOrValidateMigrationsAsync` is called at startup after the `ScadaBridgeDbContext` is resolved. It first polls `CanConnectAsync` at a 2-second interval for up to 60 seconds (handling MSSQL container recovery lag), then:
- **Development** (`isDevelopment = true`): calls `dbContext.Database.MigrateAsync()` to auto-apply all pending migrations.
- **Production** (`isDevelopment = false`): calls `GetPendingMigrationsAsync()` and throws `InvalidOperationException` listing the pending migration names if any are outstanding. The host does not start until the schema is current.
Design-time tooling uses `DesignTimeDbContextFactory`, which reads the connection string from `ScadaBridge:Database:ConfigurationDb` in the Host's `appsettings.json` or from the `SCADABRIDGE_DESIGNTIME_CONNECTIONSTRING` environment variable. No hardcoded fallback exists — a missing connection string fails with an actionable message.
To generate production SQL scripts:
```bash
# All pending migrations as an idempotent script
dotnet ef migrations script --idempotent \
--project src/ZB.MOM.WW.ScadaBridge.ConfigurationDatabase \
--output migration.sql
# From a specific migration to another
dotnet ef migrations script FromMigration ToMigration \
--project src/ZB.MOM.WW.ScadaBridge.ConfigurationDatabase \
--output migration.sql
```
## Configuration
The connection string is the only configuration this component reads directly. It is passed as an argument to `AddConfigurationDatabase` and sourced from the Host options:
| Key | Notes |
|-----|-------|
| `ScadaBridge:Database:ConfigurationDb` | SQL Server connection string. Required; startup fails without it. |
The `SCADABRIDGE_DESIGNTIME_CONNECTIONSTRING` environment variable is an alternative source for `dotnet ef` tooling only.
## Dependencies & Interactions
- [Commons (#16)](./Commons.md) — all POCO entity classes (`Templates`, `Instances`, `Sites`, `AuditLogEntry`, `SiteCall`, …) and all repository interfaces (`ITemplateEngineRepository`, `IDeploymentManagerRepository`, `ISecurityRepository`, `IInboundApiRepository`, `IExternalSystemRepository`, `INotificationRepository`, `INotificationOutboxRepository`, `ISiteCallAuditRepository`, `IAuditLogRepository`, `ICentralUiRepository`, `ISiteRepository`) live there. Commons also declares `IAuditService`, `IAuditCorrelationContext`, `IPartitionMaintenance`, and `IInstanceLocator` — all implemented here.
- [Audit Log (#23)](./AuditLog.md) — `IAuditLogRepository` (implemented by `AuditLogRepository`) is the sole central write path for `dbo.AuditLog`. `AuditLogIngestActor`, `CentralAuditWriter`, and `SiteAuditReconciliationActor` all resolve it from a fresh per-message DI scope; the Audit Log component hosts the `AuditLogPartitionMaintenanceService` and `AuditLogPurgeActor` that drive the `IPartitionMaintenance` implementation registered here.
- [Template Engine (#1)](./TemplateEngine.md) — consumes `ITemplateEngineRepository` for all template, attribute, alarm, native alarm source, script, composition, instance, override, connection binding, and area operations.
- [Deployment Manager (#2)](./DeploymentManager.md) — consumes `IDeploymentManagerRepository` for deployment records and configuration snapshots.
- [Security & Auth (#10)](./Security.md) — consumes `ISecurityRepository` for LDAP group mappings and site scoping rules.
- [Inbound API (#14)](./InboundAPI.md) — consumes `IInboundApiRepository` for API method definitions.
- [External System Gateway (#7)](./ExternalSystemGateway.md) — consumes `IExternalSystemRepository` for external system and database connection definitions.
- [Notification Service (#8)](./NotificationService.md) — consumes `INotificationRepository` for notification lists, recipients, and SMTP configuration.
- [Notification Outbox (#21)](./NotificationOutbox.md) — consumes `INotificationOutboxRepository` for `dbo.Notifications` ingest, dispatcher polling, status transitions, KPI queries, and bulk purge of terminal rows.
- [Site Call Audit (#22)](./SiteCallAudit.md) — consumes `ISiteCallAuditRepository` for `dbo.SiteCalls` ingest, KPI queries, and bulk purge of terminal rows.
- [Central UI (#9)](./CentralUI.md) — consumes `ICentralUiRepository` for read-oriented cross-domain queries and the configuration audit log viewer.
- [Host (#15)](./Host.md) — provides the connection string, calls `AddConfigurationDatabase`, and invokes `MigrationHelper.ApplyOrValidateMigrationsAsync` at startup.
- All central components that modify configuration state — call `IAuditService.LogAsync()` and then `SaveChangesAsync()` so audit entries commit atomically with entity changes.
- Design spec: [Component-ConfigurationDatabase.md](../requirements/Component-ConfigurationDatabase.md)
## Troubleshooting
### Startup fails with "Database schema is out of date"
The host is running in production mode and `GetPendingMigrationsAsync` found unapplied migrations. Generate the idempotent SQL script (`dotnet ef migrations script --idempotent`) and apply it via SSMS before restarting the host.
### Startup stalls waiting for the database
`MigrationHelper` polls `CanConnectAsync` every 2 seconds for up to 60 seconds. If the 60-second deadline elapses the host throws `InvalidOperationException` naming the elapsed time and attempt count. Common causes: SQL Server container still in recovery, wrong connection string, database not yet attached.
### "Failed to decrypt an encrypted configuration column"
`EncryptedStringConverter.Unprotect` caught a `CryptographicException`. The Data Protection key ring is unavailable (keys deleted or the database was restored from a backup without the key rows) or the row was written by a different key ring. Restore the `DataProtectionKeys` table rows from a backup or re-provision the key ring and re-encrypt the affected column values.
### AuditLog partition switch fails mid-operation
`SwitchOutPartitionAsync` wraps the drop-and-rebuild dance in `BEGIN TRY / BEGIN CATCH`. On failure the CATCH block drops the staging table if it exists and rebuilds `UX_AuditLog_EventId` if it was dropped before the failure. The original exception is re-thrown so the Audit Log purge actor logs it and retries on the next daily tick. Verify that the `scadabridge_audit_purger` role still holds `ALTER ON SCHEMA::dbo` if the operation fails with a permissions error.
### Design-time `dotnet ef` tooling cannot find a connection string
Set `ScadaBridge:Database:ConfigurationDb` in the Host's `appsettings.json` (the factory looks for `../ZB.MOM.WW.ScadaBridge.Host` relative to the project directory) or export `SCADABRIDGE_DESIGNTIME_CONNECTIONSTRING`.
## Related Documentation
- [Configuration Database design specification](../requirements/Component-ConfigurationDatabase.md)
- [Audit Log](./AuditLog.md)
- [Notification Outbox](./NotificationOutbox.md)
- [Site Call Audit](./SiteCallAudit.md)
- [Commons](./Commons.md)
+309
View File
@@ -0,0 +1,309 @@
# Data Connection Layer
The Data Connection Layer is the site-only clean data pipe that abstracts protocol-specific communication behind a uniform actor/interface model, delivering live tag values and native alarm transitions to Instance Actors while hiding all connection lifecycle complexity from the rest of the system.
## Overview
The Data Connection Layer (#4) runs exclusively on site nodes. Central never reads from or writes to physical devices directly. The component code lives in `src/ZB.MOM.WW.ScadaBridge.DataConnectionLayer/`, with adapters in `Adapters/` and the actor hierarchy in `Actors/`.
Key collaborators are:
- `DataConnectionManagerActor` — the site-level router. One actor per site, child of the Site Runtime hierarchy. It owns the `connectionName → IActorRef` map and routes every inbound message to the right `DataConnectionActor`.
- `DataConnectionActor` — one per configured data connection. Owns a single `IDataConnection` adapter and models the full connection lifecycle as a Become/Stash state machine.
- `IDataConnection` (Commons) — the protocol adapter contract. Implemented by `OpcUaDataConnection` and `MxGatewayDataConnection`.
- `DataConnectionFactory` / `IDataConnectionFactory` — resolves a case-insensitive `ProtocolType` string to a fresh `IDataConnection` instance.
DI registration is through `ServiceCollectionExtensions.AddDataConnectionLayer`, which binds `DataConnectionOptions` from the `DataConnectionLayer` section and `OpcUaGlobalOptions` from the `OpcUa` section. The actors themselves are created inside the ActorSystem, not by DI.
## Key Concepts
### Clean data pipe
The DCL performs no alarm evaluation, no trigger evaluation, and no business logic. Its only job is to subscribe to tag paths requested by Instance Actors, deliver `TagValueUpdate` messages when values change, and route write requests down to the device. Every value change crosses exactly one boundary: device → adapter callback → `DataConnectionActor` → `InstanceActor`. The DCL never publishes to any actor other than the Instance Actors that subscribed.
### Become/Stash lifecycle state machine
`DataConnectionActor` uses Akka.NET's `Become`/`Stash` pattern to model three states cleanly:
| State | Behaviour |
|-------|-----------|
| `Connecting` | Stashes `SubscribeTagsRequest`, `WriteTagRequest`, `UnsubscribeTagsRequest`, `SubscribeAlarmsRequest`, `UnsubscribeAlarmsRequest`. Non-blocking for `ReadTagValuesCommand` (immediate synchronous failure reply) and `BrowseNodeCommand` (async failure via `PipeTo` when the adapter is `IBrowsableDataConnection`; immediate synchronous reply otherwise). |
| `Connected` | Processes all message types. On entering, calls `Stash.UnstashAll()`. |
| `Reconnecting` | Stashes new subscribe/write requests; allows `UnsubscribeTagsRequest` and `UnsubscribeAlarmsRequest` through for cleanup on instance stop. |
`BecomeConnecting` fires immediately in `PreStart`. `BecomeConnected` calls `Stash.UnstashAll()` so that subscribe requests which raced ahead of the connection during instance startup are processed without loss. The `IWithStash` and `IWithTimers` interfaces are declared on the actor; both are injected by the Akka.NET infrastructure.
### Adapter generation guard
Each `DataConnectionActor` holds an `_adapterGeneration` integer. When a new `IDataConnection` is created on failover, the generation is incremented. All subscription callbacks capture the generation at the time they are registered; a `TagValueReceived` or `AlarmTransitionReceived` whose generation does not match `_adapterGeneration` is silently dropped. This ensures stale callbacks from a disposed OPC UA SDK session or a closed gRPC stream never reach Instance Actors.
### Protocol extensibility
Adding a new protocol requires implementing `IDataConnection` (and optionally `IBrowsableDataConnection` and/or `IAlarmSubscribableConnection`) and calling `DataConnectionFactory.RegisterAdapter` with the new `ProtocolType` string. The manager and the actor are protocol-agnostic — they always work through the interfaces.
## Architecture
### Actor hierarchy
```text
DataConnectionManagerActor (one per site; child of DeploymentManagerActor)
├── DataConnectionActor "OpcSrv1" (one per configured connection)
├── DataConnectionActor "Gateway2"
└── ...
```
`DataConnectionManagerActor` is a `ReceiveActor`. It routes by connection name: `HandleRoute` calls `actor.Forward(request)` when the connection exists or sends a typed failure reply when it does not. The manager owns `ConnectionNotFound` failures; `DataConnectionActor` owns everything else (not connected, server errors, capability checks). The manager's `SupervisorStrategy` is `Resume` with up to ten restarts per minute, so a transient exception on a child never collapses the routing table.
`CreateConnectionCommand` (from the Site Runtime's `DeploymentManagerActor`) triggers `HandleCreateConnection`. The actor name is sanitised from the connection's display name to satisfy Akka's path character constraints.
### Connection state machine
```csharp
// DataConnectionActor.cs — PreStart and state transitions
protected override void PreStart()
{
_self = Self; // capture for off-thread use
_adapter.Disconnected += OnAdapterDisconnected;
BecomeConnecting();
}
private void OnAdapterDisconnected()
{
// fired from a background thread (gRPC stream, OPC UA keep-alive timer)
_self.Tell(new AdapterDisconnected()); // marshal onto actor loop
}
private void BecomeConnecting()
{
_healthCollector.UpdateConnectionHealth(_connectionName, ConnectionHealth.Connecting);
Become(Connecting);
Self.Tell(new AttemptConnect());
}
private void BecomeConnected()
{
_lastConnectedAt = DateTimeOffset.UtcNow;
_healthCollector.UpdateConnectionHealth(_connectionName, ConnectionHealth.Connected);
Become(Connected);
Stash.UnstashAll();
}
private void BecomeReconnecting()
{
_healthCollector.UpdateConnectionHealth(_connectionName, ConnectionHealth.Disconnected);
Become(Reconnecting);
PushBadQualityForAllTags(); // immediate bad quality — Instance Actors see it now
PushAlarmSourceUnavailable(); // native alarm subscribers notified
Timers.StartSingleTimer("reconnect", new AttemptConnect(), _options.ReconnectInterval);
}
```
`AttemptConnect` fires `_adapter.ConnectAsync(...)` and pipes `ConnectResult` back to the actor via `PipeTo`. On success, `BecomeConnected` is called; on failure, the counter increments and the reconnect timer is re-armed.
### Transparent re-subscribe
`ReSubscribeAll` runs immediately after a successful reconnect, before `BecomeConnected`. It derives the tag list from `_subscriptionsByInstance` (the durable record of what every Instance Actor asked for) rather than from `_subscriptionIds` (which are adapter handles cleared on every disconnect). All in-flight and unresolved sets are cleared so the new adapter starts clean. Subscriptions are re-issued in parallel via `PipeTo`, so the actor is never blocked during re-subscribe.
```csharp
// DataConnectionActor.cs — HandleReconnectResult
if (result.Success)
{
_consecutiveFailures = 0;
ReSubscribeAll(); // tag subscriptions
ReSubscribeAllAlarms(); // native alarm feeds
BecomeConnected();
}
```
### Failover between primary and backup endpoints
When `BackupConnectionDetails` is provided, the actor tracks consecutive failures and unstable disconnects. After `FailoverRetryCount` consecutive failures (or unstable connections shorter than `StableConnectionThreshold`), the actor disposes the current adapter, creates a fresh one with the other endpoint's config via `DataConnectionFactory.Create`, and re-arms the connect timer. Failover is round-robin: primary → backup → primary. There is no auto-failback; the connection stays on whichever endpoint is currently working.
### Write path
Writes are "fire-and-forget" only in the sense that there is no store-and-forward; the calling script still receives a success or failure result. The `DataConnectionActor` pipes the outcome of `_adapter.WriteAsync` back to the original sender:
```csharp
// DataConnectionActor.cs — HandleWrite
private void HandleWrite(WriteTagRequest request)
{
var sender = Sender;
var cts = new CancellationTokenSource(_options.WriteTimeout);
_adapter.WriteAsync(request.TagPath, request.Value, cts.Token).ContinueWith(t =>
{
cts.Dispose();
if (t.IsCompletedSuccessfully)
return new WriteTagResponse(request.CorrelationId, t.Result.Success,
t.Result.ErrorMessage, DateTimeOffset.UtcNow);
if (t.IsCanceled || t.Exception?.GetBaseException() is OperationCanceledException)
return new WriteTagResponse(request.CorrelationId, false,
$"Write timeout after {_options.WriteTimeout.TotalSeconds:F0}s", DateTimeOffset.UtcNow);
return new WriteTagResponse(request.CorrelationId, false,
t.Exception?.GetBaseException().Message, DateTimeOffset.UtcNow);
}).PipeTo(sender);
}
```
`WriteTagResponse` is returned synchronously to the calling script (via the Instance Actor, which awaits it). There is no store-and-forward for writes — buffering stale setpoints for later replay is unsafe in a control context.
### Tag path resolution retry
When `_adapter.SubscribeAsync` throws a resolution-level exception (node not found, device still booting), the tag is added to `_unresolvedTags` and marked `QualityCode.Bad`. A periodic `RetryTagResolution` timer fires at `TagResolutionRetryInterval`. Only tags that are not already in `_resolutionInFlight` are dispatched on each tick, preventing duplicate concurrent subscribe calls for a slow device. When resolution succeeds, the tag moves from `_unresolvedTags` to `_subscriptionIds`, `_resolvedTags` is incremented, and the timer is cancelled once the set is empty.
A separate in-flight set `_subscribesInFlight` tracks the initial `SubscribeAsync` for newly requested tags. When two `SubscribeTagsRequest` messages arrive for different instances that share a tag path, the second observes the tag as already in flight, so only one adapter subscribe is issued.
### Tag subscriber reference counting
`_tagSubscriberCount` maps each tag path to the number of instances subscribed to it. When an instance unsubscribes, the count is decremented. The adapter `UnsubscribeAsync` call and the quality / resolution counters are updated only when the count reaches zero. This means a shared tag path (subscribed by two different instances bound to the same connection) remains active at the adapter until both instances stop.
## Usage
### Creating a connection
The Site Runtime's `DeploymentManagerActor` sends `CreateConnectionCommand` to `DataConnectionManagerActor` during instance deployment:
```csharp
// Commons/Messages/DataConnection/CreateConnectionCommand.cs
public record CreateConnectionCommand(
string ConnectionName,
string ProtocolType,
IDictionary<string, string> PrimaryConnectionDetails,
IDictionary<string, string>? BackupConnectionDetails = null,
int FailoverRetryCount = 3);
```
`DataConnectionManagerActor.HandleCreateConnection` calls `DataConnectionFactory.Create(ProtocolType, PrimaryConnectionDetails)` to produce the initial `IDataConnection` adapter and spawns a `DataConnectionActor` child.
### Subscribing and receiving values
Instance Actors send `SubscribeTagsRequest` and receive `TagValueUpdate` messages. The actor sends `TagValueUpdate` to the Instance Actor's ref for every value change notification from the adapter. On disconnect, `ConnectionQualityChanged` (with `QualityCode.Bad`) is sent to all subscribers so Instance Actors reflect staleness immediately without waiting for a per-tag callback.
### Browse (design-time only)
`BrowseNodeCommand` is routed to the named `DataConnectionActor`. If the adapter implements `IBrowsableDataConnection`, the actor calls `BrowseChildrenAsync` and pipes `BrowseNodeResult` back. The result is capped to ~100 KB before reply to stay within the Akka remote frame budget on the site→central crossing. Browse is never called on the hot path and Instance Actors never use it.
### Native alarm subscription
`NativeAlarmActor` (Site Runtime) sends `SubscribeAlarmsRequest` to the `DataConnectionManagerActor`, which forwards it to the named `DataConnectionActor`:
```csharp
// DataConnectionActor.cs — HandleSubscribeAlarms (key excerpt)
if (_adapter is not IAlarmSubscribableConnection alarmable)
{
subscriber.Tell(new SubscribeAlarmsResponse(
request.CorrelationId, request.InstanceUniqueName, false,
$"Connection '{_connectionName}' is not alarm-capable.", now));
return;
}
// register the subscriber for routing before issuing the adapter call
_alarmSourceSubscribers[request.SourceReference].Add(subscriber);
// open one feed per source reference; subsequent subscribers reuse it
alarmable.SubscribeAlarmsAsync(sourceRef, filter,
t => self.Tell(new AlarmTransitionReceived(t, generation)))
.ContinueWith(task => ...)
.PipeTo(self);
```
Incoming `AlarmTransitionReceived` is routed to all subscribers whose registered `SourceReference` is a prefix of the transition's `SourceObjectReference` or `SourceReference`. On disconnect, `PushAlarmSourceUnavailable` sends `NativeAlarmSourceUnavailable` to every alarm subscriber; on reconnect, `ReSubscribeAllAlarms` re-opens the feeds and the adapter replays a snapshot of currently-active conditions.
## Configuration
All settings under `DataConnectionLayer` in `appsettings.json` bind to `DataConnectionOptions`. Global protocol settings (`OpcUa`, `MxGateway` sections) bind to `OpcUaGlobalOptions` and `MxGatewayGlobalOptions`.
### Shared settings (`DataConnectionLayer` section)
| Key | Default | Description |
|-----|---------|-------------|
| `ReconnectInterval` | `00:00:05` | Fixed interval between reconnection attempts after a disconnect or failed connect. |
| `TagResolutionRetryInterval` | `00:00:10` | Retry interval for tag paths that could not be resolved on the device. |
| `WriteTimeout` | `00:00:30` | Timeout applied to each `WriteAsync` call. A hung write times out and returns an error to the calling script. |
| `StableConnectionThreshold` | `00:01:00` | A connection that drops before this duration is counted as an unstable disconnect toward `FailoverRetryCount`. |
### OPC UA global settings (`OpcUa` section)
| Key | Default | Description |
|-----|---------|-------------|
| `ApplicationName` | `ScadaBridge-DCL` | Application name used in the OPC UA certificate and session negotiation. |
| `TrustedIssuerStorePath` | `""` | File path to the trusted issuer certificate store. |
| `TrustedPeerStorePath` | `""` | File path to the trusted peer certificate store. |
| `RejectedCertificateStorePath` | `""` | File path to the rejected certificate store. |
Empty store paths fall back to a temp-path default so dev runs work without explicit configuration.
### Per-connection settings (OPC UA)
Stored in the `DataConnection.PrimaryConfiguration` JSON dict, parsed by `OpcUaEndpointConfigSerializer.FromFlatDict`.
| Key | Default | Description |
|-----|---------|-------------|
| `endpoint` / `EndpointUrl` | `opc.tcp://localhost:4840` | OPC UA server endpoint URL. |
| `SessionTimeoutMs` | `60000` | Session timeout in ms. |
| `OperationTimeoutMs` | `15000` | Transport operation timeout in ms. |
| `PublishingIntervalMs` | `1000` | Subscription publishing interval in ms. |
| `KeepAliveCount` | `10` | Keep-alive frames before session timeout. |
| `LifetimeCount` | `30` | Subscription lifetime in publish intervals. |
| `SamplingIntervalMs` | `1000` | Per-item server sampling rate in ms. |
| `QueueSize` | `10` | Per-item notification buffer size. |
| `SecurityMode` | `None` | `None`, `Sign`, or `SignAndEncrypt`. |
| `AutoAcceptUntrustedCerts` | `true` | Accept untrusted server certificates. |
### Per-connection settings (MxGateway)
Stored in `DataConnection.PrimaryConfiguration`, parsed by `MxGatewayEndpointConfigSerializer.FromFlatDict`.
| Key | Default | Description |
|-----|---------|-------------|
| `Endpoint` | `http://localhost:5000` | Gateway base URL. |
| `ApiKey` | — | Sent as `Authorization: Bearer <key>`. Redacted in logs. |
| `ClientName` | `scadabridge` | MXAccess client registration name. |
| `WriteUserId` | `0` | MXAccess user id on writes. `0` = no user context. |
| `UseTls` | `false` | Use TLS. |
| `CaFile` | — | CA certificate file path (TLS only). |
| `ServerName` | — | TLS server-name override. |
| `ReadTimeoutMs` | `5000` | `ReadBulk` per-call timeout in ms. |
### Per-connection failover settings
Carried on `CreateConnectionCommand` from the deployment artifact.
| Field | Default | Description |
|-------|---------|-------------|
| `BackupConnectionDetails` | `null` | If absent, the connection retries its single endpoint indefinitely. |
| `FailoverRetryCount` | `3` | Consecutive failures (or unstable disconnects) before switching endpoints. |
## Dependencies & Interactions
- [Site Runtime (#3)](./SiteRuntime.md) — `DeploymentManagerActor` sends `CreateConnectionCommand` / `RemoveConnectionCommand` to create and tear down connections during instance deployment. Instance Actors send `SubscribeTagsRequest` / `UnsubscribeTagsRequest` / `WriteTagRequest`. `NativeAlarmActor` (peer to `AlarmActor` under `InstanceActor`) sends `SubscribeAlarmsRequest` / `UnsubscribeAlarmsRequest` and receives `NativeAlarmTransitionUpdate` / `NativeAlarmSourceUnavailable`.
- [Commons (#16)](./Commons.md) — owns `IDataConnection`, `IAlarmSubscribableConnection`, `IBrowsableDataConnection` interfaces; `TagValue`, `QualityCode`, `NativeAlarmTransition`, `AlarmConditionState`, `AlarmTransitionKind` types; and all message contracts (`SubscribeTagsRequest/Response`, `TagValueUpdate`, `WriteTagRequest/Response`, `SubscribeAlarmsRequest/Response`, `NativeAlarmTransitionUpdate`, `NativeAlarmSourceUnavailable`, `CreateConnectionCommand`, `BrowseNodeCommand/Result`, `DataConnectionHealthReport`).
- [Health Monitoring (#11)](./HealthMonitoring.md) — `DataConnectionActor` calls `ISiteHealthCollector.UpdateConnectionHealth`, `UpdateTagResolution`, `UpdateTagQuality`, `UpdateConnectionEndpoint`, and `RemoveConnection` to keep the site health report current. `DataConnectionManagerActor` handles `GetAllHealthReports` by forwarding `GetHealthReport` to each child.
- [Site Event Logging (#12)](./SiteEventLogging.md) — `DataConnectionActor` logs connection lost, restored, and failover events via `ISiteEventLogger.LogEventAsync` (fire-and-forget). Absent in tests where `ISiteEventLogger` is null.
- Design spec: [Component-DataConnectionLayer.md](../requirements/Component-DataConnectionLayer.md).
## Troubleshooting
### Tags stuck in bad quality after reconnect
After a reconnect, `ReSubscribeAll` re-issues all subscriptions. Tags that resolve successfully immediately receive a seeded value from `ReadAsync` before the subscription's first change notification arrives, so the Instance Actor sees a good-quality value promptly. Tags that fail resolution land in `_unresolvedTags` and are retried at `TagResolutionRetryInterval`. If a tag is persistently bad, check whether the tag path exists on the device and whether the device is online. The health report's `ResolvedTags` vs `TotalSubscribedTags` counters expose the gap without requiring the debug view.
### Connection not failing over to backup
Failover is driven by two independent counters: `_consecutiveFailures` for connect-attempt failures, and `_consecutiveUnstableDisconnects` for connections that drop before `StableConnectionThreshold`. Either counter reaching `FailoverRetryCount` triggers a switch. A connection that succeeds but immediately drops does not increment `_consecutiveFailures` — it increments `_consecutiveUnstableDisconnects` instead. Verify which counter is relevant to the observed failure pattern. Failover events are written to Site Event Logging as `Warning` entries.
### Browse hangs on "loading…"
The `DataConnectionActor` caps `BrowseNodeResult` to ~100 KB before returning it. If the picker hangs rather than showing a "results truncated" hint, the reply may have been silently discarded by Akka remoting before reaching central (the reply crosses the site→central frame). Check whether the connection actor is in `Connecting` or `Reconnecting` state — browse while disconnected returns `BrowseFailureKind.ConnectionNotConnected` immediately rather than hanging.
### Alarm feed not receiving transitions
Confirm that the connection's adapter implements `IAlarmSubscribableConnection` (currently `OpcUaDataConnection` and `MxGatewayDataConnection`). A non-capable adapter returns `SubscribeAlarmsResponse(Success = false)`. If the capability is present, check whether the connection is in `Connected` state — `SubscribeAlarmsRequest` is stashed while `Connecting` or `Reconnecting` and processed on entering `Connected`. On reconnect, `ReSubscribeAllAlarms` re-opens the feeds and the adapter replays a snapshot.
## Related Documentation
- [Data Connection Layer design specification](../requirements/Component-DataConnectionLayer.md)
- [Site Runtime](./SiteRuntime.md)
- [Commons](./Commons.md)
- [Health Monitoring](./HealthMonitoring.md)
- [Site Event Logging](./SiteEventLogging.md)
- [Central–Site Communication](./Communication.md)
+209
View File
@@ -0,0 +1,209 @@
# Deployment Manager
The Deployment Manager is the central-side pipeline that takes a validated, flattened instance configuration from the Template Engine, ships it to a site via the Communication Layer, and tracks the result — along with full instance lifecycle commands and system-wide artifact distribution to all connected sites.
## Overview
Deployment Manager (#2) runs exclusively on the central cluster. The site-side counterpart — the Deployment Manager singleton inside Site Runtime — receives and applies what central sends; that actor's design is covered in Site Runtime (#3).
The component code lives in `src/ZB.MOM.WW.ScadaBridge.DeploymentManager/`:
- `DeploymentService` — per-instance deploy, disable, enable, delete, diff, and status queries.
- `ArtifactDeploymentService` — system-wide artifact broadcast and per-site retry.
- `FlatteningPipeline` — wraps the Template Engine's `FlatteningService`, `ValidationService`, and `RevisionHashService` into a single call used by `DeploymentService`.
- `OperationLockManager` — ref-counted per-instance `SemaphoreSlim(1,1)` that serialises all mutating operations on one instance.
- `StateTransitionValidator` — encodes the allowed state-transition matrix for `InstanceState`.
- `DeploymentStatusNotifier` — singleton in-process event broadcaster that pushes `DeploymentStatusChange` to the Central UI's Blazor circuits instead of letting them poll.
Registration entry point: `ServiceCollectionExtensions.AddDeploymentManager`. Options are bound from `ScadaBridge:DeploymentManager` in `appsettings.json`.
## Key Concepts
### Deployment identity
Every instance deployment carries two correlated identifiers:
- **`DeploymentId`** — a new `Guid` (formatted `"N"`) minted by `DeploymentService` at the start of each `DeployInstanceAsync` call.
- **`RevisionHash`** — computed by the Template Engine's `RevisionHashService` over the fully resolved `FlattenedConfiguration`. The hash captures the template state at the moment of flattening, so concurrent last-write-wins template edits do not affect an in-flight deployment.
The pair travels inside `DeployInstanceCommand` to the site. The site uses the `DeploymentId` to detect an already-applied identical command (idempotent re-delivery) and uses the `RevisionHash` to reject a stale configuration that predates what is already running.
Central stores the `RevisionHash` on `DeploymentRecord` and, after a confirmed success, on `DeployedConfigSnapshot`. Comparing the snapshot hash against the current-template hash determines whether an instance is stale without a site round-trip.
### Per-instance operation lock
`OperationLockManager` holds a `Dictionary<string, LockEntry>` keyed by instance `UniqueName`. Each `LockEntry` wraps a `SemaphoreSlim(1,1)` with a reference count so the semaphore is created on first contention and disposed when the last waiter clears. The lock covers all four mutating operations — deploy, disable, enable, delete — so they can never interleave on a single instance. Operations on different instances proceed in parallel.
Lock acquisition throws `TimeoutException` after `DeploymentManagerOptions.OperationLockTimeout` (default 5 s). The operation lock is in-memory and is therefore lost on a central failover; the design treats any in-progress deployment at failover time as failed.
### State transition rules
`StateTransitionValidator` enforces the following matrix:
| `InstanceState` | Deploy | Disable | Enable | Delete |
|-----------------|--------|---------|--------|--------|
| `NotDeployed` | Yes | No | No | Yes |
| `Enabled` | Yes | Yes | No | Yes |
| `Disabled` | Yes* | No | Yes | Yes |
\* Deploying from `Disabled` transitions the instance to `Enabled` on confirmed success.
### Optimistic concurrency on deployment status
`DeploymentRecord` carries a `RowVersion byte[]` column. EF Core uses this as an optimistic-concurrency token on every `UPDATE` and `DELETE`. A concurrent write to the same record surfaces as `DbUpdateConcurrencyException` rather than silently overwriting the peer's state.
### Failover and in-progress deployments
The operation lock is in-memory. If the active central node fails mid-deployment, the new active node has no lock and no knowledge of what the site received. The `DeploymentRecord` is left `InProgress` (or `Failed` if the failure path ran before the node died). Before allowing a re-deploy, `DeploymentService` calls `TryReconcileWithSiteAsync`, which queries the site for its currently-applied revision hash and reconciles rather than re-sending if the site already has the target revision.
## Architecture
### Instance deploy pipeline
`DeployInstanceAsync` executes the following sequence:
1. **Load and validate state** — loads the `Instance` from `IDeploymentManagerRepository` and checks the transition via `StateTransitionValidator`.
2. **Acquire operation lock** — `OperationLockManager.AcquireAsync` blocks competing operations on the same instance.
3. **Flatten and validate** — `IFlatteningPipeline.FlattenAndValidateAsync` runs the Template Engine pipeline and returns a `FlatteningPipelineResult` containing the `FlattenedConfiguration`, `RevisionHash`, and a `ValidationResult`. Semantic validation failures (call targets, argument types, trigger operand types, connection binding completeness) are returned to the caller before any record is written.
4. **Pre-deploy site reconciliation** — when the prior `DeploymentRecord` for the instance is `InProgress` or `Failed` with a timeout marker (`"Communication failure:"`), the service queries the site via `CommunicationService.QueryDeploymentStateAsync`. If the site already holds the target revision hash, the prior record is updated to `Success` and no new deployment is sent.
5. **Write `InProgress` record** — a single `DeploymentRecord` insert directly at `InProgress` status (no transient `Pending` hop). `IDeploymentStatusNotifier.NotifyStatusChanged` fires to push the status to the UI.
6. **Send `DeployInstanceCommand`** — the command carries `DeploymentId`, `InstanceUniqueName`, `RevisionHash`, `FlattenedConfigurationJson`, `DeployedBy`, and `Timestamp`.
7. **Commit terminal status** — the `DeploymentRecord` is updated to `Success` or `Failed` and saved before any post-success side effects run. This ordering ensures the recorded outcome can never be lost if a post-success write fails.
8. **Post-success side effects** — `ApplyPostSuccessSideEffectsAsync` sets `Instance.State = Enabled` (or preserves `Disabled` on the reconciliation path) and upserts the `DeployedConfigSnapshot`. These writes are best-effort: a failure here is logged at `Error` but does not flip the already-committed `Success` record back to `Failed`.
9. **Audit log** — `IAuditService.LogAsync` records `Deploy` / `DeployFailed` / `DeployReconciled` with the `DeploymentId`, status, and user.
Any exception in the site round-trip (steps 6–7) writes `DeploymentStatus.Failed` using `CancellationToken.None` so a cancelled outer token cannot prevent the failure record from being persisted:
```csharp
// DeploymentService.DeployInstanceAsync — exception handler
var isTimeout = ex is TimeoutException or OperationCanceledException;
record.Status = DeploymentStatus.Failed;
record.ErrorMessage = isTimeout
? $"{TimeoutFailurePrefix} {ex.Message}"
: $"Deployment error: {ex.Message}";
record.CompletedAt = DateTimeOffset.UtcNow;
await _repository.UpdateDeploymentRecordAsync(record, CancellationToken.None);
await _repository.SaveChangesAsync(CancellationToken.None);
NotifyStatusChange(record);
```
The `TimeoutFailurePrefix` constant (`"Communication failure:"`) is the marker that `ShouldQuerySiteBeforeRedeploy` checks on the next deploy attempt.
### Pre-deploy site reconciliation
`TryReconcileWithSiteAsync` is invoked only when a prior deployment record exists and `ShouldQuerySiteBeforeRedeploy` returns true:
```csharp
private static bool ShouldQuerySiteBeforeRedeploy(DeploymentRecord prior) =>
prior.Status == DeploymentStatus.InProgress
|| (prior.Status == DeploymentStatus.Failed
&& prior.ErrorMessage != null
&& prior.ErrorMessage.StartsWith(TimeoutFailurePrefix, StringComparison.Ordinal));
```
If the site responds that it is running the target `RevisionHash`, the stale prior record is updated to `Success` (with the hash corrected to the target), `ApplyPostSuccessSideEffectsAsync` runs with `forceEnabledState: false` to avoid undoing an intentional disable, and the caller receives the reconciled record. A query failure falls through to a normal deploy; the site's own stale-rejection logic is the safety net.
### Deployed config snapshot and diff
`DeployedConfigSnapshot` is a one-per-instance row that stores the `DeploymentId`, `RevisionHash`, and the full `FlattenedConfiguration` JSON as of the last confirmed success. `DeploymentService.GetDeploymentComparisonAsync` re-flattens the current template state, compares the hash, and feeds both configs to `DiffService.ComputeDiff` if the hashes differ, producing a `ConfigurationDiff` with added, removed, and changed attributes, alarms, scripts, and connection bindings.
### Artifact deployment
`ArtifactDeploymentService.DeployToAllSitesAsync` deploys the full system-wide artifact set to every site in parallel. It fetches system-wide artifacts (shared scripts, external systems with serialised methods, database connections, notification lists, SMTP configurations) once via `FetchGlobalArtifactsAsync` before the per-site loop, avoiding N×1 re-queries. Per-site data connections are fetched inside each per-site command build because they legitimately vary per site.
All per-site `DeployArtifactsCommand` messages share one `DeploymentId` so the audit log, UI summary, and persisted `SystemArtifactDeploymentRecord` all reference the same logical deployment. Each site runs under a `cts.CancelAfter(ArtifactDeploymentTimeoutPerSite)` linked source. Successful sites are never rolled back on other failures; individual failed sites are retryable via `RetryForSiteAsync`.
```csharp
// ArtifactDeploymentService — parallel per-site dispatch
var tasks = sites.Select(async site =>
{
using var cts = CancellationTokenSource.CreateLinkedTokenSource(cancellationToken);
cts.CancelAfter(_options.ArtifactDeploymentTimeoutPerSite);
var command = siteCommands[site.Id];
var response = await _communicationService.DeployArtifactsAsync(
site.SiteIdentifier, command, cts.Token);
return new SiteArtifactResult(
site.SiteIdentifier, site.Name, response.Success, response.ErrorMessage);
}).ToList();
```
Cross-site artifact version skew is supported by design: a site that missed an artifact deployment continues operating with its current versions until an operator retries.
### Status notification
`DeploymentStatusNotifier` is a DI singleton that exposes `event Action<DeploymentStatusChange>? StatusChanged`. `DeploymentService` calls `NotifyStatusChanged` at every point a `DeploymentRecord` status is written. The Central UI's deployment page subscribes at render time and re-renders over its Blazor Server SignalR circuit without polling. Each subscriber is invoked individually inside a try/catch so a disposed Blazor circuit cannot break the deployment pipeline.
## Usage
`DeploymentService` and `ArtifactDeploymentService` are scoped services, typically resolved by `ManagementService` actor handlers (triggered by `MgmtDeployArtifactsCommand`, `GetDeploymentDiffCommand`, and the instance lifecycle commands) or directly by Central UI Blazor components. Engineers interact through the Central UI; automated bulk operations (deploy all stale instances) decompose into individual `DeployInstanceAsync` calls.
Lifecycle commands (`DisableInstanceAsync`, `EnableInstanceAsync`, `DeleteInstanceAsync`) follow the same lock-then-command pattern as deploy, with `LifecycleCommandTimeout` applied as a linked `CancellationTokenSource` deadline:
```csharp
// DeploymentService — lifecycle command pattern (disable shown)
using var lockHandle = await _lockManager.AcquireAsync(
instance.UniqueName, _options.OperationLockTimeout, cancellationToken);
using var cts = CancellationTokenSource.CreateLinkedTokenSource(cancellationToken);
cts.CancelAfter(_options.LifecycleCommandTimeout);
response = await _communicationService.DisableInstanceAsync(siteId, command, cts.Token);
```
A timeout on a lifecycle command writes a `DisableTimedOut` / `EnableTimedOut` / `DeleteTimedOut` audit entry via `TryLogLifecycleTimeoutAsync` using `CancellationToken.None`, mirroring the `DeployFailed` audit pattern. The site-side `Instance` state is only updated in the central DB after the site confirms success; a timeout leaves the DB state unchanged.
Delete is stricter than disable/enable: if the site confirms but the central `DeleteInstanceAsync` repository call subsequently fails, the instance record is orphaned. The service logs at `Error`, records a `DeleteOrphaned` audit entry, and returns a descriptive failure so an operator can reconcile — it does not retry automatically.
## Configuration
Options are registered via `AddDeploymentManager` and bound from `ScadaBridge:DeploymentManager`.
| Key | Default | Description |
|-----|---------|-------------|
| `OperationLockTimeout` | `00:00:05` | Maximum wait for the per-instance operation lock before throwing `TimeoutException`. |
| `LifecycleCommandTimeout` | `00:00:30` | Maximum round-trip for a disable, enable, or delete command before the operation is declared timed out. |
| `ArtifactDeploymentTimeoutPerSite` | `00:02:00` | Per-site deadline for a `DeployArtifactsCommand` response. Sites exceeding this are recorded as failed; others are unaffected. |
## Dependencies & Interactions
- [Template Engine (#1)](./TemplateEngine.md) — `FlatteningPipeline` delegates to `FlatteningService`, `ValidationService`, and `RevisionHashService`. Template state is captured at flatten time; last-write-wins edits made after flatten do not affect the in-flight deployment. `DiffService.ComputeDiff` powers the deployment diff view.
- [Configuration Database (#17)](./ConfigurationDatabase.md) — owns the EF Core implementation of `IDeploymentManagerRepository`, which stores `DeploymentRecord`, `DeployedConfigSnapshot`, and `SystemArtifactDeploymentRecord`. `IAuditService` (also registered by the Configuration Database component) writes all deployment audit rows.
- [Central–Site Communication (#5)](./Communication.md) — `CommunicationService` provides `DeployInstanceAsync`, `QueryDeploymentStateAsync`, `DeployArtifactsAsync`, `DisableInstanceAsync`, `EnableInstanceAsync`, and `DeleteInstanceAsync`. The communication layer routes by `SiteIdentifier` (string), not DB id; `DeploymentService.ResolveSiteIdentifierAsync` resolves the numeric `SiteId` before each cross-cluster call and treats a missing site row as a hard failure.
- [Commons (#16)](./Commons.md) — owns `DeploymentRecord`, `DeployedConfigSnapshot`, `SystemArtifactDeploymentRecord`, `DeploymentStatus`, `InstanceState`, `DeployInstanceCommand`, `DeployArtifactsCommand`, `DeploymentStateQueryRequest/Response`, `InstanceLifecycleResponse`, and the `IDeploymentManagerRepository` interface.
- [Site Runtime (#3)](./SiteRuntime.md) — receives `DeployInstanceCommand` and `DeployArtifactsCommand` via the Communication Layer. Site-side apply is all-or-nothing per instance: the Deployment Manager singleton at the site stores the config, compiles all scripts, and creates or replaces the Instance Actor as a unit. A failure at any step is reported back with the specific error message and the previous configuration remains active.
- [Central UI (#9)](./CentralUI.md) — engineers trigger deployments, view diffs, manage instance lifecycle, and deploy system-wide artifacts through the UI. The deployment status page subscribes to `IDeploymentStatusNotifier.StatusChanged` for real-time push updates via Blazor Server SignalR.
- [Management Service (#18)](./ManagementService.md) — the actor-layer entry point for deployment commands received over ClusterClient. It resolves `DeploymentService` and `ArtifactDeploymentService` from a per-message DI scope and forwards `MgmtDeployArtifactsCommand`, `GetDeploymentDiffCommand`, and instance lifecycle requests.
- [Security & Auth (#10)](./Security.md) — the Deployment role is required for all deploy and artifact operations; site-scoped permissions are enforced by the Central UI and Management Service before commands reach `DeploymentService`.
## Troubleshooting
### An instance is stuck InProgress after a central failover
The operation lock is in-memory. On failover the new active node has no lock entry, and the deployment record remains `InProgress`. When the engineer issues a re-deploy, `TryReconcileWithSiteAsync` queries the site; if the site already applied the config the record is updated to `Success` without re-sending. If the site did not apply it, a new deployment proceeds. No manual DB edits are required in the normal failover case.
### A deployment record shows Failed with "Communication failure:"
The site round-trip timed out or was cancelled before a response arrived. The site may or may not have applied the config. On the next deploy attempt the reconciliation query determines the ground truth. If the query also fails (site unreachable), a new `DeployInstanceCommand` is sent; the site rejects it with "already applied" if it ran the previous one.
### DeleteOrphaned audit entry
The site destroyed the Instance Actor but the central DB removal failed. The instance record exists in the central DB but has no corresponding site actor. It cannot be deleted through the normal UI path (the site will reject the delete command because the instance does not exist). Reconcile by removing the central record directly via the Management API or database, referencing the `CommandId` in the audit entry.
### Artifact deployment partially failed
`DeployToAllSitesAsync` returns an `ArtifactDeploymentSummary` with per-site `SiteArtifactResult`. Failed sites do not block or roll back successful ones. Use `RetryForSiteAsync` when the failed site is reachable again; it re-fetches all global artifacts and re-sends to the single site.
## Related Documentation
- [Deployment Manager design specification](../requirements/Component-DeploymentManager.md)
- [Template Engine](./TemplateEngine.md)
- [Site Runtime](./SiteRuntime.md)
- [Configuration Database](./ConfigurationDatabase.md)
- [Central–Site Communication](./Communication.md)
- [Commons](./Commons.md)
- [Central UI](./CentralUI.md)
- [Management Service](./ManagementService.md)
- [Security & Auth](./Security.md)
+226
View File
@@ -0,0 +1,226 @@
# External System Gateway
The External System Gateway gives site scripts two runtime capabilities: invoking HTTP/REST APIs on named external systems, and executing SQL writes against named database connections. Both capabilities expose a dual call mode — synchronous (blocking, result returned) and cached (store-and-forward on transient failure) — so scripts choose the right delivery guarantee per operation without knowing the underlying retry machinery.
## Overview
External System Gateway (#7) runs exclusively at the site. Definitions — external system endpoints with their authentication and method catalogue, and database connection strings — are authored centrally and deployed to the site's local SQLite by the Deployment Manager. The site never reaches back to the configuration database at call time; the repository resolves each definition from SQLite on the hot path.
The component code lives in `src/ZB.MOM.WW.ScadaBridge.ExternalSystemGateway/`, with all five source files at the root:
- `ExternalSystemClient.cs` — `IExternalSystemClient` implementation; `CallAsync` (synchronous) and `CachedCallAsync` (store-and-forward on transient failure), plus the `DeliverBufferedAsync` entry point consumed by the Store-and-Forward Engine during retry sweeps.
- `DatabaseGateway.cs` — `IDatabaseGateway` implementation; `GetConnectionAsync` (ADO.NET `SqlConnection`) and `CachedWriteAsync` (S&F-buffered SQL), plus its own `DeliverBufferedAsync` for the retry path.
- `ErrorClassifier.cs` — static helper that maps HTTP status codes and exception types to `TransientExternalSystemException` / `PermanentExternalSystemException`.
- `ExternalSystemGatewayOptions.cs` — options class bound from `ScadaBridge:ExternalSystemGateway`.
- `ServiceCollectionExtensions.cs` — `AddExternalSystemGateway` extension; registers `ExternalSystemClient` and `DatabaseGateway` as scoped services and applies per-system connection limits to named `HttpClient` instances.
Both services are DI-scoped. Script Execution Actors (short-lived, per-invocation) resolve them; blocking I/O from both runs on a dedicated Akka.NET dispatcher to keep the default dispatcher free for coordination actors.
## Key Concepts
### Definitions at rest
An `ExternalSystemDefinition` carries the base `EndpointUrl`, `AuthType` (`"apikey"`, `"basic"`, or `"none"`), `AuthConfiguration` (the credential payload), and per-system retry settings (`MaxRetries`, `RetryDelay`). Its child `ExternalSystemMethod` records each carry `HttpMethod`, `Path` (relative to the base URL), and JSON-serialized `ParameterDefinitions` / `ReturnDefinition`. A `DatabaseConnectionDefinition` carries an ADO.NET `ConnectionString` and its own `MaxRetries` / `RetryDelay`.
Definitions are resolved from the site SQLite repository on every call via name-keyed indexed queries (`GetExternalSystemByNameAsync`, `GetDatabaseConnectionByNameAsync`) rather than a fetch-all-then-filter scan, because definitions are read on every script invocation.
### Dual call modes
Every API call and every database write has two modes:
| Mode | API surface | Failure behaviour | Return value |
|------|-------------|-------------------|--------------|
| Synchronous | `ExternalSystem.Call()` / `Database.Connection()` | All failures returned to script | Response JSON / `DbConnection` |
| Cached | `ExternalSystem.CachedCall()` / `Database.CachedWrite()` | Transient → buffered; permanent → returned | `ExternalCallResult` (buffered) / `void` |
`CachedCallAsync` attempts immediate delivery first; only a transient failure routes to the Store-and-Forward Engine. `CachedWriteAsync` makes no immediate SQL attempt — it resolves the connection definition and enqueues directly.
### Error classification
`ErrorClassifier` is the authority on HTTP and exception transience for the synchronous call path:
- **HTTP status codes**: 5xx, 408 (Request Timeout), 429 (Too Many Requests) → transient. All other non-success 4xx → permanent.
- **Exceptions**: `HttpRequestException`, `TaskCanceledException`, `TimeoutException`, `OperationCanceledException` → transient.
`JsonException` during buffered-delivery payload deserialization is classified as permanent inline inside `DeliverBufferedAsync` (both `ExternalSystemClient` and `DatabaseGateway`), not via `ErrorClassifier` — a malformed payload will not become well-formed on retry, so it is parked immediately.
Transient failures on `CachedCall` / `CachedWrite` are silently buffered (logged at `Debug`). Permanent failures on the synchronous (`InvokeHttpAsync`) path are logged at `Warning` and returned to the calling script. Permanent failures detected during buffered retry delivery (`DeliverBufferedAsync`) are logged at `Error` before parking.
## Architecture
### HTTP invocation (`ExternalSystemClient`)
`InvokeHttpAsync` constructs the request, applies auth, dispatches, and classifies the response. The gateway creates a named `HttpClient` per system (`ExternalSystem_{systemName}`) through `IHttpClientFactory`, with `SocketsHttpHandler.MaxConnectionsPerServer` capped by `MaxConcurrentConnectionsPerSystem`. The framework default `HttpClient.Timeout` (100 s) is deliberately overridden to `Timeout.InfiniteTimeSpan` so the gateway's own `CancellationTokenSource(DefaultHttpTimeout)` is the sole timeout source — without this, configured timeouts above 100 s would be silently clipped.
Parameter routing by verb:
- `POST`, `PUT`, `PATCH` → JSON body (`application/json`).
- `GET`, `DELETE` → URL query string (null-valued parameters omitted; no trailing `?` when all values are null).
Auth application:
- `apikey` — `AuthConfiguration` format `"HeaderName:KeyValue"` or bare key value (default header `X-API-Key`).
- `basic` — `AuthConfiguration` format `"username:password"`, Base64-encoded as `Authorization: Basic ...`.
- `none` — silent no-op.
- Missing or malformed `AuthConfiguration` for a type that requires credentials logs a `Warning` but does not abort the call.
Error body embedded in script-visible messages is capped at 2 048 characters so a misbehaving endpoint cannot inflate error strings.
```csharp
// ExternalSystemClient.cs
catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested)
{
// The caller asked to abandon the work — do not reclassify as transient.
throw;
}
catch (OperationCanceledException ex) when (timeoutCts.IsCancellationRequested)
{
// Our own timeout elapsed — a transient failure per the design.
throw ErrorClassifier.AsTransient(
$"Timeout calling {system.Name} after {_options.DefaultHttpTimeout.TotalSeconds:0.##}s", ex);
}
catch (Exception ex) when (ErrorClassifier.IsTransient(ex))
{
throw ErrorClassifier.AsTransient($"Connection error to {system.Name}: {ex.Message}", ex);
}
```
### `CachedCallAsync` — the buffered path
On a transient failure, `CachedCallAsync` serializes `{SystemName, MethodName, Parameters}` as JSON and calls `StoreAndForwardService.EnqueueAsync` with `StoreAndForwardCategory.ExternalSystem`. Three details matter for correct S&F integration:
- **`attemptImmediateDelivery: false`** — the HTTP attempt has already been made; passing `true` would dispatch the same request twice.
- **`MaxRetries` / `RetryDelay` defaulting** — `ExternalSystemDefinition.MaxRetries` defaults to `0`, and the S&F engine treats a stored `0` as "no limit". A `0` is therefore passed as `null` so the engine's own bounded default applies, avoiding unbounded retry loops on unconfigured systems.
- **`messageId: trackedOperationId`** — pins the S&F message GUID to the caller-supplied `TrackedOperationId` so the retry loop can emit per-attempt and terminal audit telemetry under the same tracking id.
```csharp
// ExternalSystemClient.cs — transient branch of CachedCallAsync
await _storeAndForward.EnqueueAsync(
StoreAndForwardCategory.ExternalSystem,
systemName,
payload,
originInstanceName,
system.MaxRetries > 0 ? system.MaxRetries : null,
system.RetryDelay > TimeSpan.Zero ? system.RetryDelay : null,
attemptImmediateDelivery: false,
messageId: trackedOperationId?.ToString(),
executionId: executionId,
sourceScript: sourceScript,
parentExecutionId: parentExecutionId);
return new ExternalCallResult(true, null, null, WasBuffered: true);
```
### `DeliverBufferedAsync` — S&F retry delivery
The Store-and-Forward Engine calls `ExternalSystemClient.DeliverBufferedAsync` and `DatabaseGateway.DeliverBufferedAsync` during retry sweeps. Both methods:
1. Deserialize the payload JSON; treat `JsonException` as permanent (return `false` → park).
2. Re-resolve the definition by name; if gone, return `false` → park.
3. Execute the operation. `PermanentExternalSystemException` → park. `TransientExternalSystemException` propagates → engine retries.
### Database gateway (`DatabaseGateway`)
`GetConnectionAsync` resolves the `DatabaseConnectionDefinition`, opens a `SqlConnection` against `ConnectionString`, and returns the open connection. The caller owns disposal. If `OpenAsync` throws (unreachable server, bad credentials), the connection is disposed before the exception propagates.
`CachedWriteAsync` serializes `{ConnectionName, Sql, Parameters}` and enqueues to S&F under `StoreAndForwardCategory.CachedDbWrite`, with the same `MaxRetries` / `RetryDelay` defaulting logic as `CachedCallAsync`.
During retry delivery, `JsonElement` parameter values are converted with a numeric type preference of `long` → `decimal` → `double`. This matters because a script's decimal SQL parameter is serialized as an untagged JSON number; naively casting to `double` loses precision for money and measurement values.
```csharp
// DatabaseGateway.cs — JsonElementToParameterValue
JsonValueKind.Number => element.TryGetInt64(out var l)
? l
: element.TryGetDecimal(out var dec)
? dec
: element.GetDouble(),
```
## Usage
Scripts interact through `IExternalSystemClient` and `IDatabaseGateway`, which the Script Runtime Context exposes as `ExternalSystem` and `Database` respectively. Scripts never construct gateway types directly.
**Synchronous external system call** — blocks until the response arrives or the timeout elapses:
```csharp
// Script code (via ScriptRuntimeContext)
var result = await ExternalSystem.Call("MES", "GetRecipe", new { RecipeId = 42 });
if (result.Success)
{
var name = result.Response.recipeName; // dynamic JSON access
}
```
**Cached external system call** — returns immediately with a `TrackedOperationId`; the actual HTTP request is attempted once and, on transient failure, buffered for retry:
```csharp
var tracked = await ExternalSystem.CachedCall("MES", "PostProductionResult", payload);
// tracked.WasBuffered == true when queued to S&F
```
**Synchronous database access** — caller controls the connection lifetime:
```csharp
await using var conn = await Database.Connection("HistorianDB");
using var cmd = conn.CreateCommand();
cmd.CommandText = "SELECT TOP 1 Value FROM dbo.Tags WHERE Name = @name";
cmd.Parameters.AddWithValue("@name", tagName);
var value = await cmd.ExecuteScalarAsync();
```
**Cached database write** — enqueued immediately; returns nothing (`Task`):
```csharp
await Database.CachedWrite("MES_DB",
"INSERT INTO dbo.ProductionLog (BatchId, Qty) VALUES (@batchId, @qty)",
new { batchId = id, qty = quantity });
```
Call status is observable via `Tracking.Status(trackedOperationId)` — answered site-locally against the S&F tracking table, or centrally via the Site Call Audit page.
## Configuration
Options are bound from `ScadaBridge:ExternalSystemGateway` into `ExternalSystemGatewayOptions` by `AddExternalSystemGateway`.
| Key | Default | Description |
|-----|---------|-------------|
| `DefaultHttpTimeout` | `00:00:30` | Per-call HTTP round-trip timeout. Applied via `CancellationTokenSource`; overrides the framework 100 s default. |
| `MaxConcurrentConnectionsPerSystem` | `10` | `SocketsHttpHandler.MaxConnectionsPerServer` applied to each named `HttpClient` (`ExternalSystem_{name}`). Does not affect other host `HttpClient` instances. |
Per-system retry settings (`MaxRetries`, `RetryDelay`) are properties of `ExternalSystemDefinition` and `DatabaseConnectionDefinition`, authored by operators in the Central UI and deployed as part of the system artifact. The gateway passes these to the Store-and-Forward Engine on enqueue, substituting `null` for a `MaxRetries` of `0` or a `RetryDelay` of zero so the engine's own defaults apply.
There is no separate configuration section for database connections — connection strings reside in `DatabaseConnectionDefinition.ConnectionString`, deployed via artifact. Pool tuning (max pool size, connection lifetime) can be embedded in the connection string itself.
## Dependencies & Interactions
- [Commons (#16)](./Commons.md) — owns `IExternalSystemClient`, `IDatabaseGateway`, `ExternalCallResult`, `TrackedOperationId`, `ExternalSystemDefinition`, `ExternalSystemMethod`, `DatabaseConnectionDefinition`, `IExternalSystemRepository`, and the `StoreAndForwardCategory` enum values consumed here.
- [Store-and-Forward Engine (#6)](./StoreAndForward.md) — receives buffered `ExternalSystem` and `CachedDbWrite` payloads from `CachedCallAsync` / `CachedWriteAsync`; drives retry sweeps by calling `DeliverBufferedAsync` on both gateway types; assigns `TrackedOperationId` tracking rows; owns the site-local operation tracking table read by `Tracking.Status()`.
- [Configuration Database (#17)](./ConfigurationDatabase.md) — provides `IExternalSystemRepository`, implemented against the site SQLite replica. Central uses the same interface against MS SQL for definition management.
- [Site Runtime (#3)](../requirements/Component-SiteRuntime.md) — Script Execution Actors resolve `IExternalSystemClient` and `IDatabaseGateway` from DI and expose them to script code as `ExternalSystem` and `Database`. Actors run on a dedicated blocking I/O dispatcher to isolate HTTP and SQL waits from the actor system's default dispatcher.
- [Site Call Audit (#22)](./SiteCallAudit.md) — receives cached-call lifecycle telemetry (via the combined `CachedCallTelemetry` packet) so cached call status is observable centrally; the gateway's S&F delivery writes the tracking row that `Tracking.Status()` reads.
- [Audit Log (#23)](./AuditLog.md) — audit rows for `ApiOutbound` and `DbOutbound` channels are emitted by the Script Runtime Context around gateway calls; gateway itself does not write audit rows directly. The `trackedOperationId`, `executionId`, and `parentExecutionId` threaded through `CachedCallAsync` / `CachedWriteAsync` keep audit rows correlated across the retry lifecycle.
## Troubleshooting
### A cached call is stuck retrying
Setting `MaxRetries = 0` on an external system definition or database connection does not mean "no retries". The S&F engine interprets a stored `0` as "no limit" (retry forever), so the gateway normalizes `0` to `null` on enqueue and the engine's bounded default applies instead — the call is still retried. Verify the definition's `MaxRetries` field is set to the intended value in the Central UI and redeployed.
### Timeout is not being respected
`ExternalSystemGatewayOptions.DefaultHttpTimeout` applies only when `HttpClient.Timeout` is `Timeout.InfiniteTimeSpan`. The gateway sets this explicitly on every factory-supplied client. If a custom `HttpMessageHandler` upstream resets `Timeout`, the gateway's `CancellationTokenSource(DefaultHttpTimeout)` is still the controlling token because `SendAsync` is called with the linked token, not the raw `cancellationToken`.
### Auth header not sent
The gateway logs a `Warning` when `AuthType` is `"apikey"` or `"basic"` but `AuthConfiguration` is empty or absent, and when `AuthType` is `"basic"` but `AuthConfiguration` has no `:` separator. Check the site log for `ApplyAuth:` warning messages. The credential value is never logged — only the system name and auth type.
### A buffered call is parked immediately
A `JsonException` during `DeliverBufferedAsync` payload deserialization is treated as permanent (the same malformed payload will fail every time). The message is parked rather than retried. Check the site log for `"malformed JSON payload; parking"` alongside the message GUID, then inspect the S&F store for the payload to identify the serialization issue.
## Related Documentation
- [External System Gateway design specification](../requirements/Component-ExternalSystemGateway.md)
- [Store-and-Forward Engine](./StoreAndForward.md)
- [Site Call Audit](./SiteCallAudit.md)
- [Audit Log](./AuditLog.md)
- [Commons](./Commons.md)
- [Configuration Database](./ConfigurationDatabase.md)
+184
View File
@@ -0,0 +1,184 @@
# Health Monitoring
The Health Monitoring component collects operational metrics from site cluster subsystems, forwards them to central on a 30-second cadence, and exposes an in-memory aggregated view that the Central UI health dashboard polls.
## Overview
Health Monitoring (#11) runs on both site and central nodes with different roles on each side. It exists as a display-only, in-memory signal layer — no historical persistence, no alerting. The philosophy is current-status-only: the dashboard answers "what is the system doing right now?" rather than "what happened over the last hour?".
The component code lives in `src/ZB.MOM.WW.ScadaBridge.HealthMonitoring/`:
- `SiteHealthCollector` / `ISiteHealthCollector` — the site-side thread-safe accumulator that other site subsystems push metrics into. Script Actors, Alarm Actors, DCL connection actors, and the Audit Log bridge all call into this singleton.
- `HealthReportSender` — a `BackgroundService` that ticks every `ReportInterval` (30 s), atomically drains the collector into a `SiteHealthReport`, stamps a monotonic sequence number, and fires it to central over `IHealthReportTransport` (Akka Tell, fire-and-forget). Only the active node sends — the standby runs the loop but skips the send.
- `CentralHealthAggregator` / `ICentralHealthAggregator` — the central-side `BackgroundService` and in-memory store. Receives `SiteHealthReport` messages, applies them atomically via compare-and-swap on `SiteHealthState` records, and runs a periodic offline-detection sweep.
- `CentralHealthReportLoop` — central-only counterpart to `HealthReportSender`: generates a synthetic `SiteHealthReport` for the central cluster itself (site ID `$central`) so central appears as a first-class card on `/monitoring/health`. Only the Primary (leader) node generates reports.
- `SiteHealthState` — immutable record holding the latest report, last-heartbeat timestamp, sequence number, and online/offline flag for one site. Handed directly to UI callers; never torn because the aggregator swaps it atomically.
DI entry points are split by role: `AddSiteHealthMonitoring` for site nodes (registers `ISiteHealthCollector` + starts `HealthReportSender`); `AddCentralHealthAggregation` for central nodes (registers `CentralHealthAggregator` + starts `CentralHealthReportLoop`); `AddHealthMonitoring` for nodes that need the collector but not the sender (shared use). All three are idempotent with respect to `HealthMonitoringOptionsValidator` registration.
## Key Concepts
### Monotonic sequence numbers
Sequence numbers are seeded at construction with the current Unix epoch in milliseconds rather than starting at zero. This ensures that, after a site failover, the newly-active node's first report always carries a higher sequence number than any report the previous active node sent — the central aggregator's sequence guard would otherwise silently discard the new active's reports as stale. The same seeding applies to `CentralHealthReportLoop` for the `$central` synthetic site.
### Raw error counts per interval
`ScriptErrorCount`, `AlarmEvaluationErrorCount`, `DeadLetterCount`, `SiteAuditWriteFailures`, and `AuditRedactionFailure` are raw counts accumulated since the previous report, not rates. `SiteHealthCollector.CollectReport` atomically reads and resets each counter via `Interlocked.Exchange`. If the transport `Send` throws, `HealthReportSender` restores the drained counts back into the collector via `Interlocked.Add` so they roll forward into the next report rather than being silently lost. Concurrent increments that arrive during a failed send accumulate against zero and are preserved by the restore `Add`.
### Online/offline detection
Online status is driven by `LastHeartbeatAt`, not by `LastReportReceivedAt`. Heartbeats arrive from `SiteCommunicationActor` every ~5 s (`CommunicationOptions.TransportHeartbeatInterval`), so the 60 s `OfflineTimeout` tolerates roughly twelve missed heartbeats before declaring a site offline. A single-node failover — where the standby is alive but the active cannot produce a full report — therefore does not trigger a false offline transition.
The synthetic `$central` site has no heartbeat source; its only signal is the 30 s `CentralHealthReportLoop` self-report. It therefore gets a longer `CentralOfflineTimeout` (default 6 × `ReportInterval` = 180 s / 3 min), equivalent to ~6 missed report periods. The validator rejects any configuration where `CentralOfflineTimeout < OfflineTimeout`.
The offline-check `PeriodicTimer` runs at half the shorter of the two timeouts so whichever site class has the tighter window is swept at least twice within it.
### Dead-letter monitoring
`SiteHealthCollector.IncrementDeadLetter` is called by the site's Akka `EventStream` dead-letter subscriber. Each call atomically increments `_deadLetterCount`; the count appears in the next health report as `DeadLetterCount`. Dead letters indicate messages sent to actors that no longer exist — typically stale references or timing races after instance redeploy. The health dashboard surfaces the count per report interval for quick triage; Site Event Log Viewer provides the per-message detail.
## Architecture
### Site-side collection
`SiteHealthCollector` holds all per-interval counters (`_scriptErrorCount`, `_alarmErrorCount`, `_deadLetterCount`, `_siteAuditWriteFailures`, `_auditRedactionFailures`) as `int` fields touched only through `Interlocked` operations, and snapshot state (`ConcurrentDictionary` for connection health, tag resolution, and S&F buffer depths) that is overwritten rather than incremented. This split means `CollectReport` can atomically reset the counters in one pass while taking a point-in-time copy of the dictionaries, with no locks:
```csharp
// SiteHealthCollector.CollectReport (abbreviated)
public SiteHealthReport CollectReport(string siteId)
{
var scriptErrors = Interlocked.Exchange(ref _scriptErrorCount, 0);
var alarmErrors = Interlocked.Exchange(ref _alarmErrorCount, 0);
var deadLetters = Interlocked.Exchange(ref _deadLetterCount, 0);
var auditFailures = Interlocked.Exchange(ref _siteAuditWriteFailures, 0);
var redactFailures = Interlocked.Exchange(ref _auditRedactionFailures, 0);
return new SiteHealthReport(
SiteId: siteId,
SequenceNumber: 0, // caller stamps monotonic seq
ReportTimestamp: _timeProvider.GetUtcNow(),
ScriptErrorCount: scriptErrors,
DeadLetterCount: deadLetters,
SiteAuditWriteFailures: auditFailures,
AuditRedactionFailure: redactFailures,
/* ... connection snapshots, instance counts, S&F depths */
SiteAuditBacklog: _siteAuditBacklog);
}
```
The `SequenceNumber` field in the returned record is always `0`; `HealthReportSender` overwrites it with the atomically-incremented monotonic counter immediately before calling `_transport.Send`.
### Site-side report send
`HealthReportSender` is an active-node-only sender: at the top of each tick it checks `_collector.IsActiveNode` and skips the remainder when `false`. The active/standby flag is set by the Deployment Manager singleton ownership check, not by this component.
The send itself is synchronous and fire-and-forget (`IHealthReportTransport.Send` wraps an Akka `Tell`). A transport exception is caught, logged, and the interval counts are restored before re-throwing — the outer `catch (Exception)` swallows the rethrow so the background service never terminates from a single failed send.
### Central aggregation
`CentralHealthAggregator` stores one `SiteHealthState` record per site in a `ConcurrentDictionary<string, SiteHealthState>`. Every write (from `ProcessReport` or `MarkHeartbeat`) uses a compare-and-swap loop:
```csharp
// CentralHealthAggregator.ProcessReport (core CAS path)
var updated = existing with
{
LatestReport = report,
LastReportReceivedAt = now,
LastHeartbeatAt = now,
LastSequenceNumber = report.SequenceNumber,
IsOnline = true
};
if (_siteStates.TryUpdate(report.SiteId, updated, existing))
return;
// CAS lost — retry with fresh value
```
Sequence numbers guard against stale reports from a pre-failover node overwriting the new active's fresher state. A heartbeat for an unknown site (e.g., just after a central restart) registers the site as online with a null `LatestReport` so the site is not shown as "unknown" during the failover window.
The offline sweep runs on a `PeriodicTimer` at `ComputeCheckInterval(_options)` — half the shorter of `OfflineTimeout` and `CentralOfflineTimeout`. It checks `LastHeartbeatAt` (not report time) and applies a single non-retried CAS: if the CAS loses, the site was just heard from and leaving it online is correct.
### Audit Log metrics bridge
Audit Log registers `AddAuditLogHealthMetricsBridge` on site nodes after `AddSiteHealthMonitoring`. This replaces the default no-op failure counters with two bridges that forward directly into `ISiteHealthCollector`:
- `HealthMetricsAuditWriteFailureCounter` — called by `FallbackAuditWriter` on every primary SQLite failure; increments `SiteAuditWriteFailures`.
- `HealthMetricsAuditRedactionFailureCounter` — called by the payload redactor on every over-redaction event; increments `AuditRedactionFailure`.
A third collaborator, `SiteAuditBacklogReporter`, is a hosted service that polls `ISiteAuditQueue.GetBacklogStatsAsync` every 30 s and pushes a `SiteAuditBacklogSnapshot` into `ISiteHealthCollector.UpdateSiteAuditBacklog`. The snapshot (`PendingCount`, `OldestPendingUtc`, `OnDiskBytes`) rides as `SiteAuditBacklog` on the next health report. The poll runs in a separate service rather than inline in `CollectReport` to keep the report path free of synchronous SQLite I/O.
On central, `AuditCentralHealthSnapshot` (in the Audit Log component) is the symmetric counterpart: it accumulates `CentralAuditWriteFailures`, `AuditRedactionFailure`, and a per-site `SiteAuditTelemetryStalled` map fed by `SiteAuditTelemetryStalledTracker`. These are read by the central health dashboard alongside the aggregated site states. See [Audit Log](./AuditLog.md) for the full counter and stall-detection design.
## Usage
Site subsystems call `ISiteHealthCollector` directly — it is a DI singleton. Examples of callers by subsystem:
| Caller | Method | Metric in report |
|--------|--------|-----------------|
| Script Actor (on unhandled exception) | `IncrementScriptError()` | `ScriptErrorCount` |
| Alarm Actor (on eval failure) | `IncrementAlarmError()` | `AlarmEvaluationErrorCount` |
| Akka EventStream dead-letter subscriber | `IncrementDeadLetter()` | `DeadLetterCount` |
| DCL connection actor | `UpdateConnectionHealth(name, health)` | `DataConnectionStatuses` |
| DCL connection actor | `UpdateTagResolution(name, total, resolved)` | `TagResolutionCounts` |
| DCL connection actor | `UpdateTagQuality(name, good, bad, uncertain)` | `DataConnectionTagQuality` |
| Audit Log bridge | `IncrementSiteAuditWriteFailures()` | `SiteAuditWriteFailures` |
| Audit Log bridge | `IncrementAuditRedactionFailure()` | `AuditRedactionFailure` |
| `SiteAuditBacklogReporter` | `UpdateSiteAuditBacklog(snapshot)` | `SiteAuditBacklog` |
| `HealthReportSender` | `SetParkedMessageCount(count)` | `ParkedMessageCount` |
Central consumers resolve `ICentralHealthAggregator` and call `GetAllSiteStates()` or `GetSiteState(siteId)` to read a snapshot-safe dictionary of `SiteHealthState` records. The health dashboard polls this on a ~10 s timer. Because `SiteHealthState` is an immutable record swapped atomically, a consumer can hold the reference without risk of a torn read.
## Configuration
Options class: `HealthMonitoringOptions`, bound from the `ScadaBridge:HealthMonitoring` section. Validated at startup by `HealthMonitoringOptionsValidator` (registered with `ValidateOnStart`) so a bad configuration fails with a clear key-naming message rather than an opaque `ArgumentOutOfRangeException` inside a `PeriodicTimer` constructor.
| Key | Default | Constraint | Description |
|-----|---------|-----------|-------------|
| `ScadaBridge:HealthMonitoring:ReportInterval` | `00:00:30` (30 s) | Must be `> 0` | Interval at which site nodes emit health reports to central. Also the `CentralHealthReportLoop` self-report cadence. |
| `ScadaBridge:HealthMonitoring:OfflineTimeout` | `00:01:00` (60 s) | Must be `> 0` | Silence window after which a real site is marked offline. Driven by `LastHeartbeatAt`, not last report time. |
| `ScadaBridge:HealthMonitoring:CentralOfflineTimeout` | `00:03:00` (3 min) | Must be `>= OfflineTimeout` | Grace window for the `$central` synthetic site, which has no heartbeat source. Defaults to 6× `ReportInterval`. |
The offline-check cadence is derived at runtime as `min(OfflineTimeout, CentralOfflineTimeout) / 2` — not directly configurable.
## Dependencies & Interactions
- [Commons (#16)](./Commons.md) — defines `SiteHealthReport`, `SiteHealthReportReplica`, `NodeStatus`, `SiteAuditBacklogSnapshot`, and the `ISiteHealthCollector` / `ICentralHealthAggregator` interfaces consumed throughout. `SiteHealthReport` is an additive record; new fields use default values so existing producers remain valid.
- [Central–Site Communication (#5)](./Communication.md) — transports `SiteHealthReport` messages from site to central via Akka remoting (fire-and-forget Tell through `IHealthReportTransport`). Also delivers heartbeats from `SiteCommunicationActor` that `CentralHealthAggregator.MarkHeartbeat` uses to keep sites online between reports. `SiteHealthReportReplica` is broadcast on DistributedPubSub so both central nodes maintain identical aggregator state.
- [Site Runtime (#3)](./SiteRuntime.md) — Script Actors call `IncrementScriptError`; Alarm Actors call `IncrementAlarmError`; the Deployment Manager singleton ownership check drives `SetActiveNode`.
- [Data Connection Layer (#4)](./DataConnectionLayer.md) — connection actors call `UpdateConnectionHealth`, `UpdateTagResolution`, `UpdateConnectionEndpoint`, `UpdateTagQuality`, and `RemoveConnection` on `ISiteHealthCollector`.
- [Store-and-Forward Engine (#6)](./StoreAndForward.md) — `HealthReportSender` queries `StoreAndForwardStorage` for `GetParkedMessageCountAsync` and `GetBufferDepthByCategoryAsync`; the results populate `ParkedMessageCount` and `StoreAndForwardBufferDepths` (keyed by `StoreAndForwardCategory` name).
- [Cluster Infrastructure (#13)](./ClusterInfrastructure.md) — `IClusterNodeProvider` supplies the cluster node list to `HealthReportSender` (for the node-list payload); `HealthReportSender`'s active/standby gate is `_collector.IsActiveNode`, which is set externally by `DeploymentManagerActor.PreStart`/`PostStop`. `CentralHealthReportLoop` reads both `GetClusterNodes()` and `SelfIsPrimary` from `IClusterNodeProvider`. Heartbeat cadence (default 5 s) is owned by Cluster Infrastructure / `SiteCommunicationActor`.
- [Audit Log (#23)](./AuditLog.md) — `AddAuditLogHealthMetricsBridge` wires `HealthMetricsAuditWriteFailureCounter` and `HealthMetricsAuditRedactionFailureCounter` into the site collector, and registers `SiteAuditBacklogReporter` to poll the site-local SQLite drain backlog. On central, `AuditCentralHealthSnapshot` exposes `CentralAuditWriteFailures`, `AuditRedactionFailure`, and per-site `SiteAuditTelemetryStalled` alongside the aggregated site states on the health dashboard.
- [Central UI (#9)](./CentralUI.md) — the health dashboard resolves `ICentralHealthAggregator` and polls `GetAllSiteStates()` on a ~10 s timer. Notification Outbox and Site Call Audit KPIs are computed on demand from their own central tables by those components; Health Monitoring does not own or cache them.
- [Host (#15)](./Host.md) — implements `ISiteIdentityProvider` (supplies `SiteId` for report payloads) and `IClusterNodeProvider`, and calls the appropriate `Add*` entry points from the role-specific composition root.
## Troubleshooting
### A site flaps online/offline during single-node failover
The 60 s `OfflineTimeout` is driven by heartbeats, not reports. The standby node keeps sending heartbeats even when the active is down. If the site still shows as offline during a failover window shorter than 60 s, check that `SiteCommunicationActor` is running on the standby (it is not a singleton — both nodes run it) and that heartbeats are reaching central. Temporarily increasing `OfflineTimeout` reduces false-offline transitions at the cost of slower genuine-offline detection.
### Reports from the new active are silently discarded after failover
This happens when the new active's process-start sequence numbers fall below the prior active's last sequence number. `HealthReportSender` seeds `_sequenceNumber` with `TimeProvider.GetUtcNow().ToUnixTimeMilliseconds()` at construction, so this should not occur unless the new node's clock is significantly behind the old node's. Check time synchronization between site nodes.
### `$central` shows as offline
`CentralHealthReportLoop` only generates reports when `IClusterNodeProvider.SelfIsPrimary` is true. If both central nodes are healthy but the `$central` entry shows offline, the primary node's loop may have stalled or the Akka cluster may be in a split-brain state. Check `CentralHealthReportLoop` logs for `"Failed to generate central health report"` errors.
### `SiteAuditWriteFailures` non-zero
Non-zero `SiteAuditWriteFailures` in consecutive reports indicates the site-local SQLite audit writer is throwing persistently and rows are being routed to the `RingBufferFallback`. Check disk space and SQLite file health at the site node. See [Audit Log](./AuditLog.md) — the fallback ring is drop-oldest; sustained failure loses rows.
## Related Documentation
- [Health Monitoring design specification](../requirements/Component-HealthMonitoring.md)
- [Audit Log](./AuditLog.md)
- [Central–Site Communication](./Communication.md)
- [Site Runtime](./SiteRuntime.md)
- [Data Connection Layer](./DataConnectionLayer.md)
- [Store-and-Forward Engine](./StoreAndForward.md)
- [Cluster Infrastructure](./ClusterInfrastructure.md)
- [Commons](./Commons.md)
+257
View File
@@ -0,0 +1,257 @@
# Host
The Host is the single deployable binary for ScadaBridge. The same executable runs on every node — central and site alike — and selects its component set entirely from configuration, with no separate build targets or conditional compilation.
## Overview
Host (#15) is the composition root: it reads `ScadaBridge:Node:Role` from `appsettings.json` (layered with a role-specific override file selected by the `SCADABRIDGE_CONFIG` environment variable), runs pre-DI startup validation, wires every applicable component into the DI container and Akka.NET actor system, and then hands off to ASP.NET Core's `WebApplication` host.
The component code lives in `src/ZB.MOM.WW.ScadaBridge.Host/`, split across:
- `Program.cs` — the entry point: configuration loading, `StartupValidator`, role-branched DI registration, Kestrel setup, middleware pipeline, and endpoint mapping.
- `Actors/AkkaHostedService.cs` — owns the `ActorSystem` lifetime; builds HOCON from bound options; registers role-specific actors as cluster singletons or plain `ActorOf` calls.
- `Actors/DeadLetterMonitorActor.cs` — subscribes to the `DeadLetter` event stream and increments the health metric.
- `Health/ActiveNodeGate.cs` — production `IActiveNodeGate` backed by Akka cluster leadership; used by the Inbound API endpoint filter to gate traffic on standby nodes.
- `Health/AkkaClusterNodeProvider.cs` — feeds `IClusterNodeProvider` from live Akka cluster membership for health reporting.
- `SiteServiceRegistration.cs` — extracted site-role DI registrations reused by both `Program.cs` and integration test harnesses.
- `StartupValidator.cs` — pre-DI configuration preflight that fails fast before any actor system is created.
- `StartupRetry.cs` — bounded exponential-backoff helper for startup preconditions (database migrations).
- `LoggerConfigurationFactory.cs` — builds the Serilog `LoggerConfiguration` with node-identity enrichment.
## Key Concepts
### Role selection via `SCADABRIDGE_CONFIG`
The configuration builder layers `appsettings.json`, then `appsettings.{SCADABRIDGE_CONFIG}.json`, then environment variables and command-line arguments. The `SCADABRIDGE_CONFIG` environment variable selects the role-specific file (`Central` or `Site`); when absent, it falls back to `DOTNET_ENVIRONMENT`, and finally to `Production` when neither variable is set. `DOTNET_ENVIRONMENT`/`ASPNETCORE_ENVIRONMENT` remain `Development` for dev tooling (static assets, EF migrations) independently of which role is active.
```csharp
var scadabridgeConfig = Environment.GetEnvironmentVariable("SCADABRIDGE_CONFIG")
?? Environment.GetEnvironmentVariable("DOTNET_ENVIRONMENT")
?? "Production";
var configuration = new ConfigurationBuilder()
.AddJsonFile("appsettings.json", optional: false)
.AddJsonFile($"appsettings.{scadabridgeConfig}.json", optional: true)
.AddEnvironmentVariables()
.AddCommandLine(args)
.Build();
```
The resolved `ScadaBridge:Node:Role` value then branches the entire DI and Akka bootstrap.
### Pre-DI startup validation
`StartupValidator.Validate` runs before any DI or actor system setup. It assembles all errors, then throws a single `InvalidOperationException` listing every problem. This avoids the confusing partial-startup failures that occur when validation is deferred to first resolve. Site nodes additionally validate that `GrpcPort`, `MetricsPort`, and `RemotingPort` are all distinct and that no seed-node entry points at the gRPC port.
### Akka HOCON construction
`AkkaHostedService.BuildHocon` assembles the HOCON configuration document from strongly-typed options rather than inline strings. Every interpolated value passes through `QuoteHocon` (escapes backslashes and double-quotes) to prevent a hostname, seed-node URI, or split-brain strategy value from corrupting the document. Durations are rendered in milliseconds (`DurationHocon`) so sub-second timing values (e.g. a 750 ms heartbeat) are preserved exactly.
The actor system name is always `scadabridge`. Site nodes carry two cluster roles: the generic `"Site"` role and a per-site role (`"site-{SiteId}"`) used to scope cluster singletons to a specific site.
### `/health/ready` — readiness gating
Central nodes register `DatabaseHealthCheck<ScadaBridgeDbContext>` (tagged `Ready`) and `AkkaClusterHealthCheck` (tagged `Ready`). The `/health/ready` endpoint returns 200 only when both pass. Readiness is explicitly not tied to cluster leadership: a fully operational standby central node still reports ready because `ActiveNodeHealthCheck` carries only the `Active` tag, not `Ready`.
Load balancers and orchestrators should poll `/health/ready` to determine when a freshly started or failed-over node can receive traffic.
### `/health/active` — active-node routing for Traefik
`ActiveNodeHealthCheck` carries the `Active` tag and is served at `/health/active`. It returns 200 only on the cluster leader. Traefik polls this endpoint and routes inbound traffic — Central UI, Inbound API, Management API — exclusively to the node that answers 200. See [TraefikProxy](./TraefikProxy.md) for the upstream routing rules.
The same leadership check backs `ActiveNodeGate`, the `IActiveNodeGate` implementation the Inbound API endpoint filter consults before executing a method script. A standby node therefore refuses inbound API calls even if traffic somehow reaches it directly.
```csharp
public bool IsActiveNode
{
get
{
var system = _akkaService.ActorSystem;
if (system == null)
return false;
var cluster = Cluster.Get(system);
var self = cluster.SelfMember;
if (self.Status != MemberStatus.Up)
return false;
var leader = cluster.State.Leader;
return leader != null && leader == self.Address;
}
}
```
## Architecture
### Central composition root
`Program.cs` (Central branch) calls `WebApplication.CreateBuilder`, registers shared and central-only components, builds the `WebApplication`, applies or retries database migrations, and mounts the middleware pipeline and endpoints. The order is intentional: `UseAuthentication` and `UseAuthorization` run before `UseAuditWriteMiddleware` so `HttpContext.User` is populated when the audit row is written.
Before branching on role, `AkkaHostedService.StartAsync` creates one actor unconditionally on every node:
- `DeadLetterMonitorActor` — plain `ActorOf`; subscribes to the `DeadLetter` event stream on `PreStart`. Runs on both central and site nodes.
`AkkaHostedService.RegisterCentralActors` creates:
- `CentralCommunicationActor` — registered with `ClusterClientReceptionist` so site `ClusterClient`s can reach it.
- `ManagementActor` — also registered with `ClusterClientReceptionist`; the CLI connects via `ClusterClient` without joining the cluster.
- `NotificationOutboxActor` — cluster singleton (no role scope); a proxy is handed to `CentralCommunicationActor` so forwarded `NotificationSubmit` messages from sites are routed to it.
- `AuditLogIngestActor` — cluster singleton; proxy registered with both `CentralCommunicationActor` and (if present) the `SiteStreamGrpcServer`.
- `SiteCallAuditActor` — cluster singleton; a graceful-stop task is added to the `cluster-leave` coordinated-shutdown phase with a 10-second drain window.
### Site composition root
`Program.cs` (Site branch) calls `WebApplication.CreateBuilder` with a Kestrel configuration that binds two listeners: HTTP/2 only on `GrpcPort` (default 8083) for the gRPC server, and HTTP/1+2 on `MetricsPort` (default 8084) for the Prometheus `/metrics` scrape endpoint. The separation exists because a standard HTTP/1.1 Prometheus scraper cannot negotiate HTTP/2; the gRPC listener must stay pure HTTP/2.
`SiteServiceRegistration.Configure` registers the site-only components. `AkkaHostedService.RegisterSiteActorsAsync` creates:
- `DeploymentManagerActor` — cluster singleton scoped to `"site-{SiteId}"`.
- `SiteCommunicationActor` — registered with `ClusterClientReceptionist`; creates a `ClusterClient` to configured central contact points.
- `SiteReplicationActor` — one per node (not a singleton); handles best-effort S&F replication to the standby.
- `EventLogHandlerActor` — cluster singleton scoped to `"site-{SiteId}"`.
- `ParkedMessageHandlerActor` — bridges Akka to `StoreAndForwardService`.
- `SiteAuditTelemetryActor` — created on a dedicated `audit-telemetry-dispatcher` (2-thread `ForkJoinDispatcher`) so SQLite reads and gRPC pushes never contend with hot-path actors.
- `DataConnectionManagerActor` — if `IDataConnectionFactory` is registered.
Shutdown ordering for the site role is explicit: `IHostApplicationLifetime.ApplicationStopping` fires before `IHostedService.StopAsync`, so `SiteStreamGrpcServer.CancelAllStreams` is called first (clients observe a clean cancellation and reconnect), then `AkkaHostedService` runs `CoordinatedShutdown` and tears down actors.
```csharp
siteLifetime.ApplicationStopping.Register(() => siteGrpcServer.CancelAllStreams());
```
### Database migration retry
On central nodes, `StartupRetry.ExecuteWithRetryAsync` wraps the migration step with up to 8 attempts and exponential backoff that starts at 2 seconds and is capped at 30 seconds. Only connection-class faults (`SocketException`, `SqlException`, `DbException`, `TimeoutException`) are retried; a schema-version mismatch surfaces as an `InvalidOperationException` and fails immediately. The `ApplicationStopping` token is threaded into both the migration call and the inter-attempt `Task.Delay` so a SIGTERM during the retry window tears down cleanly.
## Usage
The Host is not consumed as a library; it is the executable entry point. Other components expose themselves to the Host via the extension-method convention:
- `IServiceCollection.AddXxx()` — registers DI services.
- `AkkaHostedService.RegisterXxxActors()` / inline `ActorOf` calls in `AkkaHostedService` — registers actors.
- `WebApplication.MapXxx()` — maps web endpoints (Central UI, Inbound API, Management API, Audit API).
`Program.cs` calls these methods; the component libraries own the registration logic. This keeps the Host thin and each component self-contained.
### Component registration by role
| Component | Central | Site | `AddXxx` | Actors | `MapXxx` |
|---|:---:|:---:|:---:|:---:|:---:|
| ClusterInfrastructure | Yes | Yes | Yes | — | — |
| Communication | Yes | Yes | Yes | Yes | — |
| HealthMonitoring | Yes | Yes | Yes | — | — |
| ExternalSystemGateway | Yes | Yes | Yes | — | — |
| AuditLog | Yes | Yes | Yes | Yes | — |
| NotificationService | Yes | No | Yes | — | — |
| NotificationOutbox | Yes | No | Yes | Yes (singleton) | — |
| SiteCallAudit | Yes | No | Yes | Yes (singleton) | — |
| TemplateEngine | Yes | No | Yes | Yes | — |
| DeploymentManager | Yes | No | Yes | Yes | — |
| Security | Yes | No | Yes | — | — |
| CentralUI | Yes | No | Yes | — | Yes |
| InboundAPI | Yes | No | Yes | — | Yes |
| ManagementService | Yes | No | Yes | Yes | Yes |
| Transport | Yes | No | Yes | — | — |
| ConfigurationDatabase | Yes | No | Yes | — | — |
| SiteRuntime | No | Yes | Yes | Yes (singleton) | — |
| DataConnectionLayer | No | Yes | Yes | Yes | — |
| StoreAndForward | No | Yes | Yes | Yes | — |
| SiteEventLogging | No | Yes | Yes | Yes (singleton) | — |
`AuditLog` calls `AddAuditLog` on both roles; central additionally calls `AddAuditLogCentralMaintenance`. Site calls `AddAuditLogHealthMetricsBridge` to bridge write failures into the site health report.
## Configuration
Options are bound via the .NET Options pattern (`IOptions<T>`). Each component owns its options class; the Host binds each section and passes the `IConfiguration` to component extension methods only where the component's own validator needs it at startup.
### `ScadaBridge:Node` – `NodeOptions`
| Key | Default | Description |
|-----|---------|-------------|
| `Role` | — | `"Central"` or `"Site"`. Validated by `StartupValidator`. |
| `NodeHostname` | — | Hostname or IP advertised to the Akka cluster and enriched on log entries. |
| `NodeName` | — | Free-form semantic name stamped as `SourceNode` on audit rows (e.g. `"central-a"`, `"node-b"`). Empty normalises to `null`. |
| `SiteId` | — | Site identifier; required for Site nodes; used to scope cluster singletons and enrich telemetry. |
| `RemotingPort` | `8081` | Akka.NET remoting TCP port. Must be in range 1–65535. |
| `GrpcPort` | `8083` | Kestrel HTTP/2 port for the site gRPC stream server (Site nodes only). Must differ from `RemotingPort`. |
| `MetricsPort` | `8084` | Kestrel HTTP/1+2 port for the Prometheus `/metrics` scrape endpoint (Site nodes only). Must differ from both `RemotingPort` and `GrpcPort`. |
### `ScadaBridge:Cluster` – `ClusterOptions`
| Key | Default | Description |
|-----|---------|-------------|
| `SeedNodes` | — | List of Akka seed-node URIs (`akka.tcp://scadabridge@host:port`). At least 2 required. Must reference remoting ports, not gRPC ports. |
| `SplitBrainResolverStrategy` | `keep-oldest` | Active strategy name (e.g. `"keep-oldest"`). |
| `StableAfter` | `"00:00:15"` | Duration the cluster must be stable before the resolver acts. |
| `HeartbeatInterval` | `"00:00:02"` | Akka failure-detector heartbeat cadence. |
| `FailureDetectionThreshold` | `"00:00:10"` | Acceptable heartbeat pause before a node is considered unreachable. |
| `MinNrOfMembers` | `1` | Minimum cluster members before the leader is elected. |
| `DownIfAlone` | `true` | When using `keep-oldest`, whether a lone surviving node downs itself. |
### `ScadaBridge:Database` – `DatabaseOptions`
| Key | Role | Description |
|-----|------|-------------|
| `ConfigurationDb` | Central | MS SQL connection string for the central `ScadaBridgeDbContext`. Required; validated by `StartupValidator`. |
| `SiteDbPath` | Site | Filesystem path to the site-local SQLite database. Required for Site nodes. |
### `ScadaBridge:Logging` – `LoggingOptions`
| Key | Default | Description |
|-----|---------|-------------|
| `MinimumLevel` | `"Information"` | Serilog minimum log level. Overrides any `Serilog:MinimumLevel` entry — a one-shot warning is emitted to `stderr` if both are present. Parsed case-insensitively; unrecognised values fall back to `Information` with a warning. |
Serilog sinks (console output template, file path, rolling interval) are configured under the standard `Serilog` JSON section and applied via `ReadFrom.Configuration`. Every log entry is enriched with `SiteId`, `NodeHostname`, and `NodeRole` properties from the resolved node configuration.
### `ScadaBridge:InboundApi:ApiKeyStore`
| Key | Default | Description |
|-----|---------|-------------|
| `SqlitePath` | `data/inbound-api-keys.sqlite` under content root | Path to the SQLite store for inbound API keys. |
| `TokenPrefix` | `"sbk"` | Prefix for issued API key tokens. Fixed; injected by the Host as in-memory config. |
| `PepperSecretName` | `"ScadaBridge:InboundApi:ApiKeyPepper"` | Configuration key holding the peppered-HMAC secret. The pepper itself must be ≥ 16 characters; validated by `StartupValidator`. |
| `RunMigrationsOnStartup` | `true` | Whether the hosted service creates the SQLite schema on first run. |
All other per-component configuration sections (`ScadaBridge:Communication`, `ScadaBridge:HealthMonitoring`, `ScadaBridge:Security`, `ScadaBridge:InboundApi`, `ScadaBridge:NotificationOutbox`, `ScadaBridge:Transport`, `ScadaBridge:DataConnection`, `ScadaBridge:StoreAndForward`, `ScadaBridge:SiteEventLog`, `ScadaBridge:SiteRuntime`, `ScadaBridge:Notification`) are bound by their respective component extension methods. The Host binds them at the shared `BindSharedOptions` call or at the role-specific `Configure<T>` sites in `Program.cs` and `SiteServiceRegistration.Configure`.
## Dependencies & Interactions
- **All 19 component libraries** — the Host project-references every component to call its extension methods. The Host is the only project with this fan-out; component libraries do not reference each other except where documented.
- [Cluster Infrastructure (#13)](./ClusterInfrastructure.md) — the Host configures the underlying Akka.NET cluster (`AkkaHostedService.BuildHocon`); ClusterInfrastructure manages it at runtime.
- [Configuration Database (#17)](./ConfigurationDatabase.md) — the Host registers `ScadaBridgeDbContext` and calls `AddConfigurationDatabase` (Central only); the `StartupRetry`-wrapped migration step runs before traffic is accepted.
- [Central–Site Communication (#5)](./Communication.md) — the Host creates `CentralCommunicationActor` and `SiteCommunicationActor`, registers them with `ClusterClientReceptionist`, and wires the `ClusterClient` for site→central messaging; the gRPC server is mapped at `app.MapGrpcService<SiteStreamGrpcServer>()`.
- [Health Monitoring (#11)](./HealthMonitoring.md) — the Host registers health checks (`DatabaseHealthCheck`, `AkkaClusterHealthCheck`, `ActiveNodeHealthCheck`) and mounts them via `app.MapZbHealth()` on central; site nodes register `AddSiteHealthMonitoring` and `AkkaHealthReportTransport`.
- [Audit Log (#23)](./AuditLog.md) — the Host calls `AddAuditLog` on both roles, `AddAuditLogCentralMaintenance` on central, and `AddAuditLogHealthMetricsBridge` on site; it creates the `AuditLogIngestActor` singleton and registers `SiteAuditTelemetryActor` on the dedicated dispatcher.
- [Notification Outbox (#21)](./NotificationOutbox.md) — the Host creates the `NotificationOutboxActor` cluster singleton and hands its proxy to `CentralCommunicationActor`.
- [Site Call Audit (#22)](./SiteCallAudit.md) — the Host creates the `SiteCallAuditActor` cluster singleton with a graceful-stop drain task registered in the `cluster-leave` coordinated-shutdown phase.
- [Management Service (#18)](./ManagementService.md) — the Host creates `ManagementActor` and registers it with `ClusterClientReceptionist`; maps the Management and Audit HTTP APIs.
- [Traefik Proxy (#20)](./TraefikProxy.md) — Traefik polls `/health/active` to determine which central node to route traffic to; the Host implements the `ActiveNodeHealthCheck` and `ActiveNodeGate` that back this endpoint.
- Design spec: [Component-Host.md](../requirements/Component-Host.md).
## Troubleshooting
### Node fails to start with validation errors
`StartupValidator` throws before any DI or actor system setup. The exception message lists all failing keys and their expected constraints. Common causes: missing `ScadaBridge:Node:Role`, a `GrpcPort`/`RemotingPort` collision on a site node, a seed-node URI that accidentally points at the gRPC port rather than the remoting port, or a missing `ConfigurationDb` connection string on a central node.
### Central node loops on database migration
`StartupRetry` retries connection-class faults up to 8 times (roughly 2 minutes worst-case). If the loop exhausts without success, the process exits with a `Fatal` log entry. Permanent errors (schema-version mismatch detected by `MigrationHelper`) are not retried and exit on the first attempt. Check `SqlException` details in the log to distinguish a connectivity failure from a schema fault.
### Dead letters appearing at startup
A burst of dead letters during startup is normal: actors send messages before their targets finish `PreStart`. `DeadLetterMonitorActor` logs each at `Warning` and increments the health counter — these are observable on the site health report. Sustained dead letters after the cluster stabilises indicate a stale actor reference or a lifecycle race.
### Standby central node receives traffic
If Traefik is not yet polling `/health/active` or its health-check interval has not elapsed after a failover, traffic may briefly reach the standby. `ActiveNodeGate` returns `false` on the standby, causing the Inbound API endpoint filter to respond `503 Service Unavailable`. The response header `X-ScadaBridge-Active: false` is present so the condition is identifiable in access logs. No operator action is needed; Traefik will reroute on its next health-check cycle.
## Related Documentation
- [Host design specification](../requirements/Component-Host.md)
- [Cluster Infrastructure](./ClusterInfrastructure.md)
- [Central–Site Communication](./Communication.md)
- [Configuration Database](./ConfigurationDatabase.md)
- [Health Monitoring](./HealthMonitoring.md)
- [Audit Log](./AuditLog.md)
- [Notification Outbox](./NotificationOutbox.md)
- [Site Call Audit](./SiteCallAudit.md)
- [Management Service](./ManagementService.md)
- [Traefik Proxy](./TraefikProxy.md)
+279
View File
@@ -0,0 +1,279 @@
# Inbound API
The Inbound API exposes a `POST /api/{methodName}` endpoint on the active central node so external systems can invoke C# scripts that live entirely on central, with the ability to reach any site instance through a routing surface. It is the inward counterpart of the External System Gateway — where that component handles scripts calling out, this handles callers coming in.
## Overview
Inbound API (#14) is a central-only, active-node-only component. Its code lives in `src/ZB.MOM.WW.ScadaBridge.InboundAPI/`, with shared entity and message types in `src/ZB.MOM.WW.ScadaBridge.Commons/`.
The component has three runtime responsibilities:
- **Auth and dispatch** – `EndpointExtensions.MapInboundAPI` registers the endpoint; `InboundApiEndpointFilter` enforces the active-node gate and body-size cap before the handler runs; the handler authenticates via `IApiKeyVerifier` and resolves the matching `ApiMethod` from `IInboundApiRepository`.
- **Script execution** – `InboundScriptExecutor` compiles `ApiMethod.Script` via Roslyn, caches the compiled delegate, and runs it inside `InboundScriptContext` against a method-level timeout.
- **Audit emission** – `AuditWriteMiddleware` wraps the entire request pipeline; it mints the per-request `ExecutionId`, buffers request and response bodies up to the configured cap, and writes one `ApiInbound` row to `ICentralAuditWriter` in its `finally` block regardless of outcome.
The DI entry point is `ServiceCollectionExtensions.AddInboundAPI`, which registers `InboundScriptExecutor` (singleton), `RouteHelper` (scoped), `CommunicationServiceInstanceRouter` (scoped), and `InboundApiEndpointFilter` (singleton). API key verification is registered separately by the Host composition root via `AddZbApiKeyAuth` – `AddInboundAPI` does not register it.
## Key Concepts
### API key authentication
Authentication uses a Bearer token in the `Authorization` header (`sbk_<keyId>_<secret>`). The shared `IApiKeyVerifier` performs a peppered-HMAC constant-time secret comparison against the key store. Every verifier failure — missing token, unknown key, revoked key, secret mismatch — maps to a single `401` with the body `{"error":"Invalid or missing API key"}` so the failure reason is never surfaced to the caller.
The spec describes `X-API-Key` header auth. The code has retired that header in favour of a `Bearer` token scheme (`Authorization: Bearer sbk_<keyId>_<secret>`). The constants `UnauthorizedMessage` and `NotApprovedMessage` in `EndpointExtensions` are each deliberately reused verbatim across their respective reject branches (all `401` failures share one body; all `403` failures share another) to prevent method enumeration.
### Per-method scope authorization
Once a key verifies, the handler checks whether `identity.Scopes.Contains(methodName)` (ordinal, case-sensitive) before making any database call. A key must carry the exact method name as a scope — `"Echo"` does not grant `"echo"`. If the scope check fails, or the subsequent `IInboundApiRepository.GetMethodByNameAsync` returns null, both branches emit `403` with the same body `{"error":"API key not approved for this method"}`. The scope check runs first to avoid a DB round-trip on the reject path and to eliminate a latency timing oracle.
### `ApiMethod` entity
`ApiMethod` (in `ZB.MOM.WW.ScadaBridge.Commons.Entities.InboundApi`) is the persistence-ignorant shape:
```csharp
public class ApiMethod
{
public int Id { get; set; }
public string Name { get; set; } // route segment
public string Script { get; set; } // Roslyn C# script body
public string? ParameterDefinitions { get; set; } // JSON: List<ParameterDefinition>
public string? ReturnDefinition { get; set; } // JSON: List<ReturnFieldDefinition>
public int TimeoutSeconds { get; set; }
}
```
`ParameterDefinitions` and `ReturnDefinition` are stored as JSON strings to keep the schema simple; both are deserialized on every request by `ParameterValidator` and `ReturnValueValidator`.
### Extended type system
Parameter and return field definitions share the same six-type vocabulary:
| Type | JSON shape | C# value after coercion |
|-----------|----------------------|-------------------------------------|
| `Boolean` | `true` / `false` | `bool` |
| `Integer` | number (whole) | `long` |
| `Float` | number | `double` |
| `String` | string | `string` |
| `Object` | JSON object | `Dictionary<string, object?>` |
| `List` | JSON array | `List<object?>` |
`Object` and `List` are validated for JSON shape only — field-level or element-level type constraints are the script's responsibility. Template attributes use only the four primitive types; the extended types apply here and in the External System Gateway.
## Architecture
### Request pipeline
```text
POST /api/{methodName}
├─ AuditWriteMiddleware ← mints ExecutionId; buffers bodies; emits audit row in finally
│ └─ InboundApiEndpointFilter ← 503 on standby node; 413 on oversized body
│ └─ HandleInboundApiRequest
│ ├─ IApiKeyVerifier.VerifyAsync ← 401 on any auth failure
│ ├─ scope check + GetMethodByNameAsync ← 403 on not-approved
│ ├─ ParameterValidator.Validate ← 400 on bad parameters
│ └─ InboundScriptExecutor.ExecuteAsync
│ ├─ ForbiddenApiChecker ← static trust model enforcement
│ ├─ Roslyn compile + cache ← handler cached by method name
│ └─ ReturnValueValidator ← 500 on return shape mismatch
└─ ICentralAuditWriter.WriteAsync ← fire-and-forget from middleware finally
```
The filter is applied at registration time via `.AddEndpointFilter<InboundApiEndpointFilter>()` in `EndpointExtensions.MapInboundAPI`; it runs before the handler so a standby node or an oversized body never reaches auth or script execution.
### Script compilation and handler cache
`InboundScriptExecutor` is a singleton holding two `ConcurrentDictionary` instances:
- `_scriptHandlers` — maps method name to a compiled `Func<InboundScriptContext, Task<object?>>`.
- `_knownBadMethods` — records methods whose scripts have failed to compile, capped at 1 000 entries, so a bad script is compiled at most once per startup and a flood of unique bogus names cannot grow the cache without bound.
The compilation path in `CompileAndRegister`:
```csharp
public bool CompileAndRegister(ApiMethod method)
{
var handler = Compile(method);
if (handler == null)
{
TryRecordBadMethod(method.Name);
return false;
}
_knownBadMethods.TryRemove(method.Name, out _);
return Register(method.Name, handler);
}
```
`Compile` runs `ForbiddenApiChecker.FindViolations` first — a Roslyn syntax-tree walk that rejects forbidden namespace references (`System.IO`, `System.Diagnostics`, `System.Threading` except `Tasks`, `System.Reflection`, `System.Net`, `System.Runtime.InteropServices`, `Microsoft.Win32`) and reflection-gateway member names (`GetType`, `Assembly`, `GetMethod`, `CreateInstance`, `InvokeMember`, and others). Scripts containing `dynamic` or `Activator` are also rejected. This is defence-in-depth, not a true sandbox.
If a method is invoked before it has been compiled — for example a method created after startup — `ExecuteAsync` performs a lazy compile on first call, then stores the handler via `GetOrAdd` so concurrent first callers share one delegate.
Scripts are compiled with a restricted reference set (`mscorlib`, `System.Linq`, `System.Collections.Generic`, `RouteHelper`'s assembly, `ScriptParameters`'s assembly, and the C# runtime binder) and with imports for `System`, `System.Collections.Generic`, `System.Linq`, and `System.Threading.Tasks`. The `globalsType` is `InboundScriptContext`.
### Script context and the `Route` surface
`InboundScriptContext` is the Roslyn globals object injected into every running script:
```csharp
public class InboundScriptContext
{
public ScriptParameters Parameters { get; }
public RouteHelper Route { get; }
public CancellationToken CancellationToken { get; }
}
```
`Parameters` wraps the validated, type-coerced values. `Parameters["key"]` gives raw `object?` access; `Parameters.Get<T>("key")` adds typed conversion with clear error messages (`ScriptParameterException`). `Route` is a scoped `RouteHelper` already bound to the method-level deadline token and to the inbound request's `ParentExecutionId`.
`RouteHelper.To(instanceCode)` returns a `RouteTarget` that exposes five operations:
| Method | Description |
|--------|-------------|
| `Call(scriptName, parameters?)` | Invoke a script on the instance; returns the script's return value. |
| `GetAttribute(name)` | Read one attribute value. |
| `GetAttributes(names)` | Batch-read; returns `IReadOnlyDictionary<string, object?>`. |
| `SetAttribute(name, value)` | Write one attribute value. |
| `SetAttributes(dict)` | Batch-write. |
All five operations are synchronous from the script's perspective (the central node blocks until the site responds or the method timeout fires). There is no store-and-forward — a site-unreachable or timed-out routed call throws `InvalidOperationException` back to the script, which surfaces as a `500` to the caller.
`RouteTarget.Call` builds a `RouteToCallRequest` carrying `ParentExecutionId` so the spawned site script execution records the inbound request as its parent in the audit tree:
```csharp
var request = new RouteToCallRequest(
correlationId, _instanceCode, scriptName, ScriptArgs.Normalize(parameters),
DateTimeOffset.UtcNow, _parentExecutionId);
var response = await _instanceRouter.RouteToCallAsync(siteId, request, token);
```
`IInstanceRouter` is the seam over `CommunicationService`; in production, `CommunicationServiceInstanceRouter` delegates every call directly to `CommunicationService.RouteToCallAsync / RouteToGetAttributesAsync / RouteToSetAttributesAsync`.
### Active-node gating
`IActiveNodeGate.IsActiveNode` is the seam the Host implements using Akka cluster state. When `false`, `InboundApiEndpointFilter` returns `503` before any auth or script logic runs. When no implementation is registered — non-clustered hosts, tests — the endpoint is served, preserving prior behaviour.
### Audit integration
`AuditWriteMiddleware` sits in the pipeline above the endpoint filter and handler. At the start of every request it:
1. Mints a fresh `Guid` as the request's `ExecutionId` and stashes it on `HttpContext.Items[InboundExecutionIdItemKey]`.
2. Calls `HttpRequest.EnableBuffering()` (for POST/PUT/PATCH requests only) and reads up to `AuditLogOptions.InboundMaxBytes` bytes of the request body into a bounded audit copy, then rewinds the stream to position 0 so the downstream handler sees the full payload.
3. Wraps `HttpResponse.Body` in `CapturedResponseStream`, which mirrors every write to the real sink while capturing up to `InboundMaxBytes` bytes for the audit copy.
In the `finally` block, the middleware calls `ICentralAuditWriter.WriteAsync` (fire-and-forget with fault observation) to emit one `AuditChannel.ApiInbound` row. The row's `AuditKind` is `InboundAuthFailure` for `401`/`403` and `InboundRequest` for all other outcomes. Status is `Delivered` for 2xx and `Failed` for 4xx/5xx or a handler exception. `Actor` is the resolved API key display name (stashed by the endpoint handler on `HttpContext.Items[AuditActorItemKey]` after successful auth); it is forced null for auth failures so the middleware never echoes an unauthenticated principal. The audit row's `ExecutionId` is the same `Guid` minted in step 1.
The endpoint handler reads that same `ExecutionId` from `HttpContext.Items` and threads it into `InboundScriptExecutor.ExecuteAsync` as `parentExecutionId`, which in turn passes it to `RouteHelper.WithParentExecutionId`. Any `Route.To().Call()` inside the script carries it as `RouteToCallRequest.ParentExecutionId`, so the spawned site script execution is linked back to this inbound request in the audit tree.
Audit emission is best-effort. A write failure is caught, logged at `Warning`, and dropped. It never alters the HTTP response.
## Usage
### HTTP contract
```http
POST /api/{methodName}
Authorization: Bearer sbk_<keyId>_<secret>
Content-Type: application/json
{
"siteId": "SiteA",
"startDate": "2026-03-01",
"endDate": "2026-03-16"
}
```
Success response (`200`):
```json
{
"siteName": "Site Alpha",
"totalUnits": 14250,
"lines": [
{ "lineName": "Line-1", "units": 8200, "efficiency": 92.5 }
]
}
```
Error responses:
| Status | Condition |
|--------|-----------|
| `401` | Missing, malformed, unknown, revoked, or secret-mismatched token. |
| `403` | Valid key, but not in scope for this method; or method not found. |
| `400` | Missing required parameters, wrong types, or unexpected fields. |
| `413` | Request body exceeds `MaxRequestBodyBytes`. |
| `500` | Script execution error, compilation failure, or return-shape mismatch. |
| `503` | Request reached a standby node. |
The `403` body is identical whether the method does not exist or the key lacks scope, so a caller holding a valid key cannot enumerate method names by observing status differences.
### Writing a method script
A method script runs as a Roslyn C# script with `InboundScriptContext` as globals. The script has access to `Parameters`, `Route`, and `CancellationToken`.
```csharp
// Example: read a parameter, call a site script, return a result
var siteId = Parameters.Get<string>("siteId");
var result = await Route.To(siteId).Call("GetProductionSummary", new { date = Parameters.Get<string>("date") });
return result;
```
The `Route.To().Call()` invocation inherits the method-level timeout automatically. A script that needs a tighter per-call bound may pass an explicit `CancellationToken`. Scripts may not access `System.IO`, the entire `System.Diagnostics` namespace (including `Process`), `System.Threading` (except `Tasks`), `System.Reflection`, `System.Net`, or reflection-gateway members — violations are rejected statically at compile time.
### Startup compilation and hot-reload
At startup the Host loads all `ApiMethod` rows from the configuration database and calls `CompileAndRegister` on each. After a method is updated via the Management API or CLI, the Management Service calls `CompileAndRegister` again — the updated script takes effect on the next request, with no node restart. Methods created after startup compile lazily on first invocation. Scripts modified directly in the database do not take effect until the next node restart; always use the Management API, CLI, or Central UI.
## Configuration
Options class: `InboundApiOptions`, bound from the `ScadaBridge:InboundApi` section.
| Key | Default | Description |
|-----|---------|-------------|
| `DefaultMethodTimeout` | `00:00:30` | Execution timeout applied when `ApiMethod.TimeoutSeconds` is zero or not set. |
| `MaxRequestBodyBytes` | `1048576` (1 MiB) | Body size cap enforced by `InboundApiEndpointFilter` before the body is parsed. Requests whose `Content-Length` exceeds this return `413`; chunked requests are cut off by Kestrel as they stream in. |
| `ApiKeyPepper` | _(required)_ | Server-side HMAC pepper for bearer credentials. Consumed by the shared `IApiKeyVerifier`; must be a strong, random value (`≥ 16` characters), different per environment, supplied via a secret store. |
The inbound body-capture cap for audit is configured separately under `AuditLog:InboundMaxBytes` (default 1 MiB; range `[8192, 16777216]`). It governs only the audit copy — the downstream handler always sees the full body.
## Dependencies & Interactions
- [Commons (#16)](./Commons.md) — owns `ApiMethod`, `ParameterDefinition`, `ScriptParameters`, `ScriptParameterException`, the `RouteToCall*` / `RouteToGetAttributes*` / `RouteToSetAttributes*` message records, `IInboundApiRepository`, and `IInstanceLocator`. Also owns `ICentralAuditWriter` (via `ZB.MOM.WW.Audit`), `AuditChannel`, `AuditKind`, `AuditStatus`, and `ScadaBridgeAuditEventFactory`.
- [Configuration Database (#17)](./ConfigurationDatabase.md) — provides the `IInboundApiRepository` implementation (`GetMethodByNameAsync`, `GetAllApiMethodsAsync`, CRUD). Method definitions persist in the central MS SQL configuration database.
- [Central–Site Communication (#5)](./Communication.md) — `CommunicationServiceInstanceRouter` delegates every `Route.To()` operation to `CommunicationService`. The routed call travels from the central `CentralCommunicationActor` to the target site via `ClusterClient` and reaches the target `InstanceActor`; a `ScriptExecutionActor` then executes the named script. The return value flows back synchronously.
- [Audit Log (#23)](./AuditLog.md) — `AuditWriteMiddleware` resolves `ICentralAuditWriter` to emit the `ApiInbound` row via the central direct-write path. The inbound request is the parent execution for any site script it spawns: the middleware's `ExecutionId` becomes `RouteToCallRequest.ParentExecutionId` on every routed `Call`. Cross-link: `AuditWriteMiddleware.InboundExecutionIdItemKey` / `AuditWriteMiddleware.AuditActorItemKey` are the `HttpContext.Items` keys that tie the endpoint handler and middleware together.
- [Security (#10)](./Security.md) — API key verification (`IApiKeyVerifier`, `AddZbApiKeyAuth`) is registered by the Host. The inbound API uses a dedicated key scheme independent of LDAP/AD session auth.
- [Cluster Infrastructure (#13)](./ClusterInfrastructure.md) — `IActiveNodeGate` (interface in this project; implementation in the Host) gates the endpoint to the active central node. A standby returns `503` without running any script logic.
- [Health Monitoring (#11)](./HealthMonitoring.md) — `ScadaBridgeTelemetry.RecordInboundApiRequest(methodName)` is called on every request (after auth failures are classified); `CentralAuditWriteFailures` surfaces on the central health snapshot when an audit write fails.
- Design spec: [Component-InboundAPI.md](../requirements/Component-InboundAPI.md).
## Troubleshooting
### A method always returns 500 after a script update
The in-memory handler cache still holds the previous compiled delegate. If the update went through the Management API or CLI, `CompileAndRegister` should have been called automatically and the new script should be active on the next request. If the script was edited directly in the database, the cached delegate is stale until the next node restart. Check the `ScadaBridge.InboundAPI` log category for a `"script compilation failed"` or `"trust model violation"` warning to distinguish a compile error from a routing failure.
### A method is stuck in the known-bad-methods cache
If a previously broken script is fixed but `ExecuteAsync` still returns `"Script compilation failed for this method"`, the method name is in `_knownBadMethods`. `CompileAndRegister` clears the bad-method entry on a successful compile; calling it (via the Management API or CLI `api-method update`) after the fix is applied resets the cache and makes the corrected script active immediately.
### Routed calls time out but the site is reachable
The method-level timeout covers the entire execution including `Route.To().Call()`. A slow site script, a large return value, or network latency can consume the budget. `TimeoutSeconds` on the `ApiMethod` entity controls the cap per method; `DefaultMethodTimeout` in `InboundApiOptions` applies when `TimeoutSeconds` is zero. Increase `TimeoutSeconds` for long-running methods. Separately, a `503` from the `/health/active` endpoint on the site side indicates a site failover mid-call.
### Audit rows missing for inbound requests
`AuditWriteMiddleware` emits on a fire-and-forget `Task`; a write failure is caught and logged at `Warning` under `AuditWriteMiddleware`. `CentralAuditWriteFailures` increments on the central health snapshot. The request itself still returns its normal HTTP response — a missing audit row never means the call failed.
## Related Documentation
- [Inbound API design specification](../requirements/Component-InboundAPI.md)
- [Audit Log](./AuditLog.md)
- [Commons](./Commons.md)
- [Configuration Database](./ConfigurationDatabase.md)
- [Central–Site Communication](./Communication.md)
- [Security](./Security.md)
- [Cluster Infrastructure](./ClusterInfrastructure.md)
- [External System Gateway](./ExternalSystemGateway.md)
- [Site Runtime](./SiteRuntime.md)
+240
View File
@@ -0,0 +1,240 @@
# Management Service
The Management Service is the Akka.NET actor that provides programmatic access to every admin operation on the central cluster — the same operations the Central UI exposes, made available over an HTTP API and, optionally, a `ClusterClient` path for cross-cluster callers.
## Overview
Management Service (#18) runs on the central cluster only. The component code lives in `src/ZB.MOM.WW.ScadaBridge.ManagementService/`, with four source files:
- `ManagementActor.cs` — the `ReceiveActor` that owns authorization, dispatch, and error mapping for all commands.
- `ManagementEndpoints.cs` — the `POST /management` minimal-API endpoint that authenticates over HTTP Basic Auth and forwards to the actor.
- `AuditEndpoints.cs` — dedicated REST endpoints (`GET /api/audit/query`, `GET /api/audit/export`) for the centralized Audit Log (#23); these bypass the actor because the workload is read-only and keyset-paged.
- `DebugStreamHub.cs` — a SignalR hub for real-time debug stream subscriptions (attribute and alarm state changes).
`ServiceCollectionExtensions.AddManagementService` registers `ManagementActorHolder` (a DI singleton that holds the live `IActorRef`) and binds `ManagementServiceOptions` from `ScadaBridge:ManagementService`.
The `ManagementActor` is not a cluster singleton. Because it is completely stateless — it opens a new DI scope per command and delegates all work to repositories and domain services — every central node runs its own instance. Either node can serve any request independently, so no singleton coordination is needed.
## Key Concepts
### `ManagementEnvelope` and the wire protocol
Every command arrives wrapped in a `ManagementEnvelope`:
```csharp
public record AuthenticatedUser(
string Username, string DisplayName,
string[] Roles, string[] PermittedSiteIds);
public record ManagementEnvelope(AuthenticatedUser User, object Command, string CorrelationId);
```
The HTTP endpoint constructs the envelope after LDAP authentication and role resolution; the `CorrelationId` (a `Guid` formatted as `"N"`) ties server-log entries to the caller's request. The actor never authenticates a second time — the envelope carries an already-resolved `AuthenticatedUser`.
### Role enforcement and site scope
Authorization is a two-level check. `GetRequiredRole` maps each command type to the minimum role required:
| Role | Commands |
|------|----------|
| `Administrator` | Site management, role mappings, API key management, scope rules, `QueryAuditLogCommand`, `PreviewBundle`, `ImportBundle` |
| `Designer` | Template authoring (members, folders, compositions), external systems, data connections, notification lists, shared scripts, database connections, inbound API methods, `ExportBundle` |
| `Deployer` | Instance lifecycle, connection bindings, overrides, deployments, debug snapshot, `RetryParkedMessageCommand`, `DiscardParkedMessageCommand` |
| _(any authenticated user)_ | Read-only list/get queries, health summary |
Within `Deployer` commands, `EnforceSiteScope` applies a second check: users whose role mapping carries `PermittedSiteIds` can only touch instances and sites within their permitted set. Administrators and system-wide deployers (empty `PermittedSiteIds`) are unrestricted. A violation throws `SiteScopeViolationException`, which `MapFault` converts to `ManagementUnauthorized`.
### Command registry
`ManagementCommandRegistry` (in Commons) maps wire names to CLR types via reflection at startup. It scans the `ZB.MOM.WW.ScadaBridge.Commons.Messages.Management` namespace for non-abstract types whose name ends in `"Command"` and stores them in a `FrozenDictionary`. The HTTP endpoint calls `ManagementCommandRegistry.Resolve(commandName)` to get the target type, then deserializes the `payload` JSON into it.
### Audit contract
Mutating handlers that call repositories directly, rather than delegating to a domain service, invoke `AuditAsync` (backed by `IAuditService`) after a successful write. Most handlers that delegate to a domain service — `TemplateService`, `DeploymentService`, `ArtifactDeploymentService`, `TemplateFolderService`, `SharedScriptService` — do not call `AuditAsync`; those services audit internally, avoiding double-logging. However, some delegating handlers also call `AuditAsync` directly: for example, `HandleCreateInstance` delegates to `InstanceService.CreateInstanceAsync` and then calls `AuditAsync` itself. SMTP configuration and API key responses project out secrets before the audit entry is written.
## Architecture
### Actor lifecycle and registration
`AkkaHostedService` (in the Host) creates the `ManagementActor` under the path `/user/management` and registers it with `ClusterClientReceptionist`:
```csharp
var mgmtActor = _actorSystem!.ActorOf(
Props.Create(() => new ManagementActor(_serviceProvider, mgmtLogger)),
"management");
ClusterClientReceptionist.Get(_actorSystem).RegisterService(mgmtActor);
var mgmtHolder = _serviceProvider.GetRequiredService<ManagementActorHolder>();
mgmtHolder.ActorRef = mgmtActor;
```
`ClusterClientReceptionist` advertises the actor to `ClusterClient` senders without requiring them to join the Akka cluster. The `ManagementActorHolder.ActorRef` property is then the bridge from the HTTP endpoint (which runs in ASP.NET Core middleware) into the Akka actor world.
The actor declares an explicit supervisor strategy — one-for-one with Resume and no retry limit — to match the coordinator-actor convention and remain correct if child actors are added later.
### HTTP Management API (`POST /management`)
`ManagementEndpoints.MapManagementAPI` registers the endpoint. Each request goes through six steps:
1. Raise the per-request body size cap to 200 MB (needed for Transport bundle imports).
2. Decode `Authorization: Basic <base64>` and split username/password.
3. Authenticate via `ILdapAuthService`.
4. Resolve roles via `RoleMapper`, building the `AuthenticatedUser` with any site-scope limits.
5. Deserialize the JSON body (`command` + `payload`) via `ManagementCommandRegistry`.
6. `Ask` the `ManagementActor` with a `ManagementEnvelope` and map the response:
```csharp
return response switch
{
ManagementSuccess success => Results.Text(success.JsonData, "application/json", statusCode: 200),
ManagementError error => Results.Json(new { error = error.Error, code = error.ErrorCode }, statusCode: 400),
ManagementUnauthorized u => Results.Json(new { error = u.Message, code = "UNAUTHORIZED" }, statusCode: 403),
_ => Results.Json(new { error = "Unexpected response.", code = "INTERNAL_ERROR" }, statusCode: 500)
};
```
The `Ask` timeout defaults to 30 seconds and is overridable via `ScadaBridge:ManagementService:CommandTimeout`. An elapsed timeout returns HTTP 504.
### Actor dispatch and error mapping
`ManagementActor.HandleEnvelope` checks the required role, then calls `ProcessCommand`, which opens a DI scope, runs `DispatchCommand`, and wraps the result in `ManagementSuccess`. The `PipeTo` pattern keeps the actor's message loop free during async work; the failure continuation maps exceptions to `ManagementError` or `ManagementUnauthorized`:
```csharp
private void HandleEnvelope(ManagementEnvelope envelope)
{
var sender = Sender;
var correlationId = envelope.CorrelationId;
var user = envelope.User;
var requiredRole = GetRequiredRole(envelope.Command);
if (requiredRole != null && !user.Roles.Contains(requiredRole, StringComparer.OrdinalIgnoreCase))
{
sender.Tell(new ManagementUnauthorized(correlationId,
$"Role '{requiredRole}' required for {envelope.Command.GetType().Name}"));
return;
}
ProcessCommand(envelope, user)
.PipeTo(sender,
success: result => result,
failure: ex => MapFault(ex, correlationId, envelope.Command));
}
```
`ManagementCommandException` carries a message safe to surface to callers. Any other exception is an unanticipated fault; only the correlation ID is returned so internal detail (server names, constraint names) is not disclosed.
### Audit REST API (`/api/audit/*`)
`AuditEndpoints.MapAuditAPI` registers two GET endpoints that go directly to `IAuditLogRepository`, bypassing the actor:
- `GET /api/audit/query` — keyset-paged JSON result. Requires `OperationalAudit` permission (Admin / Audit / AuditReadOnly roles). Accepts `channel`, `kind`, `status`, `sourceSiteId`, `correlationId`, `executionId`, `parentExecutionId`, `fromUtc`, `toUtc`, `pageSize`, and cursor params `afterOccurredAtUtc`/`afterEventId`. Returns `{ events, nextCursor }` where `nextCursor` is explicit `null` on the last page.
- `GET /api/audit/export` — server-side streaming export (CSV or JSONL) of all matching rows, paging the repository internally at 1 000 rows per batch and flushing after each batch. Requires `AuditExport` permission (Admin / Audit roles). `format=parquet` returns HTTP 501 (deferred).
Both endpoints apply the same HTTP Basic Auth / LDAP / role flow as `/management`. Site-scoped callers have their `sourceSiteId` filter intersected with their `PermittedSiteIds`; an explicit out-of-scope filter returns HTTP 403 rather than silently empty results.
### Debug stream (`/debug-stream`)
`DebugStreamHub` is a SignalR hub registered alongside the management endpoints. It authenticates on `OnConnectedAsync` (same Basic Auth / LDAP / role flow), requires the `Deployer` role, and enforces per-instance site scope on `SubscribeInstance`. Accepted connections receive an initial `DebugViewSnapshot` followed by incremental `AttributeValueChanged` and `AlarmStateChanged` events pushed from `DebugStreamService`.
## Usage
### Sending a command from the CLI
The CLI sends a single `POST /management` with JSON body and Basic Auth; it does not use `ClusterClient` directly. A typical request:
```http
POST /management
Authorization: Basic base64(username:password)
Content-Type: application/json
{
"command": "ListSites",
"payload": {}
}
```
A successful response is HTTP 200 with the JSON result. An authorization failure is HTTP 403 with `{ "error": "...", "code": "UNAUTHORIZED" }`.
### Sending a command via ClusterClient
The `ManagementActor` is also reachable from any `ClusterClient` that has a contact point into the central cluster. The actor is registered with the `ClusterClientReceptionist` (`/system/receptionist`) under the path `/user/management`. Callers construct and `Tell` a `ManagementEnvelope` and expect one of `ManagementSuccess`, `ManagementError`, or `ManagementUnauthorized` in reply.
## Command Groups
`DispatchCommand` in `ManagementActor.cs` is the canonical enumeration of every supported command. The table below organizes them by domain area.
| Group | Commands | Minimum role |
|-------|----------|--------------|
| Templates | `ListTemplates`, `GetTemplate`, `CreateTemplate`, `UpdateTemplate`, `DeleteTemplate`, `ValidateTemplate` | Designer (mutations) |
| Template members | `AddTemplateAttribute`, `UpdateTemplateAttribute`, `DeleteTemplateAttribute`, `AddTemplateAlarm`, `UpdateTemplateAlarm`, `DeleteTemplateAlarm`, `AddTemplateNativeAlarmSource`, `UpdateTemplateNativeAlarmSource`, `DeleteTemplateNativeAlarmSource`, `ListTemplateNativeAlarmSources`, `AddTemplateScript`, `UpdateTemplateScript`, `DeleteTemplateScript`, `AddTemplateComposition`, `DeleteTemplateComposition` | Designer (mutations) |
| Template folders | `ListTemplateFolders`, `CreateTemplateFolder`, `RenameTemplateFolder`, `MoveTemplateFolder`, `DeleteTemplateFolder`, `MoveTemplateToFolder` | Designer (mutations) |
| Instances | `ListInstances`, `GetInstance`, `CreateInstance`, `MgmtDeployInstance`, `MgmtEnableInstance`, `MgmtDisableInstance`, `MgmtDeleteInstance`, `SetConnectionBindings`, `SetInstanceOverrides`, `SetInstanceArea`, `SetInstanceAlarmOverride`, `DeleteInstanceAlarmOverride`, `ListInstanceAlarmOverrides`, `SetInstanceNativeAlarmSourceOverride`, `DeleteInstanceNativeAlarmSourceOverride`, `ListInstanceNativeAlarmSourceOverrides` | Deployer (mutations) |
| Sites & areas | `ListSites`, `GetSite`, `CreateSite`, `UpdateSite`, `DeleteSite`, `ListAreas`, `CreateArea`, `UpdateArea`, `DeleteArea` | Administrator (site mutations); Designer (`CreateArea`, `UpdateArea`, `DeleteArea`) |
| Data connections | `ListDataConnections`, `GetDataConnection`, `CreateDataConnection`, `UpdateDataConnection`, `DeleteDataConnection` | Designer (mutations) |
| External systems | `ListExternalSystems`, `GetExternalSystem`, `CreateExternalSystem`, `UpdateExternalSystem`, `DeleteExternalSystem`, `ListExternalSystemMethods`, `GetExternalSystemMethod`, `CreateExternalSystemMethod`, `UpdateExternalSystemMethod`, `DeleteExternalSystemMethod` | Designer (mutations) |
| Notification lists / SMTP | `ListNotificationLists`, `GetNotificationList`, `CreateNotificationList`, `UpdateNotificationList`, `DeleteNotificationList`, `ListSmtpConfigs`, `UpdateSmtpConfig` | Designer (mutations) |
| Shared scripts | `ListSharedScripts`, `GetSharedScript`, `CreateSharedScript`, `UpdateSharedScript`, `DeleteSharedScript` | Designer (mutations) |
| Database connections | `ListDatabaseConnections`, `GetDatabaseConnection`, `CreateDatabaseConnectionDef`, `UpdateDatabaseConnectionDef`, `DeleteDatabaseConnectionDef` | Designer (mutations) |
| Inbound API methods | `ListApiMethods`, `GetApiMethod`, `CreateApiMethod`, `UpdateApiMethod`, `DeleteApiMethod` | Designer (mutations) |
| Security | `ListRoleMappings`, `CreateRoleMapping`, `UpdateRoleMapping`, `DeleteRoleMapping`, `ListApiKeys`, `CreateApiKey`, `UpdateApiKey`, `DeleteApiKey`, `SetApiKeyMethods`, `ListScopeRules`, `AddScopeRule`, `DeleteScopeRule` | Administrator |
| Deployments | `MgmtDeployArtifacts`, `QueryDeployments`, `GetDeploymentDiff` | Deployer |
| Health | `GetHealthSummary`, `GetSiteHealth` | Any authenticated user |
| Remote queries | `QueryEventLogsCommand`, `QueryParkedMessagesCommand` (any authenticated user); `RetryParkedMessageCommand`, `DiscardParkedMessageCommand`, `DebugSnapshotCommand` (Deployer) | Varies |
| Audit (legacy) | `QueryAuditLog` | Administrator |
| Transport | `ExportBundle` (Designer), `PreviewBundle`, `ImportBundle` (Administrator) | Varies |
`ValidateTemplate` builds a `FlattenedConfiguration` from the template's attributes, alarms, and scripts, runs the full `ValidationService` pipeline (collision detection, script compilation, trigger reference checks), and merges in naming-collision errors from `TemplateService.DetectCollisionsAsync` — all without a deployment.
`SetInstanceOverrides` validates every attribute name and lock status against the template before applying any write, making the batch all-or-nothing at the validation layer.
## Configuration
| Section | Key | Default | Description |
|---------|-----|---------|-------------|
| `ScadaBridge:ManagementService` | `CommandTimeout` | `00:00:30` | `Ask` timeout the `ManagementEndpoints` applies when forwarding to the `ManagementActor`. A non-positive value falls back to the 30-second default. |
The 200 MB per-request body cap (`ManagementEndpoints.MaxManagementRequestBodyBytes`) is hard-coded; it exists to accommodate Transport (#24) Import calls where a 100 MB raw bundle base64-inflates to roughly 140 MB plus the envelope overhead.
## Dependencies & Interactions
- [Commons (#16)](./Commons.md) — owns the message contracts (`Messages/Management/`), `ManagementEnvelope`, `ManagementCommandRegistry`, `AuthenticatedUser`, and `ManagementSuccess`/`ManagementError`/`ManagementUnauthorized` response types.
- [Configuration Database (#17)](./ConfigurationDatabase.md) — every repository (`ITemplateEngineRepository`, `ISiteRepository`, `IExternalSystemRepository`, `INotificationRepository`, `ISecurityRepository`, `IInboundApiRepository`, `IDeploymentManagerRepository`, `ICentralUiRepository`) and `IAuditService` are backed by EF Core against the central MS SQL database. Management Service resolves them per-command through scoped DI.
- [Template Engine (#1)](./TemplateEngine.md) — `TemplateService`, `TemplateFolderService`, `SharedScriptService`, and the `ValidationService` handle template authoring and validation. Management Service is the sole entry point for template mutations from outside the Central UI.
- [Deployment Manager (#2)](./DeploymentManager.md) — `DeploymentService` and `ArtifactDeploymentService` own the deployment pipeline. `MgmtDeployInstance` and `MgmtDeployArtifacts` delegate here.
- [CentralSite Communication (#5)](./Communication.md) — `CommunicationService` routes `QueryEventLogsCommand`, `QueryParkedMessagesCommand`, `RetryParkedMessageCommand`, `DiscardParkedMessageCommand`, and `DebugSnapshotCommand` to site actors via `ClusterClient`. Deployment commands also flow through the communication layer.
- [Security & Auth (#10)](./Security.md) — `ILdapAuthService` and `RoleMapper` authenticate and map roles on every HTTP request; the `Roles` constants and `IInboundApiKeyAdmin` are also consumed here.
- [Health Monitoring (#11)](./HealthMonitoring.md) — `ICentralHealthAggregator` answers `GetHealthSummary` and `GetSiteHealth` queries synchronously from its in-memory state.
- [Audit Log (#23)](./AuditLog.md) — `AuditEndpoints` reads the central `AuditLog` table via `IAuditLogRepository` directly (no actor hop). `QueryAuditLogCommand` through `/management` is a legacy path for the configuration-change audit via `ICentralUiRepository`.
- [CLI (#19)](./CLI.md) — the primary consumer of `POST /management` and the `/api/audit/*` endpoints. Constructs `ManagementEnvelope`-shaped JSON, sends Basic Auth, and deserializes the response.
- [Host (#15)](./Host.md) — `AkkaHostedService` creates the `ManagementActor`, registers it with `ClusterClientReceptionist`, and sets `ManagementActorHolder.ActorRef` so the HTTP endpoint can reach it.
- Design spec: [Component-ManagementService.md](../requirements/Component-ManagementService.md).
## Troubleshooting
### Actor not ready (HTTP 503)
If `POST /management` returns `503 SERVICE_UNAVAILABLE`, `ManagementActorHolder.ActorRef` is null — the actor system has not finished starting. This resolves itself once `AkkaHostedService.StartAsync` completes. The `/health/ready` endpoint is the gating signal; traffic should not reach `/management` before it returns 200.
### Command timeout (HTTP 504)
A 504 response means the `Ask` to `ManagementActor` did not return within the configured `CommandTimeout`. The server log entry includes the `CorrelationId` from the response body. Common causes: a long-running deployment waiting on a site that is offline, or a database query against a cold EF Core connection. Increasing `ScadaBridge:ManagementService:CommandTimeout` buys time while the root cause is investigated.
### Unexpected internal error
Any exception that is not a `ManagementCommandException` or `SiteScopeViolationException` maps to a generic `COMMAND_FAILED` error with the correlation ID. The server log at `Error` level will contain the full exception, keyed by `CorrelationId`. `ManagementCommandException` messages are intentionally surfaced verbatim; all other exception messages are suppressed on the wire to avoid leaking internal detail.
### Audit log export stalls mid-stream
`GET /api/audit/export` streams rows in pages of 1 000 and flushes after each page. If the response body stops arriving, check whether a proxy is buffering the response (the endpoint sets `Cache-Control: no-store` to defeat most buffers). The `pageSize` parameter on `/api/audit/query` caps at 1 000; requests above that are silently clamped.
## Related Documentation
- [Management Service design specification](../requirements/Component-ManagementService.md)
- [CLI](./CLI.md)
- [CentralSite Communication](./Communication.md)
- [Commons](./Commons.md)
- [Configuration Database](./ConfigurationDatabase.md)
- [Template Engine](./TemplateEngine.md)
- [Deployment Manager](./DeploymentManager.md)
- [Security](./Security.md)
- [Audit Log](./AuditLog.md)
- [Host](./Host.md)
+260
View File
@@ -0,0 +1,260 @@
# Notification Outbox
The Notification Outbox is the central component that receives store-and-forwarded notifications from site clusters, persists each one to the `Notifications` table in the central MS SQL database, and delivers them through per-type delivery adapters. It is the first outbox component to run centrally — the Store-and-Forward Engine remains site-only.
## Overview
Notification Outbox (#21) runs exclusively on the central cluster. Sites no longer send notifications directly via SMTP: a script's `Notify.Send` call generates a `NotificationId` (GUID) locally, the notification is stored in the site S&F buffer, forwarded to central via CentralSite Communication, and the central outbox owns all dispatch and delivery from that point on.
The component code lives in `src/ZB.MOM.WW.ScadaBridge.NotificationOutbox/`, with a flat layout:
- Root — `NotificationOutboxActor`, `NotificationOutboxOptions`, `ServiceCollectionExtensions`.
- `Delivery/` — `INotificationDeliveryAdapter`, `DeliveryOutcome`/`DeliveryResult`, `EmailNotificationDeliveryAdapter`.
- `Messages/` — `InternalMessages` (actor-internal timer and pipe messages, never sent over the network).
Shared types used throughout — `Notification`, `NotificationStatus`, `NotificationType`, `INotificationOutboxRepository`, and all ClusterClient message contracts — live in `src/ZB.MOM.WW.ScadaBridge.Commons/`.
The DI entry point is `ServiceCollectionExtensions.AddNotificationOutbox`, called by the Host on central nodes. It binds `NotificationOutboxOptions` and registers the typed delivery adapters. The Host separately registers `NotificationOutboxActor` as a cluster singleton and wires the ClusterClient receptionist so inbound `NotificationSubmit` messages reach the actor regardless of which central node is active.
## Key Concepts
### `Notifications` table as source of truth
The central MS SQL `Notifications` table is the single source of audit truth for every notification in the system. One row per `NotificationId` (GUID primary key), regardless of delivery channel. The table is type-agnostic: the `Type` discriminator (`Email`; future channels add new enum members) selects the delivery adapter while the rest of the schema is shared. The `Notifications` table answers both operational queries (current status, retry count, next attempt) and KPI queries (queue depth, stuck count, throughput); no separate time-series store is needed.
### Status lifecycle
| Status | Where it lives | Meaning |
|---|---|---|
| `Forwarding` | Site-local only | Notification is in the site S&F buffer; never stored in `Notifications`. |
| `Pending` | Central | Ingested; awaiting first dispatch sweep. |
| `Retrying` | Central | Transient failure; `NextAttemptAt` schedules the next attempt. |
| `Delivered` | Central, terminal | Successfully sent; `DeliveredAt` and `ResolvedTargets` are set. |
| `Parked` | Central, terminal | Permanent failure or retries exhausted; `LastError` records why. |
| `Discarded` | Central, terminal | Operator-cancelled a `Parked` notification; row is kept for the audit record. |
`Forwarding` is answered site-locally by `Notify.Status(id)`; once the ack arrives, queries round-trip to the `Notifications` table on central.
### At-least-once site→central handoff
Central ingests a `NotificationSubmit` with `insert-if-not-exists` on `NotificationId`, then acks the site with `NotificationSubmitAck`. The site S&F engine clears the message only after receiving that ack. A lost ack causes the site to resend; the GUID idempotency key makes the resend a no-op. Because the ack is sent only after the row is persisted, no notification is lost to a race between the ack and a central failover — the row already exists.
### No Akka-level replication
All outbox state lives in MS SQL, which is already the central HA store. There is no Akka replication of the actor's in-memory state. On a central failover the new active node's singleton starts a fresh dispatch sweep; `Pending` and due `Retrying` rows are reclaimed from the table on the next tick.
## Architecture
### `NotificationOutboxActor`
`NotificationOutboxActor` is a `ReceiveActor` that implements `IWithTimers`. It runs as a cluster singleton on the active central node. The actor is responsible for both the ingest path (accepting `NotificationSubmit` messages) and the dispatch path (running the periodic delivery loop). All async work is executed via `PipeTo(Self)` so every result lands on the actor's mailbox thread, preserving single-threaded actor semantics:
```csharp
public class NotificationOutboxActor : ReceiveActor, IWithTimers
{
public NotificationOutboxActor(
IServiceProvider serviceProvider,
NotificationOutboxOptions options,
ICentralAuditWriter auditWriter,
ILogger<NotificationOutboxActor> logger)
{
Receive<NotificationSubmit>(HandleSubmit);
Receive<InternalMessages.IngestPersisted>(HandleIngestPersisted);
Receive<InternalMessages.DispatchTick>(_ => HandleDispatchTick());
Receive<InternalMessages.DispatchComplete>(_ => _dispatching = false);
Receive<InternalMessages.PurgeTick>(_ => HandlePurgeTick());
Receive<InternalMessages.PurgeComplete>(_ => { });
Receive<NotificationOutboxQueryRequest>(HandleQuery);
Receive<NotificationStatusQuery>(HandleStatusQuery);
Receive<NotificationDetailRequest>(HandleDetailRequest);
Receive<RetryNotificationRequest>(HandleRetry);
Receive<DiscardNotificationRequest>(HandleDiscard);
Receive<NotificationKpiRequest>(HandleKpiRequest);
Receive<PerSiteNotificationKpiRequest>(HandlePerSiteKpiRequest);
}
}
```
`PreStart` starts two periodic Akka timers: `DispatchTick` at `DispatchInterval` and `PurgeTick` at `PurgeInterval`. A lifecycle-scoped `CancellationTokenSource` (`_shutdownCts`) is created in `PreStart` and cancelled in `PostStop` so any in-flight SMTP send observes coordinated shutdown instead of blocking for a full connect/auth/send timeout.
### Ingest path
`HandleSubmit` maps a `NotificationSubmit` to a `Notification` entity and calls `PersistAsync`, which opens a fresh DI scope, resolves `INotificationOutboxRepository`, and calls `InsertIfNotExistsAsync`. The boolean result is intentionally ignored — an existing row is treated identically to a fresh insert. The async result is piped back to `Self` as `InternalMessages.IngestPersisted`, which carries the original `Sender` reference so the ack is sent from the actor thread:
```csharp
private void HandleSubmit(NotificationSubmit msg)
{
var sender = Sender;
var notification = BuildNotification(msg);
PersistAsync(notification).PipeTo(
Self,
success: () => new InternalMessages.IngestPersisted(
msg.NotificationId, sender, Succeeded: true, Error: null),
failure: ex => new InternalMessages.IngestPersisted(
msg.NotificationId, sender, Succeeded: false, Error: ex.GetBaseException().Message));
}
```
`NotificationSubmitAck(Accepted: true)` is returned for both a fresh insert and an existing row. Only a thrown repository error yields `Accepted: false`, causing the site to retain and retry its S&F message.
### Dispatch loop
On each `DispatchTick` the actor checks a boolean in-flight guard (`_dispatching`). If a sweep is already running the tick is silently dropped — sweeps never overlap. Otherwise the guard is raised and `RunDispatchPass` is launched:
1. A scoped `INotificationOutboxRepository` fetches `Pending` rows and `Retrying` rows whose `NextAttemptAt ≤ now`, ordered by `CreatedAt` ascending, capped at `DispatchBatchSize`.
2. The retry policy (`maxRetries`, `retryDelay`) is resolved by reading `SmtpConfiguration` from `INotificationRepository`. Non-positive values are clamped to `FallbackMaxRetries = 10` and `FallbackRetryDelay = 1 min` with a warning log so a misconfigured SMTP row does not silently park notifications without retrying.
3. Each notification in the batch is delivered sequentially via `DeliverOneAsync`. Per-notification exceptions are caught and logged so one bad row never aborts the rest of the batch.
`DispatchComplete` (singleton instance) is piped back to `Self` on both the success and failure projections, ensuring the in-flight guard is always cleared even if the sweep faults unexpectedly.
### Delivery adapters
`INotificationDeliveryAdapter` is the per-channel delivery seam:
```csharp
public interface INotificationDeliveryAdapter
{
NotificationType Type { get; }
Task<DeliveryOutcome> DeliverAsync(
Notification notification, CancellationToken cancellationToken = default);
}
```
`DeliveryOutcome` carries a `DeliveryResult` enum (`Success`, `TransientFailure`, `PermanentFailure`), resolved recipients on success, and an error string on failure. The adapter map (`NotificationType → INotificationDeliveryAdapter`) is built lazily on the first dispatch sweep and cached in `_adaptersCache` for the actor's lifetime, paired with an actor-lifetime `IServiceScope` (`_adaptersScope`) disposed in `PostStop`. This avoids rebuilding the dictionary on every sweep while respecting each adapter's scoped DI dependencies.
`EmailNotificationDeliveryAdapter` is the only registered adapter. It resolves the target list and recipients from `INotificationRepository` at delivery time (not at ingest), connects to SMTP via `ISmtpClientWrapper`, acquires an OAuth2 token if configured, and sends a BCC plain-text email. Error classification mirrors the External System Gateway pattern:
| Exception | `DeliveryResult` |
|---|---|
| `SmtpPermanentException` (SMTP 5xx) | `PermanentFailure` |
| Connection/protocol/timeout errors | `TransientFailure` |
| `OperationCanceledException` (shutdown) | propagated, not classified |
| Missing list, empty recipient list, no SMTP config, invalid TLS mode | `PermanentFailure` |
| Unclassified (e.g. OAuth2 token fetch failure) | `PermanentFailure` |
### Status transitions in `DeliverOneAsync`
```csharp
switch (outcome.Result)
{
case DeliveryResult.Success:
notification.Status = NotificationStatus.Delivered;
notification.DeliveredAt = now;
notification.ResolvedTargets = outcome.ResolvedTargets;
break;
case DeliveryResult.TransientFailure:
notification.RetryCount++;
notification.LastError = outcome.Error;
if (notification.RetryCount >= maxRetries)
notification.Status = NotificationStatus.Parked;
else
{
notification.Status = NotificationStatus.Retrying;
notification.NextAttemptAt = now + retryDelay;
}
break;
case DeliveryResult.PermanentFailure:
notification.Status = NotificationStatus.Parked;
notification.LastError = outcome.Error;
break;
}
await outboxRepository.UpdateAsync(notification, cancellationToken);
```
Every attempt also writes audit rows via `ICentralAuditWriter` (see Audit Integration below). Audit-write failure is caught, logged, and never propagates back into the dispatcher — the delivery outcome on the `Notifications` row stands regardless.
### Audit integration
Each delivery attempt emits at least one `AuditChannel.Notification` / `AuditKind.NotifyDeliver` row via `ICentralAuditWriter`:
- An `AuditStatus.Attempted` row (always, per attempt), carrying attempt duration in milliseconds.
- A second, terminal row (`Delivered`, `Parked`, or `Discarded`) only when the post-outcome status is terminal — a transient failure that transitions the notification to `Retrying` emits only the `Attempted` row.
`CorrelationId` on the emitted row(s) is parsed from the `NotificationId` GUID. `ExecutionId` and `ParentExecutionId` are echoed from `Notification.OriginExecutionId` / `Notification.OriginParentExecutionId`, linking the central `NotifyDeliver` rows to the site-emitted `NotifySend` row for the same script run. The `Actor` field is `"system"` — there is no authenticated user at dispatch time.
Manual discard via `HandleDiscard` also emits a terminal `Discarded` row (with a null error, because the discard is operator-driven, not a delivery failure).
### Purge
`HandlePurgeTick` fires every `PurgeInterval` (daily by default). `RunPurgePass` opens a scoped `INotificationOutboxRepository` and calls `DeleteTerminalOlderThanAsync(cutoff)`, where `cutoff = now - TerminalRetention` (default 365 days). The deleted count is logged at `Information`. Purge faults are caught internally so the returned task never faults.
## Usage
The outbox is consumed through two DI seams.
**Ingest** — the Host registers `NotificationOutboxActor` as an Akka cluster singleton and with the `ClusterClientReceptionist`. Site clusters send `NotificationSubmit` messages through CentralSite Communication; the actor ingests them without further configuration by the caller.
**Operator actions / UI queries** — the Central UI's Notification Outbox page and the ManagementActor resolve the singleton `IActorRef` and send query or command messages:
| Message | Actor response | Allowed when |
|---|---|---|
| `NotificationOutboxQueryRequest` | `NotificationOutboxQueryResponse` | Any time |
| `NotificationStatusQuery` | `NotificationStatusResponse` | Any time |
| `NotificationDetailRequest` | `NotificationDetailResponse` | Any time |
| `RetryNotificationRequest` | `RetryNotificationResponse` | Row is `Parked` |
| `DiscardNotificationRequest` | `DiscardNotificationResponse` | Row is `Parked` |
| `NotificationKpiRequest` | `NotificationKpiResponse` | Any time |
| `PerSiteNotificationKpiRequest` | `PerSiteNotificationKpiResponse` | Any time |
Retry resets the notification to `Pending` with `RetryCount = 0`, `NextAttemptAt = null`, and `LastError = null` so the dispatch loop reclaims it on the next sweep. Discard transitions to terminal `Discarded` and emits the corresponding audit row.
## Configuration
Options are bound from `ScadaBridge:NotificationOutbox` via `NotificationOutboxOptions`:
| Key | Default | Description |
|---|---|---|
| `DispatchInterval` | `00:00:10` (10 s) | How often the dispatcher polls for due rows. |
| `DispatchBatchSize` | `100` | Maximum notifications claimed per sweep. |
| `StuckAgeThreshold` | `00:10:00` (10 min) | Age beyond which a non-terminal row is counted as stuck in KPIs and the UI badge. Display-only; does not affect dispatcher behaviour. |
| `TerminalRetention` | `365` days | How long terminal rows are kept before the daily purge removes them. |
| `PurgeInterval` | `1` day | Cadence of the background purge sweep. |
| `DeliveredKpiWindow` | `00:01:00` (1 min) | Trailing window for the "delivered last interval" throughput KPI. |
Delivery retry policy (`MaxRetries`, `RetryDelay`) is read at runtime from `SmtpConfiguration` via `INotificationRepository`, not from `NotificationOutboxOptions`. Non-positive values are clamped to `FallbackMaxRetries = 10` and `FallbackRetryDelay = 1 min` with a `Warning` log.
## Dependencies & Interactions
- [Commons (#16)](./Commons.md) — owns `Notification`, `NotificationStatus`, `NotificationType`, `INotificationOutboxRepository`, `INotificationRepository`, and all message contracts (`NotificationSubmit`, `NotificationSubmitAck`, `NotificationStatusQuery`, `NotificationKpiRequest`, and their responses). Also owns `ScadaBridgeAuditEventFactory` and the `AuditChannel`/`AuditKind`/`AuditStatus` enums used to build dispatch audit rows.
- [Configuration Database (#17)](./ConfigurationDatabase.md) — registers the scoped `INotificationOutboxRepository` (the central `dbo.Notifications` table) and `INotificationRepository` (notification-list, recipient, and SMTP configuration tables). Central hosts must call `AddConfigurationDatabase` before `AddNotificationOutbox`.
- [Notification Service (#8)](./NotificationService.md) — supplies `ISmtpClientWrapper`, `OAuth2TokenService`, `NotificationOptions`, `SmtpTlsModeParser`, `SmtpErrorClassifier`, and the `SmtpPermanentException` type. `AddNotificationOutbox` relies on `AddNotificationService` being called by the Host to register these shared SMTP primitives; registering them twice would duplicate them.
- [CentralSite Communication (#5)](./Communication.md) — carries `NotificationSubmit` / `NotificationSubmitAck` between sites and central via ClusterClient, and `NotificationStatusQuery` / `NotificationStatusResponse` for the `Notify.Status` round-trip.
- [Store-and-Forward Engine (#6)](./StoreAndForward.md) — the site-side component that durably buffers notifications in SQLite and retries forwarding until central acks. The outbox is the receiving end of the S&F handoff.
- [Audit Log (#23)](./AuditLog.md) — the outbox is a central direct-write caller of `ICentralAuditWriter`. It emits an `Attempted` `NotifyDeliver` row per delivery attempt, plus a terminal row only when the attempt drives the notification to a terminal status (`Delivered`/`Parked`/`Discarded`); it also emits a terminal row per operator Discard. The upstream `NotifySend` row is emitted by the site and arrives at central via standard audit telemetry.
- [Health Monitoring (#11)](./HealthMonitoring.md) — polls `NotificationKpiRequest` / `PerSiteNotificationKpiRequest` for the headline KPI tiles on the health dashboard (queue depth, stuck count, parked count). These are central-computed from the `Notifications` table and are separate from the site S&F backlog metric.
- [Central UI (#9)](./CentralUI.md) — hosts the Notification Outbox page: KPI tiles, a queryable/filterable notification list, per-row Retry/Discard actions on parked notifications, and a stuck-row badge.
## Troubleshooting
### Notifications stuck in `Pending`
A notification stays `Pending` when the dispatch loop is not running or is failing silently. Check for `"Dispatch sweep failed"` at `Error` level in the central node logs. The most common cause is a missing or misconfigured `SmtpConfiguration` row, which the adapter surfaces as a `PermanentFailure` and parks the notification immediately. A `Warning` log line naming `SmtpConfiguration.MaxRetries` or `SmtpConfiguration.RetryDelay` being non-positive indicates the retry policy was clamped — correct the SMTP configuration row.
### Notifications parked with "no delivery adapter for type"
The actor parks a notification immediately and logs this message when `NotificationType` has a value for which no `INotificationDeliveryAdapter` is registered. Currently only `Email` has an adapter; future channel types must be registered in `AddNotificationOutbox` before notifications of that type are submitted.
### Dispatch loop wedged (guard stuck `true`)
The boolean `_dispatching` guard is cleared by `DispatchComplete`, which is piped even on a faulted sweep. If the actor itself stops and restarts, `PreStart` reinitialises the timers and the guard resets. A wedged guard without a restart indicates the `PipeTo` callback is never completing — examine logs around `"Dispatch sweep faulted unexpectedly"`.
### SMTP credentials appearing in logs
`EmailNotificationDeliveryAdapter` runs `CredentialRedactor.Scrub` on all exception messages before logging. If credential strings appear in logs the SMTP exception message is not being routed through the `catch` filters in `DeliverAsync` — ensure the exception type is reachable by one of the three `catch` blocks and not escaping before scrubbing.
### Failover mid-delivery
A central failover while a delivery attempt is in flight leaves the row in its pre-attempt status (`Pending` or `Retrying`). The new active node picks it up on the next dispatch tick. One notification may be re-sent to SMTP as a result — this is an accepted trade-off, consistent with the at-least-once guarantee the S&F Engine already provides.
## Related Documentation
- [Notification Outbox design specification](../requirements/Component-NotificationOutbox.md)
- [Audit Log](./AuditLog.md)
- [Commons](./Commons.md)
- [Configuration Database](./ConfigurationDatabase.md)
- [CentralSite Communication](./Communication.md)
- [Store-and-Forward Engine](./StoreAndForward.md)
- [Health Monitoring](./HealthMonitoring.md)
+228
View File
@@ -0,0 +1,228 @@
# Notification Service
The Notification Service is the central-only component that owns notification-list and SMTP definitions, and supplies the per-channel `INotificationDeliveryAdapter` implementations that the Notification Outbox invokes at delivery time. Sites never deliver notifications; they store-and-forward notification payloads to central, where this component's adapters perform all actual SMTP sends.
## Overview
Notification Service (#8) runs on the central cluster only. Its responsibilities split cleanly into two layers:
- **Definitions** — `NotificationList` and `SmtpConfiguration` entities stored in the central Configuration Database. Notification lists carry a `NotificationType` discriminator (`Email` now; additional types such as `Teams` are planned). Lists and SMTP config are never deployed to sites.
- **Delivery adapters** — per-type implementations of `INotificationDeliveryAdapter`. The Notification Outbox selects the adapter matching a notification's `Type`, calls `DeliverAsync`, and receives a three-way `DeliveryOutcome` (`Success` / `TransientFailure` / `PermanentFailure`). The adapter owns the full recipient-resolution, connection, authentication, send, and disconnect sequence. `EmailNotificationDeliveryAdapter` is registered as scoped (it holds a scoped `INotificationRepository`) and the outbox actor caches a single instance for its lifetime.
The component code lives in `src/ZB.MOM.WW.ScadaBridge.NotificationService/`. The `EmailNotificationDeliveryAdapter` that consumes these primitives lives in `src/ZB.MOM.WW.ScadaBridge.NotificationOutbox/Delivery/`.
## Key Concepts
### Central-only delivery
Before the current design, site nodes delivered notifications directly over SMTP. That arrangement required SMTP credentials and notification lists to be deployed to every site. The redesign inverts the path: a site script calls `Notify.To("list").Send(subject, body)`, receives a `string` notification id immediately, and the notification is store-and-forwarded to central. The Notification Outbox on central ingests it and calls the delivery adapter. Sites never open an SMTP connection.
This means:
- Credential exposure is limited to the central cluster.
- List membership is resolved at delivery time, so a list change takes effect for all future deliveries without redeploying to sites.
- The SMTP `MaxConcurrentConnections` value is configured at a single point, though it is not currently enforced (no connection gate or semaphore).
### `NotificationType` discriminator
`NotificationList.Type` is a `NotificationType` enum value (`Email` currently). The script API `Notify.To("listName")` is type-agnostic — the calling script does not reference a type. The Notification Outbox reads the type from the central database when it picks up the notification, then selects the matching adapter by `INotificationDeliveryAdapter.Type`. Adding a new delivery channel means adding a new adapter; existing scripts continue to work.
### Per-delivery SMTP client lifetime
`MailKitSmtpClientWrapper` wraps a single `MailKit.Net.Smtp.SmtpClient`. MailKit's client is not thread-safe and holds one TCP/TLS connection. The DI registration is therefore a **factory**, not a singleton wrapper:
```csharp
services.AddSingleton<Func<ISmtpClientWrapper>>(_ => () => new MailKitSmtpClientWrapper());
```
`EmailNotificationDeliveryAdapter.SendAsync` invokes the factory at the top of each delivery attempt, runs connect → authenticate → send → disconnect on the fresh wrapper, and disposes it in a `finally` block. Each delivery pays a full TCP+TLS handshake; this is the deliberate cost of avoiding shared connection state between concurrent outbox dispatches. The factory shape allows a future pooled implementation to be slotted in without changing callers.
## Architecture
### Primitives registered by `AddNotificationService`
`ServiceCollectionExtensions.AddNotificationService` is the single DI entry point, called on the central composition root only:
```csharp
public static IServiceCollection AddNotificationService(this IServiceCollection services)
{
services.AddOptions<NotificationOptions>()
.BindConfiguration("ScadaBridge:Notification");
services.AddHttpClient();
services.AddSingleton<OAuth2TokenService>();
services.AddSingleton<Func<ISmtpClientWrapper>>(_ => () => new MailKitSmtpClientWrapper());
return services;
}
```
Four things are registered: the `NotificationOptions` fallback values, the `HttpClient` infrastructure (required by `OAuth2TokenService`), the `OAuth2TokenService` token cache, and the `ISmtpClientWrapper` factory. The `EmailNotificationDeliveryAdapter` itself is registered by `ZB.MOM.WW.ScadaBridge.NotificationOutbox`, which depends on this project.
### `INotificationDeliveryAdapter`
```csharp
public interface INotificationDeliveryAdapter
{
NotificationType Type { get; }
Task<DeliveryOutcome> DeliverAsync(
Notification notification,
CancellationToken cancellationToken = default);
}
```
The `DeliveryOutcome` record carries a `DeliveryResult` (`Success` / `TransientFailure` / `PermanentFailure`), `ResolvedTargets` (a snapshotted string of the concrete recipients, written to the `Notifications` audit row on success), and an `Error` string on failure.
### Email delivery sequence
`EmailNotificationDeliveryAdapter.DeliverAsync` runs this sequence, classifying every failure before returning:
1. **Resolve list** — calls `INotificationRepository.GetListByNameAsync`. An unknown list returns `Permanent` immediately (the list was deleted; retrying cannot fix it).
2. **Resolve recipients** — calls `GetRecipientsByListIdAsync`. An empty list returns `Permanent`.
3. **Resolve SMTP config** — calls `GetAllSmtpConfigurationsAsync`, takes the first row. No config returns `Permanent`.
4. **Parse TLS mode** — `SmtpTlsModeParser.Parse(smtpConfig.TlsMode)`. An unrecognised string throws `ArgumentException`; `DeliverAsync` catches it and returns `Permanent` (config fault, not a transient network condition).
5. **Validate addresses** — `EmailAddressValidator.ValidateAddresses(fromAddress, recipients)`. A malformed address returns `Permanent`.
6. **Send** — calls the private `SendAsync`, which connect/auth/send/disconnects via a fresh `ISmtpClientWrapper`.
`SendAsync` maps `SmtpCommandException` 5xx responses to `SmtpPermanentException`, then lets it propagate. `DeliverAsync` catches `SmtpPermanentException` → `Permanent`; SMTP 4xx / socket / protocol / timeout exceptions → `Transient` (via `SmtpErrorClassifier`); unclassified exceptions (e.g., OAuth2 token fetch failure) → `Permanent` (retrying a broken credential wastes token-endpoint calls).
### SMTP error classification
`SmtpErrorClassifier.Classify` uses MailKit's typed exceptions and the numeric `SmtpStatusCode` rather than message substring matching:
```csharp
public static SmtpErrorClass Classify(Exception ex, CancellationToken cancellationToken)
{
if (ex is OperationCanceledException && cancellationToken.IsCancellationRequested)
return SmtpErrorClass.Unknown;
if (ex is SmtpCommandException command)
{
var code = (int)command.StatusCode;
if (code >= 400 && code < 500) return SmtpErrorClass.Transient;
if (code >= 500 && code < 600) return SmtpErrorClass.Permanent;
return SmtpErrorClass.Unknown;
}
if (ex is SmtpProtocolException
or ServiceNotConnectedException
or SocketException
or TimeoutException)
return SmtpErrorClass.Transient;
return SmtpErrorClass.Unknown;
}
```
A `Permanent` classification inside `SendAsync` is wrapped in `SmtpPermanentException` so the outer `DeliverAsync` catch filter can identify it cleanly.
### OAuth2 token lifecycle
`OAuth2TokenService.GetTokenAsync` fetches tokens for Microsoft 365 Client Credentials SMTP. Credentials are supplied as `tenantId:clientId:clientSecret`. Tokens are cached in a `ConcurrentDictionary` keyed by a SHA-256 hash of the credential string (NS-006), so distinct SMTP configurations never share a token. A per-credential `SemaphoreSlim` prevents thundering-herd refreshes. Tokens are refreshed 60 seconds before the reported `expires_in` expiry. Only the tenant is logged — the client secret and token value are never written to logs.
### Credential redaction
`CredentialRedactor.Scrub(text, credentials)` masks the full packed credential string and its trailing colon-component (password or `clientSecret`) in any text before it reaches a log line. Components shorter than 12 characters are not masked — a short username such as `root` would otherwise mask unrelated diagnostic text. All SMTP error paths in `EmailNotificationDeliveryAdapter` pass exception messages through `Scrub` before logging.
## Usage
### Script API
Site scripts do not interact with this component directly. The script surface is:
```csharp
// Returns a string notification id immediately — does not block for delivery.
string id = await Notify.To("Shift-Supervisors").Send("Tank overflow", "Tank T-03 is at 98%");
// Site-local while still in the S&F buffer; round-trips to central once forwarded.
NotificationDeliveryStatus status = await Notify.Status(id);
```
`Notify.To("list")` is type-agnostic. The notification id is a 32-character "N"-format GUID string (`Guid.NewGuid().ToString("N")`) generated at the site. `Notify.Status(string notificationId)` returns a `NotificationDeliveryStatus` record with `Status` (`Forwarding` site-local, `Unknown` if no central row and not in the S&F buffer, or `Pending` / `Retrying` / `Delivered` / `Parked` / `Discarded` from central), `RetryCount`, `LastError`, and `DeliveredAt`.
### Registering the adapter
On the central host, both projects are registered. The Notification Outbox registers `EmailNotificationDeliveryAdapter` as a scoped concrete type and as a scoped `INotificationDeliveryAdapter`; the outbox actor resolves adapters by enumerating `IEnumerable<INotificationDeliveryAdapter>` (no keyed/named registration). `AddNotificationService` is called to register the shared SMTP primitives:
```csharp
// Central composition root (simplified)
services.AddNotificationService();
services.AddNotificationOutbox(); // registers EmailNotificationDeliveryAdapter
```
## Configuration
`NotificationOptions` is bound from `ScadaBridge:Notification`. These values are **fallbacks** — when a `SmtpConfiguration` row has a non-positive value for a field, the adapter uses the option value instead. A positive value on the row always takes precedence.
| Section | Key | Default | Description |
|---------|-----|---------|-------------|
| `ScadaBridge:Notification` | `ConnectionTimeoutSeconds` | `30` | SMTP connection/operation timeout in seconds. Applied when `SmtpConfiguration.ConnectionTimeoutSeconds` is zero or negative. |
| `ScadaBridge:Notification` | `MaxConcurrentConnections` | `5` | Maximum concurrent SMTP connections. Used as the documented fallback default when the `SmtpConfiguration` row is unset; this limit is not currently enforced by a connection gate or semaphore. |
SMTP retry settings (`MaxRetries`, `RetryDelay`) live on the `SmtpConfiguration` entity and are read by the Notification Outbox dispatcher — they are not part of `NotificationOptions`.
### `SmtpConfiguration` entity fields
| Field | Type | Notes |
|-------|------|-------|
| `Host` | `string` | SMTP server hostname or IP. |
| `Port` | `int` | e.g., 587 for StartTLS, 465 for SSL. |
| `AuthType` | `string` | `basic` or `oauth2`. |
| `Credentials` | `string?` | Basic: `username:password`. OAuth2: `tenantId:clientId:clientSecret`. |
| `TlsMode` | `string?` | `None`, `StartTLS`, or `SSL`. Null/empty defaults to `StartTLS`. |
| `FromAddress` | `string` | Sender address in the From header. Also the XOAUTH2 `user=` identity for M365. |
| `ConnectionTimeoutSeconds` | `int` | 0 → falls back to `NotificationOptions`. |
| `MaxConcurrentConnections` | `int` | 0 → falls back to `NotificationOptions`. |
| `MaxRetries` | `int` | Read by Notification Outbox. |
| `RetryDelay` | `TimeSpan` | Read by Notification Outbox. |
### `NotificationList` entity fields
| Field | Type | Notes |
|-------|------|-------|
| `Name` | `string` | Unique list name. Passed as `Notify.To("name")`. |
| `Type` | `NotificationType` | Enum discriminator. Currently `Email` only. |
| `Recipients` | `ICollection<NotificationRecipient>` | Resolved at delivery time by the adapter. |
Each `NotificationRecipient` carries `Name` (display) and `EmailAddress`.
## Dependencies & Interactions
- [Commons (#16)](./Commons.md) — owns `NotificationList`, `NotificationRecipient`, `SmtpConfiguration`, `Notification`, `NotificationType`, `NotificationStatus`, `INotificationRepository`, and the `NotificationSubmit` / `NotificationSubmitAck` / `NotificationStatusQuery` / `NotificationStatusResponse` / `NotificationDeliveryStatus` message contracts.
- [Configuration Database (#17)](./ConfigurationDatabase.md) — persists `NotificationList`, `NotificationRecipient`, and `SmtpConfiguration`. Implements `INotificationRepository`. The `EmailNotificationDeliveryAdapter` resolves lists and recipients via this repository at delivery time.
- [Notification Outbox (#21)](./NotificationOutbox.md) — the central dispatch counterpart. The Notification Outbox registers `EmailNotificationDeliveryAdapter`, drives retry and parking, and owns the `Notifications` audit table. Notification Service supplies the SMTP primitives (`ISmtpClientWrapper` factory, `OAuth2TokenService`, `SmtpErrorClassifier`, `CredentialRedactor`, `EmailAddressValidator`); Notification Outbox owns when and how often `DeliverAsync` is called.
- [Store-and-Forward Engine (#6)](./StoreAndForward.md) — site-side buffer. Site scripts hand notifications to the S&F engine, which forwards them to central. The Notification Service has no direct interaction with the site S&F engine; by the time `DeliverAsync` is called, the notification has already been ingested by the Notification Outbox.
- [Security & Auth (#10)](./Security.md) — Design role is required to manage notification lists and SMTP configuration.
- Design spec: [Component-NotificationService.md](../requirements/Component-NotificationService.md).
## Troubleshooting
### A notification is Parked with a permanent failure
A `PermanentFailure` outcome means `EmailNotificationDeliveryAdapter` determined that retrying cannot fix the failure. Common root causes:
| Symptom | Cause | Fix |
|---------|-------|-----|
| "Notification list '…' not found" | List was renamed or deleted after the notification was submitted. | Recreate the list or discard the notification in the Central UI Outbox page. |
| "Notification list '…' has no recipients" | List exists but has no recipient rows. | Add recipients to the list. |
| "No SMTP configuration available" | No `SmtpConfiguration` row exists. | Add an SMTP configuration in Central UI. |
| "Unknown SMTP TLS mode '…'" | `TlsMode` field contains a value other than `None`, `StartTLS`, or `SSL`. | Correct the `TlsMode` value. |
| "Invalid sender (from) email address" or "Invalid recipient email address(es)" | Malformed address in the `SmtpConfiguration.FromAddress` or in a `NotificationRecipient.EmailAddress`. | Correct the address; the adapter validates via `MailboxAddress.TryParse`. |
| SMTP 5xx reply | Server rejected the message permanently (e.g., mailbox not found, policy block). | Check the `LastError` field on the `Notifications` row. The error text has credentials redacted. |
| OAuth2 credential parse error | `Credentials` field is not in `tenantId:clientId:clientSecret` format. | Correct the credentials on the SMTP configuration. |
### Transient failures retrying indefinitely
The retry count and delay come from `SmtpConfiguration.MaxRetries` and `RetryDelay`, enforced by the Notification Outbox. Once `MaxRetries` is exhausted, the Notification Outbox moves the row to `Parked`. If a notification stays in `Retrying` longer than expected, check whether `MaxRetries` is set to a non-zero value on the `SmtpConfiguration` row and that the Notification Outbox actor is running on the active central node.
### OAuth2 token not refreshing
`OAuth2TokenService` caches tokens per credential hash. A singleton restart resets the cache; the next `GetTokenAsync` call fetches a fresh token. If token fetches fail repeatedly (network partition to `login.microsoftonline.com`, wrong tenant/client/secret), the failure surfaces as an unclassified exception in `DeliverAsync` and the notification is parked as permanent. The log line includes the tenant ID but not the secret.
## Related Documentation
- [Notification Service design specification](../requirements/Component-NotificationService.md)
- [Notification Outbox](./NotificationOutbox.md)
- [Commons](./Commons.md)
- [Configuration Database](./ConfigurationDatabase.md)
- [Store-and-Forward Engine](./StoreAndForward.md)
- [Security & Auth](./Security.md)
+40
View File
@@ -0,0 +1,40 @@
# Component Reference Documentation
Developer-reference docs for each ScadaBridge component, describing how the shipped
code in `src/` actually works — with real code examples. These complement the
[design specs](../requirements/) in `docs/requirements/`: the specs say what a
component should do and why; these docs say how the code does it.
| # | Component | Description |
|---|-----------|-------------|
| 1 | [Template Engine](TemplateEngine.md) | Template modeling, inheritance, composition, flattening, diffs, and semantic validation as implemented by the engine's services. |
| 2 | [Deployment Manager](DeploymentManager.md) | The central deployment pipeline: deployment identity and idempotency, per-instance operation locks, state transitions, all-or-nothing site apply, and artifact deployment. |
| 3 | [Site Runtime](SiteRuntime.md) | The site actor hierarchy (Deployment Manager singleton, Instance, Script, and Alarm actors), script compilation and execution, the site-wide stream, native alarms, and supervision. |
| 4 | [Data Connection Layer](DataConnectionLayer.md) | Protocol adapters (OPC UA, MxGateway), the Become/Stash connection state machine, reconnect and re-subscribe, and the native-alarm subscription seam. |
| 5 | [Central–Site Communication](Communication.md) | ClusterClient command and control plus gRPC server-streaming, the central and site communication actors, receptionist registration, and per-site contact points. |
| 6 | [Store-and-Forward Engine](StoreAndForward.md) | Site-only buffering, fixed-interval retry, parking, SQLite persistence, standby replication, and the operation-tracking store. |
| 7 | [External System Gateway](ExternalSystemGateway.md) | Script-facing HTTP/REST and database access, the `Call` and `CachedCall` modes, and transient versus permanent error classification. |
| 8 | [Notification Service](NotificationService.md) | Central notification-list and SMTP definitions, the per-type delivery adapters, and OAuth2 or Basic SMTP delivery. |
| 9 | [Central UI](CentralUI.md) | The Blazor Server app structure, authentication, real-time updates (SignalR and gRPC), and the management pages. |
| 10 | [Security & Auth](Security.md) | LDAP bind authentication, cookie and JWT sessions, role-based and site-scoped authorization, and shared Data Protection keys. |
| 11 | [Health Monitoring](HealthMonitoring.md) | Site metric collection and central aggregation, report and heartbeat intervals, offline detection, and dead-letter monitoring. |
| 12 | [Site Event Logging](SiteEventLogging.md) | The site-local SQLite event log, retention and storage cap, daily purge, and paginated central query access. |
| 13 | [Cluster Infrastructure](ClusterInfrastructure.md) | The `ClusterOptions` model and validation for active/standby clustering, split-brain resolution, and failover. |
| 14 | [Inbound API](InboundAPI.md) | `POST /api/{method}` script invocation, API-key authentication, the extended type system, and inbound-request auditing. |
| 15 | [Host](Host.md) | The deployable binary: role-based component registration, Akka bootstrap, ASP.NET Core hosting, and the readiness and active-node health endpoints. |
| 16 | [Commons](Commons.md) | Shared POCO entities, repository and service interfaces, message contracts, and the `Types`/`Interfaces`/`Entities`/`Messages` namespace layout. |
| 17 | [Configuration Database](ConfigurationDatabase.md) | The EF Core `DbContext`, repository implementations, the audit service, secret encryption, and migration and partition maintenance. |
| 18 | [Management Service](ManagementService.md) | The `ManagementActor` admin command surface, its HTTP endpoints, and ClusterClientReceptionist registration. |
| 19 | [CLI](CLI.md) | The System.CommandLine tool over the HTTP Management API — its command groups, config file, and output formats. |
| 20 | [Traefik Proxy](TraefikProxy.md) | The reverse proxy fronting the central cluster, active-node routing via `/health/active`, and the Docker topology configuration. |
| 21 | [Notification Outbox](NotificationOutbox.md) | The central `NotificationOutboxActor`, the `Notifications` table, the dispatcher loop, retry and parking, and KPIs. |
| 22 | [Site Call Audit](SiteCallAudit.md) | The central `SiteCallAuditActor`, the `SiteCalls` mirror table, telemetry ingest, and the parked-call Retry/Discard relay. |
| 23 | [Audit Log](AuditLog.md) | The append-only audit store: site SQLite hot-path, gRPC telemetry, central ingest and reconciliation, redaction, retention, and KPIs. |
| 24 | [Transport](Transport.md) | Encrypted bundle export and import, dependency resolution, conflict detection, and `BundleImportId`-correlated audit. |
| 25 | [Tree View](TreeView.md) | The reusable Blazor tree component — parameters, selection modes, and usage in the Central UI. |
## Related Documentation
- [Component design specs](../requirements/) — the spec each component implements.
- [Documentation Style Guide](../../StyleGuide.md) — the writing conventions these docs follow.
- [README](../../README.md) — the repository master index.
+229
View File
@@ -0,0 +1,229 @@
# Security & Auth
The Security & Auth component handles user authentication against an LDAP/Active Directory server and enforces role-based authorization across all central cluster operations. It owns the cookie+JWT hybrid session model, the LDAP-group-to-role mapping pipeline, site-scoped deployment permissions, and the inbound API key management seam.
## Overview
Security & Auth (#10) runs exclusively on the central cluster — sites have no user-facing interface and perform no independent authentication. The component code lives in `src/ZB.MOM.WW.ScadaBridge.Security/`, which is a component library (it accepts no `IConfiguration` directly; wiring is Options-pattern only). The Host composition root calls `AddZbLdapAuth` with the `ScadaBridge:Security:Ldap` section before calling `AddSecurity`, because the shared LDAP service is config-coupled and the component library is not allowed to bind configuration itself.
The component registers:
- `JwtTokenService` — token generation, validation, idle-timeout enforcement, and sliding refresh logic.
- `RoleMapper` — DB-backed LDAP-group-to-role resolution with site-scope union semantics.
- `ScadaBridgeGroupRoleMapper` — adapter exposing `RoleMapper` on the shared `IGroupRoleMapper<string>` seam.
- `HttpAuditActorAccessor` — resolves the authenticated username from the ambient HTTP context for audit `Actor` stamping.
- ASP.NET Core cookie authentication (sliding idle window, HttpOnly/Secure defaults via `ZbCookieDefaults.Apply`).
- Authorization policies (`RequireAdmin`, `RequireDesign`, `RequireDeployment`, `OperationalAudit`, `AuditExport`).
## Key Concepts
### Direct LDAP bind
Authentication uses a direct username/password bind against the LDAP/AD server via the shared `ILdapAuthService` (`ZB.MOM.WW.Auth.Ldap`). The flow is: service-account bind → search for the user entry by username → user-credential bind → group-membership query. The app never caches credentials locally. LDAPS (port 636) or StartTLS is required for production; the `AllowInsecure` flag in `LdapOptions` gates unencrypted use to explicitly opted-in deployments (local dev only). No Kerberos/NTLM path exists.
LDAP failure behavior is fail-closed at the login boundary and fail-open at the session boundary: a new login fails immediately if the directory is unreachable; an active session (valid cookie+JWT) continues with its current claims until the JWT expires. This avoids disrupting engineers mid-work during a brief directory outage. When the directory recovers, the next token refresh re-queries groups and issues a fresh token.
### Cookie+JWT hybrid session
On successful login the server mints a JWT via `JwtTokenService.GenerateToken` and writes it into an HttpOnly/Secure authentication cookie. The cookie is the transport — not a bearer header — because Blazor Server's persistent SignalR circuits do not carry `Authorization` headers on reconnect. The browser sends the cookie on every HTTP and SignalR request automatically.
The JWT is signed with HMAC-SHA256 using a shared symmetric key (`JwtSigningKey`). Both central nodes share the same key, so either node can issue and validate tokens without a shared session store; the load balancer needs no sticky-session configuration. `ClockSkew` is set to `TimeSpan.Zero` to close the standard five-minute tolerance window.
Claims embedded in every token:
| Claim type | Constant | Value |
|---|---|---|
| `ZbClaimTypes.DisplayName` | `JwtTokenService.DisplayNameClaimType` | Human-readable display name |
| `ZbClaimTypes.Username` | `JwtTokenService.UsernameClaimType` | Authenticated username |
| `ClaimTypes.Role` (URI) | `JwtTokenService.RoleClaimType` | One claim per role |
| `ZbClaimTypes.ScopeId` | `JwtTokenService.SiteIdClaimType` | One claim per permitted site (Deployer only) |
| `"LastActivity"` | `JwtTokenService.LastActivityClaimType` | ISO 8601 idle-timeout anchor |
`OutboundClaimTypeMap` is cleared on the mint path (`GenerateToken`) so `JwtSecurityTokenHandler` writes claim type strings verbatim into the token. On the validate path (`ValidateToken`) only `MapInboundClaims = false` is set — the outbound map is not touched. Together these settings prevent the default claim-type rewrite maps from transforming the canonical role URI or `zb:` claim types, keeping the type strings byte-for-byte identical in the token and in every policy check.
### Token lifecycle and idle timeout
The JWT lifetime (`JwtExpiryMinutes`, default 15 minutes) and the cookie idle window (`IdleTimeoutMinutes`, default 30 minutes) are separate layers. ASP.NET cookie auth's `SlidingExpiration = true` with `ExpireTimeSpan = IdleTimeout` models the idle window: the middleware re-issues the cookie once the session passes its halfway mark, keeping active users signed in. The JWT within that cookie has its own 15-minute expiry.
`JwtTokenService.ShouldRefresh` checks whether remaining JWT lifetime is below `JwtRefreshThresholdMinutes` (default 5 minutes). `RefreshToken` issues a fresh JWT while **preserving** the existing `LastActivity` anchor — a background refresh is not treated as user activity. `RecordActivity` advances the anchor to now. `IsIdleTimedOut` checks whether the elapsed time since `LastActivity` exceeds `IdleTimeoutMinutes`; `RefreshToken` enforces the idle check internally so an idle-expired session cannot be kept alive by background polling regardless of caller discipline (Security-014).
## Architecture
### Registration split between Host and component
`AddSecurity` (component library) registers everything except the LDAP service itself:
```csharp
public static IServiceCollection AddSecurity(this IServiceCollection services)
{
services.AddScoped<JwtTokenService>();
services.AddScoped<RoleMapper>();
services.AddHttpContextAccessor();
services.AddSingleton<IAuditActorAccessor, HttpAuditActorAccessor>();
services.AddScoped<IGroupRoleMapper<string>, ScadaBridgeGroupRoleMapper>();
services.AddAuthentication(CookieAuthenticationDefaults.AuthenticationScheme)
.AddCookie(options =>
{
options.LoginPath = "/login";
options.LogoutPath = "/auth/logout";
});
services.AddOptions<CookieAuthenticationOptions>(CookieAuthenticationDefaults.AuthenticationScheme)
.Configure<IOptions<SecurityOptions>, ILoggerFactory>((cookieOptions, securityOptions, loggerFactory) =>
{
ZbCookieDefaults.Apply(
cookieOptions,
requireHttps: securityOptions.Value.RequireHttpsCookie,
idleTimeout: TimeSpan.FromMinutes(securityOptions.Value.IdleTimeoutMinutes));
var cookieName = securityOptions.Value.CookieName;
cookieOptions.Cookie.Name = string.IsNullOrWhiteSpace(cookieName)
? SecurityOptions.DefaultCookieName
: cookieName;
});
services.AddScadaBridgeAuthorization();
return services;
}
```
The Host composition root calls `AddZbLdapAuth(configuration, LdapSectionPath)` before `AddSecurity()`. `AddZbLdapAuth` registers `ILdapAuthService` as a singleton, binds `LdapOptions` from `ScadaBridge:Security:Ldap`, and registers `IValidateOptions<LdapOptions>` with `ValidateOnStart` so a misconfigured directory fails at boot rather than at first login. The Host also registers `LibraryInboundApiKeyAdmin` as the `IInboundApiKeyAdmin` singleton via `AddSingleton` — this is not done by `AddSecurity`.
### Role mapping and site scoping
`RoleMapper.MapGroupsToRolesAsync` loads all `LdapGroupMapping` rows from the database, matches the supplied LDAP group names (case-insensitive), and accumulates roles. For the `Deployer` role it also loads associated `SiteScopeRule` rows — each row carries a `SiteId` limiting that mapping to one site. Union semantics (Security-016): if any matched Deployer mapping has no scope rules, the result is system-wide and all accumulated site IDs are discarded:
```csharp
var isSystemWide = hasUnscopedDeploymentMapping
|| (hasDeploymentRole && !hasScopedDeploymentMapping);
if (isSystemWide)
{
permittedSiteIds.Clear();
}
return new RoleMappingResult(
matchedRoles.ToList(),
permittedSiteIds.ToList(),
isSystemWide);
```
`ScadaBridgeGroupRoleMapper` adapts `RoleMappingResult` onto the shared `IGroupRoleMapper<string>` seam, carrying the full `RoleMappingResult` (including `PermittedSiteIds` and `IsSystemWideDeployment`) as the opaque `Scope` field so no site-scope information is lost at the seam boundary.
### Authorization policies
Five named policies are registered by `AuthorizationPolicies.AddScadaBridgeAuthorization`. Every policy uses `RequireClaim(JwtTokenService.RoleClaimType, ...)` — no custom requirement handlers — making the policy check a direct look-up into the JWT's role claims.
| Policy | Constant | Roles satisfied |
|---|---|---|
| `RequireAdmin` | `AuthorizationPolicies.RequireAdmin` | `Administrator` |
| `RequireDesign` | `AuthorizationPolicies.RequireDesign` | `Designer` |
| `RequireDeployment` | `AuthorizationPolicies.RequireDeployment` | `Deployer` |
| `OperationalAudit` | `AuthorizationPolicies.OperationalAudit` | `Administrator`, `Viewer` |
| `AuditExport` | `AuthorizationPolicies.AuditExport` | `Administrator` |
Role names are declared in `Roles` (the single source of truth). The four active roles (`Administrator`, `Designer`, `Deployer`, `Viewer`) are the canonical subset of the shared six-role vocabulary; `Operator` and `Engineer` exist upstream but are not used. The `OperationalAudit` and `AuditExport` role arrays are public (`AuthorizationPolicies.OperationalAuditRoles`, `AuditExportRoles`) so the ManagementService HTTP API can reuse the exact same sets when gating `/api/audit/*` routes through its own Basic-Auth + LDAP role check.
### LDAP failure messages
`LdapAuthFailureMessages.ToMessage` maps the structured `LdapAuthFailure` enum from the shared library to user-facing strings. `BadCredentials` and `UserNotFound` both return the generic "Invalid username or password." — intentionally identical to prevent username enumeration. `AmbiguousUser` and `ServiceAccountBindFailed` (which also covers a directory that is unreachable at connect/search time) return a misconfiguration message. `GroupLookupFailed` (post-bind directory outage, or a successful-but-empty group set) returns a transient-outage message.
### Inbound API key management
`LibraryInboundApiKeyAdmin` implements the Commons `IInboundApiKeyAdmin` seam over the shared `ApiKeyAdminCommands` facade. Keys use the `sbk_<keyId>_<secret>` token format (prefix `sbk`), with the key ID as a 32-hex-character GUID (`"N"` format, no hyphens, because hyphens are not valid in the delimiter-separated token). The library stores keys in a SQLite file (`data/inbound-api-keys.sqlite` by default). Scopes in the library map 1:1 to method names in ScadaBridge. Delete is implemented as revoke-then-delete because the library only permits deleting an already-revoked key.
### Data Protection key sharing
`AddConfigurationDatabase` calls `AddDataProtection().PersistKeysToDbContext<ScadaBridgeDbContext>()`. Both central nodes therefore read and write Data Protection keys from the same MS SQL database, which means either node can protect and unprotect the same data (including the cookie payload) regardless of which node issued it — a prerequisite for load-balancer failover transparency.
## Usage
Login flow (Central UI `/auth/login` and `/auth/token`):
1. Call `ILdapAuthService.AuthenticateAsync(username, password)` (registered by Host via `AddZbLdapAuth`).
2. On success, call `RoleMapper.MapGroupsToRolesAsync(ldapGroups)` to resolve roles and site scope.
3. Call `JwtTokenService.GenerateToken(displayName, username, roles, permittedSiteIds)` to mint a signed JWT.
4. Write the JWT into the HttpOnly cookie via the ASP.NET cookie auth `SignInAsync`.
On each subsequent request, middleware reads the cookie, validates the embedded JWT with `JwtTokenService.ValidateToken`, and checks `IsIdleTimedOut`. If the token is near expiry (`ShouldRefresh`), fresh claims are re-queried from LDAP and `RefreshToken` issues a replacement. Genuine user interactions call `RecordActivity` to advance the last-activity anchor.
Authorization gates use the named policies:
```csharp
// Razor page or controller — declarative
[Authorize(Policy = AuthorizationPolicies.RequireDesign)]
// ManagementActor — imperative, reusing the same role arrays
if (!AuthorizationPolicies.OperationalAuditRoles.Contains(userRole))
return Unauthorized();
```
## Configuration
`SecurityOptions` is bound from the `ScadaBridge:Security` section. LDAP connection settings are bound separately from `ScadaBridge:Security:Ldap` (the `LdapSectionPath` constant) into the shared `LdapOptions` by `AddZbLdapAuth`.
### `ScadaBridge:Security` — `SecurityOptions`
| Key | Default | Description |
|---|---|---|
| `JwtSigningKey` | *(required)* | Symmetric HMAC-SHA256 signing key. Must be at least 32 bytes (256 bits); validated at `JwtTokenService` construction — startup fails if too short. |
| `JwtExpiryMinutes` | `15` | JWT lifetime in minutes before embedded token expires and must be refreshed. |
| `JwtRefreshThresholdMinutes` | `5` | Minutes before JWT expiry at which `ShouldRefresh` triggers a re-issue. |
| `IdleTimeoutMinutes` | `30` | Session idle timeout in minutes. Cookie `ExpireTimeSpan` is set to this value; `IsIdleTimedOut` enforces it from the `LastActivity` claim. |
| `RequireHttpsCookie` | `true` | When `true`, the cookie is `Secure`-only (HTTPS required). Set `false` for HTTP-only dev deployments; a warning is logged at startup. |
| `CookieName` | `ZB.MOM.WW.ScadaBridge.Auth` | Authentication cookie name. Override per deployment when two ScadaBridge stacks share a hostname — browsers scope cookies by host+path but not by port. |
### `ScadaBridge:Security:Ldap` — `LdapOptions` (shared library)
| Key | Description |
|---|---|
| `Server` | LDAP/AD server hostname or IP. |
| `Port` | LDAP port. Use 636 for LDAPS or 389 for StartTLS. |
| `Transport` | `Ldaps`, `StartTls`, or `None` (dev only — requires `AllowInsecure = true`). |
| `AllowInsecure` | Must be `true` to permit `Transport = None`. Default `false`. |
| `SearchBase` | LDAP search base DN (e.g. `dc=corp,dc=example,dc=com`). |
| `ServiceAccountDn` | Service-account distinguished name used for the initial bind and group search. |
| `ServiceAccountPassword` | Service-account password. |
`LdapOptionsValidator` (registered with `ValidateOnStart` by `AddZbLdapAuth`) enforces that `Server`, `SearchBase`, `ServiceAccountDn`, and a secure transport are configured before the first request is served.
## Dependencies & Interactions
- [Commons (#16)](./Commons.md) — defines `ISecurityRepository` (LDAP mapping + scope rule read/write), `IInboundApiKeyAdmin` (key admin seam), `IAuditActorAccessor` (audit actor resolution), `LdapGroupMapping`, and `SiteScopeRule` entities, plus `ManagementEnvelope` (carries username/roles into every Management command).
- [Configuration Database (#17)](./ConfigurationDatabase.md) — provides the scoped `ISecurityRepository` implementation (`SecurityRepository`) backed by `LdapGroupMappings` and `SiteScopeRules` tables in MS SQL, and hosts the Data Protection key ring via `PersistKeysToDbContext<ScadaBridgeDbContext>()`.
- [Central UI (#9)](./CentralUI.md) — every Blazor Server page and Razor component passes through cookie authentication and named policy authorization. The login page drives the LDAP bind → role map → token mint flow. The Admin → LDAP Mappings page is gated by `RequireAdmin` and calls `ISecurityRepository` directly.
- [Management Service (#18)](./ManagementService.md) — the `ManagementActor` enforces role and site-scope rules on every incoming command using identity carried in the `ManagementEnvelope`. The CLI authenticates users via the same LDAP bind and passes identity in every request.
- [Inbound API (#14)](./InboundAPI.md) — inbound API requests authenticate via `Authorization: Bearer sbk_<keyId>_<secret>` (library verifier, `sbk_*` token format) rather than the cookie/JWT path. `HttpAuditActorAccessor` resolves the authenticated username for audit `Actor` stamping on the interactive path; the inbound API path keeps its own actor/fallback.
- [Audit Log (#23)](./AuditLog.md) — `IAuditActorAccessor` is a seam this component implements; the Inbound API audit path calls `CurrentActor` to record the authenticated user as the event actor.
- [Transport (#24)](./Transport.md) — export gates on `RequireDesign`; import gates on `RequireAdmin`, enforced at both the Razor page layer and inside the Transport service entrypoints.
- Design spec: [Component-Security.md](../requirements/Component-Security.md).
## Troubleshooting
### Login fails: "Authentication service is misconfigured"
This message maps from `LdapAuthFailure.ServiceAccountBindFailed` or `LdapAuthFailure.AmbiguousUser`. The service-account DN or password in `ScadaBridge:Security:Ldap` is wrong, the LDAP server is unreachable at connect time, or the search for the username returned more than one entry. Check `ServiceAccountDn`, `ServiceAccountPassword`, and `Server` in configuration. `LdapOptionsValidator` enforces these keys at startup, so a complete absence fails fast — this error at login time indicates a runtime connectivity or data problem.
### Login fails: "The directory is temporarily unavailable"
Maps from `LdapAuthFailure.GroupLookupFailed`. The user-credential bind succeeded but the subsequent group-membership query failed. The directory is partially reachable (user bind works) but the group search is failing. Existing sessions with valid JWTs continue to operate unaffected.
### Session expires unexpectedly
Check `IdleTimeoutMinutes` and `JwtExpiryMinutes` in `SecurityOptions`. A background refresh that fires while the user is idle preserves the `LastActivity` anchor (`RefreshToken` does not advance it); `IsIdleTimedOut` enforces the window from the last genuine user interaction. If the idle timeout fires before the expected window, confirm that `RecordActivity` is being called on genuine user requests.
### Two ScadaBridge environments on the same host clobber each other's session
Set a distinct `CookieName` in `ScadaBridge:Security` for each deployment. Browsers scope cookies by host+path, not by port, so two stacks on `localhost:9000` and `localhost:9100` share a cookie namespace under the default name.
## Related Documentation
- [Security & Auth design specification](../requirements/Component-Security.md)
- [Configuration Database](./ConfigurationDatabase.md)
- [Commons](./Commons.md)
- [Central UI](./CentralUI.md)
- [Management Service](./ManagementService.md)
- [Inbound API](./InboundAPI.md)
- [Audit Log](./AuditLog.md)
- [Transport](./Transport.md)
+253
View File
@@ -0,0 +1,253 @@
# Site Call Audit
Site Call Audit (#22) is a central-only observability component that maintains an eventually-consistent mirror of every cached call — `ExternalSystem.CachedCall()` and `Database.CachedWrite()` — issued by site scripts. It ingests lifecycle telemetry from sites into the central `SiteCalls` MS SQL table, computes point-in-time KPIs, and relays operator Retry/Discard actions back to the owning site. It does not deliver anything: cached-call execution stays entirely site-local.
## Overview
The component lives in `src/ZB.MOM.WW.ScadaBridge.SiteCallAudit/` and runs only on the central cluster. Its single class, `SiteCallAuditActor`, is an Akka.NET `ReceiveActor` deployed as a `ClusterSingletonManager`-managed singleton on the active central node.
Telemetry reaches central through the shared `CachedCallTelemetry` packet (see [Audit Log](./AuditLog.md)), which carries both an `AuditEvent` for the `AuditLog` table and a `SiteCallOperational` snapshot for the `SiteCalls` table. The `AuditLogIngestActor` (Audit Log #23) writes both rows directly inside a single MS SQL transaction when it receives an `IngestCachedTelemetryCommand`: `auditRepo.InsertIfNotExistsAsync(...)` followed by `siteCallRepo.UpsertAsync(...)`, committed together or rolled back together. There is no Tell to `SiteCallAuditActor` on this path; the `UpsertSiteCallCommand` / `OnUpsertAsync` handler exists for other callers, not the cached-telemetry hot path. The `SiteCallAuditActor` is therefore an ingest target, not a transport; it never constructs telemetry packets and never decides what gets delivered.
Sites remain the source of truth. `Tracking.Status()` is answered site-locally from the site SQLite tracking store; the central `SiteCalls` row is what the Central UI Site Calls page reads — it may lag by one telemetry cycle.
## Key Concepts
### Mirror, not dispatcher
The Notification Outbox (#21) ingests notifications and dispatches them centrally. Site Call Audit is different: the Store-and-Forward Engine on each site performs all retry and delivery attempts against the site's locally reachable external systems and databases. Central cannot reach those systems. The `SiteCalls` table is read-only from the operator's perspective — operators can view it and trigger Retry/Discard actions, but the actions are forwarded to the site; central never mutates the mirror row directly in response to an operator action. The row changes only when telemetry from the site is ingested.
### Monotonic upsert idempotency
The `SiteCalls` table holds one row per `TrackedOperationId`. `ISiteCallAuditRepository.UpsertAsync` implements insert-if-not-exists followed by a conditional update that only applies when the incoming status has a strictly higher rank than the stored status:
```text
Submitted=0, Forwarded=1, Attempted=2, Skipped=2,
Delivered=3, Failed=3, Parked=3, Discarded=3
```
Out-of-order telemetry, duplicate gRPC packets, and future reconciliation pulls therefore all feed the same writer safely — status never regresses.
### Stuck calls
A non-terminal row (`TerminalAtUtc IS NULL`) created before `now - StuckAgeThreshold` (default 10 minutes) is classified as stuck. Stuck is display-only: surfaced as a `StuckCount` KPI and a row badge in the UI. There is no escalation or alerting.
## Architecture
### `SiteCallAuditActor`
`SiteCallAuditActor` is a `ReceiveActor` with two constructors:
- **Production**: receives `IServiceProvider` and opens a fresh DI scope per message to resolve the scoped EF Core `ISiteCallAuditRepository`. This mirrors `AuditLogIngestActor`'s pattern — a long-lived singleton cannot hold a scope across messages.
- **Test**: receives a concrete `ISiteCallAuditRepository` and reuses it across all messages, allowing integration tests to run against a real MS SQL fixture without DI scaffolding.
The actor catches all repository exceptions in its write path and replies `Accepted=false` without rethrowing, keeping the singleton alive across storage faults. The `SupervisorStrategy` override (one-for-one, `maxNrOfRetries: 0`) governs any future children — the actor currently has none.
```csharp
private async Task OnUpsertAsync(UpsertSiteCallCommand cmd)
{
var replyTo = Sender;
var id = cmd.SiteCall.TrackedOperationId;
IServiceScope? scope = null;
ISiteCallAuditRepository repository;
if (_injectedRepository is not null)
{
repository = _injectedRepository;
}
else
{
scope = _serviceProvider!.CreateScope();
repository = scope.ServiceProvider.GetRequiredService<ISiteCallAuditRepository>();
}
try
{
var siteCall = cmd.SiteCall with { IngestedAtUtc = DateTime.UtcNow };
await repository.UpsertAsync(siteCall).ConfigureAwait(false);
replyTo.Tell(new UpsertSiteCallReply(id, Accepted: true));
}
catch (Exception ex)
{
_logger.LogError(ex, "SiteCallAudit upsert failed for {TrackedOperationId}", id);
replyTo.Tell(new UpsertSiteCallReply(id, Accepted: false));
}
finally
{
scope?.Dispose();
}
}
```
`IngestedAtUtc` is always stamped at central-side persist time, not carried from the site. This ensures the column reflects when central last processed the row, not when the site emitted it.
### Message handlers
| Message | Direction | Handler |
|---|---|---|
| `UpsertSiteCallCommand` | Central ingest → actor | `OnUpsertAsync` — scope-per-message upsert |
| `SiteCallQueryRequest` | Central UI → actor | `HandleQuery` — keyset-paged query, max 200 rows |
| `SiteCallDetailRequest` | Central UI → actor | `HandleDetail` — single-row full detail |
| `SiteCallKpiRequest` | Central UI → actor | `HandleKpi` — global KPI snapshot |
| `PerSiteSiteCallKpiRequest` | Central UI → actor | `HandlePerSiteKpi` — per-site KPI list |
| `RetrySiteCallRequest` | Central UI → actor | `HandleRetrySiteCall` — relay to site |
| `DiscardSiteCallRequest` | Central UI → actor | `HandleDiscardSiteCall` — relay to site |
| `RegisterCentralCommunication` | Host → actor | Wires the `CentralCommunicationActor` transport |
All read handlers capture `Sender` before the first `await` and use `PipeTo` to return the response — the standard Akka pattern for async ask-reply handlers.
### `SiteCalls` table
One row per `TrackedOperationId` in the central MS SQL configuration database. Key columns:
| Column | Type | Notes |
|---|---|---|
| `TrackedOperationId` | `uniqueidentifier` | PK; GUID stamped site-side at call time |
| `Channel` | `varchar` | `"ApiOutbound"` or `"DbOutbound"` |
| `Target` | `varchar` | Human-readable target, e.g. `"ERP.GetOrder"` |
| `SourceSite` | `varchar` | Site that issued the call |
| `SourceNode` | `varchar NULL` | `node-a` / `node-b`; nullable for retired nodes |
| `Status` | `varchar` | String form of `AuditStatus`; monotonic |
| `RetryCount` | `int` | Dispatch attempts so far |
| `LastError` | `varchar NULL` | Most recent error text |
| `HttpStatus` | `int NULL` | Last HTTP response code (API calls only) |
| `CreatedAtUtc` | `datetime2` | First submit timestamp from site |
| `UpdatedAtUtc` | `datetime2` | Latest site-side status mutation |
| `TerminalAtUtc` | `datetime2 NULL` | Set when status reaches a terminal rank |
| `IngestedAtUtc` | `datetime2` | Central-side stamp, updated on every upsert |
Unlike the `AuditLog` table, `SiteCalls` is a standard non-partitioned table on `[PRIMARY]` holding mutable operational state. No DB-role restriction applies; it is updated in place by the upsert.
### Status lifecycle
```text
Submitted → Forwarded → Attempted ──→ Delivered (terminal, success)
└──→ Parked (non-terminal, awaiting operator action)
└──→ Failed (terminal, permanent failure)
└──→ Discarded (terminal, operator-initiated on Parked row)
```
`Failed` rows are not operator-actionable — a permanent failure (e.g. HTTP 4xx) would fail again, and the error was already returned synchronously to the calling script. Only `Parked` rows support Retry and Discard.
### Retry / Discard relay
The `CentralCommunicationActor` is wired into `SiteCallAuditActor` after both actors exist, via `RegisterCentralCommunication`. Until registration completes, any relay request receives an immediate `SiteCallRelayOutcome.SiteUnreachable` outcome — there is genuinely no route to any site.
When `_centralCommunication` is set, the relay handler wraps the command in a `SiteEnvelope` keyed to `SourceSite` and Asks the `CentralCommunicationActor`, which routes it over the per-site `ClusterClient`:
```csharp
private void HandleRetrySiteCall(RetrySiteCallRequest request)
{
var sender = Sender;
if (_centralCommunication is null)
{
sender.Tell(UnreachableRetry(request.CorrelationId));
return;
}
var relay = new RetryParkedOperation(
request.CorrelationId, new TrackedOperationId(request.TrackedOperationId));
var envelope = new SiteEnvelope(request.SourceSite, relay);
_centralCommunication.Ask<ParkedOperationActionAck>(envelope, _options.RelayTimeout)
.PipeTo(
sender,
success: ack => MapRetryResponse(request.CorrelationId, ack),
failure: ex => MapRetryFailure(request.CorrelationId, request.SourceSite, ex));
}
```
The site applies `RetryParkedOperation` / `DiscardParkedOperation` to its own Store-and-Forward buffer and returns a `ParkedOperationActionAck`. The actor maps the ack to a `SiteCallRelayOutcome`:
| Ack | Outcome |
|---|---|
| `Applied=true` | `Applied` |
| `Applied=false`, no error | `NotParked` — site had nothing to do |
| `Applied=false`, error present | `OperationFailed` — site faulted |
| Ask timeout / no route | `SiteUnreachable` |
`SiteUnreachable` is distinguished from `OperationFailed` because central is a mirror — a relay that never reached the site is a transient transport condition, not an operation failure. The UI surfaces "site unreachable" so operators know to retry once the site is back online.
The corrected cached-call state flows back to central via the normal telemetry path after the site applies the action. Central never writes the `SiteCalls` row to reflect a relay outcome directly.
### KPI computation
KPIs are computed point-in-time against the `SiteCalls` table by `ISiteCallAuditRepository.ComputeKpisAsync` and `ComputePerSiteKpisAsync`. All aggregation is server-side; no rows are materialised. The actor derives the cutoff timestamps from `SiteCallAuditOptions` before calling the repository:
```csharp
private void HandleKpi(SiteCallKpiRequest request)
{
var sender = Sender;
var now = DateTime.UtcNow;
var stuckCutoff = now - _options.StuckAgeThreshold;
var intervalSince = now - _options.KpiInterval;
KpiAsync(request.CorrelationId, stuckCutoff, intervalSince).PipeTo(
sender,
success: response => response,
failure: ex => new SiteCallKpiResponse(
request.CorrelationId, Success: false,
ErrorMessage: ex.GetBaseException().Message,
BufferedCount: 0, ParkedCount: 0, FailedLastInterval: 0,
DeliveredLastInterval: 0, OldestPendingAge: null, StuckCount: 0));
}
```
`SiteCallKpiSnapshot` is structurally similar to `NotificationKpiSnapshot` so the Central UI dashboard can reuse the same tile layout for both components. The shapes differ: `SiteCallKpiSnapshot` carries 6 fields (`BufferedCount`, `ParkedCount`, `FailedLastInterval`, `DeliveredLastInterval`, `OldestPendingAge`, `StuckCount`), while `NotificationKpiSnapshot` carries 5 (`QueueDepth`, `StuckCount`, `ParkedCount`, `DeliveredLastInterval`, `OldestPendingAge`) — `BufferedCount` replaces `QueueDepth` and `FailedLastInterval` is an addition with no counterpart in the notification shape.
## Usage
The actor accepts only Akka messages — there is no public API beyond the message protocol defined in Commons. The Central UI's Site Calls page sends `SiteCallQueryRequest` / `SiteCallKpiRequest` / `PerSiteSiteCallKpiRequest` / `SiteCallDetailRequest` through `CommunicationService`, which Asks the singleton and awaits `SiteCallQueryResponse` / `SiteCallKpiResponse` / `PerSiteSiteCallKpiResponse` / `SiteCallDetailResponse`.
The ingest path is driven by `AuditLogIngestActor.OnCachedTelemetryAsync`, which writes both the `AuditLog` row and the `SiteCalls` upsert directly inside a single EF transaction — no message is sent to `SiteCallAuditActor` on this path. Both writes succeed or both roll back; neither component needs to coordinate with the other after the transaction commits.
Registration is via `ServiceCollectionExtensions.AddSiteCallAudit`, which binds `SiteCallAuditOptions` from the `ScadaBridge:SiteCallAudit` configuration section. The actor `Props` and the `ClusterSingletonManager` registration are wired in the Host's central-role composition.
## Configuration
`SiteCallAuditOptions` is bound from the `ScadaBridge:SiteCallAudit` section.
| Key | Default | Description |
|---|---|---|
| `StuckAgeThreshold` | `00:10:00` | Age past which a non-terminal row is counted as stuck. Display-only; no escalation. |
| `KpiInterval` | `00:01:00` | Trailing window for `DeliveredLastInterval` and `FailedLastInterval` KPIs. |
| `RelayTimeout` | `00:00:10` | Ask timeout for the central→site Retry/Discard relay. Must be less than `CommunicationOptions.QueryTimeout` (default 30 s) so the inner relay times out first and returns the distinct `SiteUnreachable` outcome. |
## Dependencies & Interactions
- [Commons (#16)](./Commons.md) — owns `SiteCall`, `SiteCallOperational`, `TrackedOperationId`, `SiteCallAuditOptions`-adjacent types (`SiteCallKpiSnapshot`, `SiteCallSiteKpiSnapshot`, `SiteCallQueryFilter`, `SiteCallPaging`), all message contracts (`UpsertSiteCallCommand`, `UpsertSiteCallReply`, `SiteCallQueryRequest/Response`, `SiteCallDetailRequest/Response`, `SiteCallKpiRequest/Response`, `PerSiteSiteCallKpiRequest/Response`, `RetrySiteCallRequest/Response`, `DiscardSiteCallRequest/Response`, `SiteCallRelayOutcome`), and the `ISiteCallAuditRepository` interface.
- [Configuration Database (#17)](./ConfigurationDatabase.md) — implements `ISiteCallAuditRepository` against the central `dbo.SiteCalls` table. Central hosts must call `AddConfigurationDatabase` for the actor to resolve its scoped repository.
- [Audit Log (#23)](./AuditLog.md) — shares the `CachedCallTelemetry` packet. `AuditLogIngestActor.OnCachedTelemetryAsync` writes the `AuditLog` row and the `SiteCalls` upsert directly in a single MS SQL transaction; it does not send a message to this actor on that path. The two components share a database transaction, not a message exchange.
- [Central–Site Communication (#5)](./Communication.md) — the `CentralCommunicationActor` is the transport the relay handlers use. It is registered via `RegisterCentralCommunication` by the Host after both actors are running. `CommunicationService` also provides the async wrappers (`RetrySiteCallAsync`, `DiscardSiteCallAsync`) that the Central UI calls; those methods Ask the `SiteCallAuditActor` with the outer `CommunicationOptions.QueryTimeout`.
- [Store-and-Forward Engine (#6)](./StoreAndForward.md) — site-side executor of `RetryParkedOperation` and `DiscardParkedOperation`. The site's S&F buffer is the source of truth for parked cached calls; it emits updated telemetry after applying an operator action.
- [Central UI (#9)](./CentralUI.md) — the `Health.razor` page (`SiteCallKpiTiles` component) consumes `SiteCallKpiResponse` to surface buffered count, parked count, stuck count, and throughput KPI tiles on the health dashboard alongside the Notification Outbox tiles; the Site Calls page queries this actor for the paginated list, detail modal, and KPIs and issues Retry/Discard actions that flow through `CommunicationService` to the relay handlers here.
- [Cluster Infrastructure (#13)](./ClusterInfrastructure.md) — hosts the `SiteCallAuditActor` singleton with active/standby failover via `ClusterSingletonManager`.
## Troubleshooting
### Relay returns `SiteUnreachable`
The `CentralCommunicationActor` could not route the command to the site — the site is offline, the `ClusterClient` route has not yet resolved, or the relay timed out waiting for a `ParkedOperationActionAck`. The `_options.RelayTimeout` (default 10 s) is the inner Ask timeout. The action was NOT applied. Retry the operator action once the site is back online; the `SiteCalls` mirror row will self-correct via telemetry after the site applies it.
### Relay returns `NotParked`
The site was reached but reported no parked row for the given `TrackedOperationId`. The call was likely already delivered, discarded, or transitioned out of `Parked` status between the operator clicking Retry/Discard and the relay arriving. No action is required; the telemetry will update the mirror row shortly.
### Upsert replied `Accepted=false`
The actor caught a repository exception and replied false to the caller without rethrowing. The central singleton remains alive. Check the structured log for a `SiteCallAudit upsert failed for {TrackedOperationId}` error with the exception detail. If the MS SQL configuration database is temporarily unavailable, the telemetry sender will retry on its next cycle (the at-least-once gRPC transport) or the future reconciliation pull will backfill the row.
### `SiteCalls` rows not appearing
Ingest flows through `AuditLogIngestActor.OnCachedTelemetryAsync`, which writes the `AuditLog` row and `SiteCalls` upsert directly in one EF transaction. If that transaction fails, neither row is written. Check `AuditLog` ingest health first — a missing `AuditLog` row for the same `TrackedOperationId` confirms the telemetry never reached central, not that the `SiteCalls` upsert failed in isolation.
## Related Documentation
- [Site Call Audit design specification](../requirements/Component-SiteCallAudit.md)
- [Audit Log](./AuditLog.md)
- [Notification Outbox](./NotificationOutbox.md)
- [Configuration Database](./ConfigurationDatabase.md)
- [Central–Site Communication](./Communication.md)
- [Store-and-Forward Engine](./StoreAndForward.md)
- [Commons](./Commons.md)
- [Health Monitoring](./HealthMonitoring.md)
+234
View File
@@ -0,0 +1,234 @@
# Site Event Logging
The Site Event Logging component records operational events at each site cluster into a local SQLite database. Events are written by site actors on a fire-and-forget basis and are available for remote query from central, providing a diagnostic window into site runtime activity without coupling subsystems to a central store.
## Overview
Site Event Logging (#12) is a site-only write path that runs alongside the operational subsystems it observes. Unlike the Audit Log (#23), which spans the script trust boundary and flows to a central append-only table, the site event log is a local diagnostic store: it captures events that are useful for troubleshooting runtime behaviour (script failures, connection flapping, deployment outcomes) but are not part of a ledger that must survive failover or node replacement.
The component code lives in `src/ZB.MOM.WW.ScadaBridge.SiteEventLogging/`:
- `SiteEventLogger` — the singleton write path: one owned `SqliteConnection` behind a shared write lock, fed by a bounded `Channel<T>` so actor threads never block on disk I/O.
- `EventLogQueryService` — executes queries against the local SQLite, filtering and paginating results for central requests.
- `EventLogHandlerActor` — Akka actor bridge that receives `EventLogQueryRequest` messages from the `SiteCommunicationActor` and returns `EventLogQueryResponse`.
- `EventLogPurgeService` — `BackgroundService` that enforces time-based retention and the storage cap on a configurable interval.
- `SiteEventLogOptions` — options class bound from `ScadaBridge:SiteEventLog`.
The DI entry point is `ServiceCollectionExtensions.AddSiteEventLogging`, registered on site nodes by `SiteServiceRegistration`. `EventLogHandlerActor` is wired separately as a cluster singleton inside `AkkaHostedService` because it must be created inside the `ActorSystem`.
## Key Concepts
### Active-node-only writes
Only the active site node generates and stores events. The standby's local SQLite receives no writes, so purging there is unnecessary. `EventLogPurgeService` consults an optional `SiteEventLogActiveNodeCheck` delegate on each tick and early-exits when the delegate returns `false`. The delegate is an optional seam: `AddSiteEventLogging` resolves it via `sp.GetService<SiteEventLogActiveNodeCheck>()`, so the service compiles and runs without it. The Host does **not** currently register the delegate, so `GetService` returns `null` and the constructor defaults to `() => true`. As a result, the purge currently runs on every tick on both nodes, which preserves the pre-cluster behaviour.
On failover, the newly active node starts logging to its own SQLite database. Historical events from the previous active node are not queryable until that node comes back online. This is acceptable because event logs are diagnostic, not transactional — a missing log tail after failover is not a data-integrity concern.
### Event types and severity
`ISiteEventLogger.LogEventAsync` accepts a free-form `eventType` string and one of three case-sensitive `severity` values: `"Info"`, `"Warning"`, or `"Error"`. Unknown severities are rejected at write time — the allowed set is enforced by a `HashSet<string>` with `StringComparer.Ordinal`, matching the SQLite `BINARY` collation used by query filters so a stored value is never invisible to a later query.
The `event_type` values used across site subsystems are: `script`, `alarm`, `deployment`, `connection`, `store_and_forward`, `instance_lifecycle`.
### Non-blocking write path
`LogEventAsync` validates its arguments and enqueues a `PendingEvent` onto a bounded `Channel<T>`. The background writer loop drains it sequentially against the shared connection. The returned `Task` completes once the event is durably persisted and faults if the write fails, so a caller that awaits it can detect a dropped event. The caller is never blocked on disk I/O.
### Keyset pagination
Queries use keyset pagination: the caller supplies a nullable `ContinuationToken` (the `id` of the last row in the previous page), and the query appends `id > $afterId` so each page starts exactly after the previous one with no row-skipping or re-scanning. The response includes a new `ContinuationToken` and a `HasMore` flag.
## Architecture
### Schema and indexes
`SiteEventLogger.InitializeSchema` sets `PRAGMA auto_vacuum = INCREMENTAL` before creating the table — this is required before any table exists for the mode to take effect, and it allows `PRAGMA incremental_vacuum` to reclaim free pages during cap-purge batches:
```csharp
cmd.CommandText = """
CREATE TABLE IF NOT EXISTS site_events (
id INTEGER PRIMARY KEY AUTOINCREMENT,
timestamp TEXT NOT NULL,
event_type TEXT NOT NULL,
severity TEXT NOT NULL,
instance_id TEXT,
source TEXT NOT NULL,
message TEXT NOT NULL,
details TEXT
);
CREATE INDEX IF NOT EXISTS idx_events_timestamp ON site_events(timestamp);
CREATE INDEX IF NOT EXISTS idx_events_type ON site_events(event_type);
CREATE INDEX IF NOT EXISTS idx_events_instance ON site_events(instance_id);
CREATE INDEX IF NOT EXISTS idx_events_severity ON site_events(severity);
""";
```
Keyword search (`KeywordFilter`) runs as `LIKE '%…%' ESCAPE '\'` on `message` and `source`. A leading-wildcard `LIKE` cannot use a B-tree index, so keyword-only queries full-scan the table. All other filters (`event_type`, `severity`, `instance_id`, `timestamp`) are covered by the indexes above.
### Connection lock
`SiteEventLogger` owns one `SqliteConnection` that is not thread-safe. Every database access — writes from the background loop, reads from `EventLogQueryService`, deletes from `EventLogPurgeService` — must go through `WithConnection`, which serialises callers on a shared lock:
```csharp
internal bool WithConnection(Action<SqliteConnection> action)
{
ArgumentNullException.ThrowIfNull(action);
lock (_writeLock)
{
if (_disposed) return false;
action(_connection);
return true;
}
}
```
`EventLogQueryService` and `EventLogPurgeService` both depend on the concrete `SiteEventLogger` rather than `ISiteEventLogger` to avoid a downcast that would throw `InvalidCastException` for any other implementation.
### Write queue and drop behaviour
The write queue is bounded at `WriteQueueCapacity` (default 10,000). Overflow uses `BoundedChannelFullMode.DropOldest`: when the queue is full, the oldest pending event is evicted, its completion `Task` is faulted with `InvalidOperationException`, and `FailedWriteCount` is incremented so the drop is observable. On any SQLite write error, `FailedWriteCount` is also incremented and the affected `Task` is faulted:
```csharp
_writeQueue = Channel.CreateBounded<PendingEvent>(
new BoundedChannelOptions(capacity)
{
SingleReader = true,
SingleWriter = false,
FullMode = BoundedChannelFullMode.DropOldest,
},
itemDropped: dropped =>
{
Interlocked.Increment(ref _failedWriteCount);
dropped.Completion.TrySetException(
new InvalidOperationException(
$"Event was dropped because the write queue exceeded its bounded capacity ({capacity})."));
});
```
### Purge: retention and storage cap
`EventLogPurgeService` runs two passes on each tick:
1. **Retention purge** — deletes all rows where `timestamp < cutoff` (cutoff = `UtcNow` minus `RetentionDays`). A single `DELETE` statement; no batching needed.
2. **Storage cap purge** — if the logical database size exceeds `MaxStorageMb`, deletes the oldest 1,000 rows per batch and calls `PRAGMA incremental_vacuum` after each batch to reclaim free pages. The loop stops when the size is within the cap, when no rows are deleted, or when the size fails to decrease across a batch (guards against a scenario where vacuuming cannot shrink the file):
```csharp
cmd.CommandText = $"""
DELETE FROM site_events WHERE id IN (
SELECT id FROM site_events ORDER BY id ASC LIMIT {CapPurgeBatchSize}
)
""";
var rows = cmd.ExecuteNonQuery();
using var vacuumCmd = connection.CreateCommand();
vacuumCmd.CommandText = "PRAGMA incremental_vacuum";
vacuumCmd.ExecuteNonQuery();
```
Logical size is measured as `(page_count - freelist_count) × page_size` so the cap loop observes reclaimed pages even before they are returned to the OS.
A purge runs once on `BackgroundService` startup and then on each `PurgeInterval` tick.
### Central query path
Central queries arrive via the `SiteCommunicationActor`, which dispatches `EventLogQueryRequest` messages to the `EventLogHandlerActor` cluster singleton. The actor delegates immediately to `IEventLogQueryService.ExecuteQuery` and returns the `EventLogQueryResponse` to the sender synchronously. Because the handler is synchronous, the actor processes one query at a time and its message loop is occupied while the read runs under the shared lock:
```csharp
public class EventLogHandlerActor : ReceiveActor
{
public EventLogHandlerActor(IEventLogQueryService queryService)
{
_queryService = queryService;
Receive<EventLogQueryRequest>(msg =>
{
var response = _queryService.ExecuteQuery(msg);
Sender.Tell(response);
});
}
}
```
`EventLogQueryService` clamps the caller-supplied `PageSize` to `MaxQueryPageSize` (default 500) before building the query, so a central client that requests `int.MaxValue` cannot force the query to materialise the entire log into one list while holding the write lock.
## Usage
Callers resolve `ISiteEventLogger` from DI. Because the write is non-blocking and best-effort, site actors discard the returned `Task` with `_ =` rather than awaiting it on the hot path:
```csharp
// ScriptExecutionActor — reporting a script failure
_ = siteEventLogger?.LogEventAsync(
"script", "Error", instanceName, $"ScriptActor:{scriptName}", errorMsg, ex.ToString());
// DataConnectionActor — reporting a connection loss
_ = _siteEventLogger.LogEventAsync(
"connection", "Warning", null, _connectionName,
$"Connection lost — entering reconnect cycle", null);
// DataConnectionActor — reporting a reconnection
_ = _siteEventLogger.LogEventAsync(
"connection", "Info", null, _connectionName,
$"Connection restored on {_activeEndpoint} endpoint", null);
```
The `source` argument uses the convention `"ActorType:Name"` (e.g. `"ScriptActor:MonitorSpeed"`, `"DataConnectionActor:PLC1"`). The `details` field carries any supplemental context — stack traces, compiler output, thresholds — as free-form text; JSON is conventional but not validated.
Callers that need to confirm persistence — rare in production, common in tests — can await the returned `Task` and handle a faulted result.
## Configuration
Options are bound from the `ScadaBridge:SiteEventLog` section by `SiteServiceRegistration`.
| Key | Default | Description |
|-----|---------|-------------|
| `RetentionDays` | `30` | Days before events are deleted by the retention purge. |
| `MaxStorageMb` | `1024` | Maximum logical database size in MB. Oldest rows are deleted in 1,000-row batches when exceeded. |
| `DatabasePath` | `site_events.db` | File path for the SQLite database. |
| `QueryPageSize` | `500` | Default page size when the caller does not supply one. |
| `MaxQueryPageSize` | `500` | Hard upper bound on caller-supplied page sizes. Silent clamp. |
| `PurgeInterval` | `24h` (`TimeSpan`) | Interval between purge ticks. An initial purge also runs on service startup. |
| `WriteQueueCapacity` | `10000` | Bounded write-queue capacity. Overflow evicts oldest with `DropOldest`. |
The docker cluster appsettings (`ScadaBridge:SiteEventLog`) sets `RetentionDays: 30` and `MaxStorageMb: 1024`, matching the code defaults. `PurgeScheduleCron` appears in those files as a vestigial key; the actual purge cadence is driven by `PurgeInterval` in the options class, not a cron expression.
## Dependencies & Interactions
- [Commons (#16)](./Commons.md) — defines the `EventLogQueryRequest` / `EventLogQueryResponse` / `EventLogEntry` message contracts in `ZB.MOM.WW.ScadaBridge.Commons.Messages.RemoteQuery`, shared across the site query path and the central dispatch path (`QueryEventLogsCommand`).
- [Central–Site Communication (#5)](./Communication.md) — the `SiteCommunicationActor` dispatches inbound `EventLogQueryRequest` messages to `EventLogHandlerActor` and carries the `EventLogQueryResponse` back to central. The query timeout is 30 s.
- [Site Runtime (#3)](./SiteRuntime.md) — `ScriptActor` and `ScriptExecutionActor` log `script`-type events: trigger expression failures, script execution errors, and timeouts. `ISiteEventLogger` is resolved from DI inside execution actors.
- [Data Connection Layer (#4)](./DataConnectionLayer.md) — `DataConnectionActor` logs `connection`-type events: connection loss, reconnection, and endpoint failover. `DataConnectionManagerActor` may also log connection-category events.
- [Store-and-Forward Engine (#6)](./StoreAndForward.md) — logs `store_and_forward`-type events on the site→central notification forward path (forward failures, long-buffered notifications). Routine enqueue and forward-success events are not logged; central's `Notifications` table is the authoritative record.
- [Host (#15)](./Host.md) — `SiteServiceRegistration` calls `AddSiteEventLogging` and binds `SiteEventLogOptions`. `AkkaHostedService` wires `EventLogHandlerActor` as a cluster singleton scoped to `"site-{SiteId}"`. The `SiteEventLogActiveNodeCheck` delegate is an optional seam defined in `SiteEventLogging` for the Host to register when it wants to gate the purge to the active node only; the Host does not currently register it, so the purge defaults to always-active and runs on every node.
- [Audit Log (#23)](./AuditLog.md) — a distinct component. The Audit Log captures every trust-boundary action (outbound API calls, DB writes, notifications, inbound API) and flows to a central append-only table with monthly partitioning and 365-day retention. The site event log captures internal runtime diagnostics (failures, state transitions) locally with 30-day retention. The two stores are complementary, not overlapping.
- [Site Call Audit (#22)](./SiteCallAudit.md) — a distinct component. Site Call Audit mirrors cached-call operational status in the central `SiteCalls` table via gRPC telemetry. Site Event Logging has no role in that flow.
## Troubleshooting
### Write failures are observable but not propagated
A SQLite write failure increments `FailedWriteCount` on `ISiteEventLogger`, logs an error via `ILogger`, and faults the returned `Task`. The calling actor discards the `Task` on the hot path (`_ = logger?.LogEventAsync(…)`), so the failure does not surface to the actor's message loop. `FailedWriteCount` is available for Health Monitoring integration but is not yet wired to the health surface; a non-zero count indicates disk pressure, a full queue, or a corrupt database file.
### Queue overflow drops oldest events
When the site write queue fills (sustained disk slowness or very high event rates), the oldest pending event is silently evicted and `FailedWriteCount` is incremented. Recent events are preserved at the cost of older ones. Reducing event throughput or increasing `WriteQueueCapacity` addresses sustained overflow.
### Cap-purge loop terminates early
If the database size does not decrease across a cap-purge batch, the loop stops to avoid emptying the entire table. This situation should not occur with `auto_vacuum = INCREMENTAL` enabled, but the guard prevents runaway deletion if vacuuming regresses. A `Warning` log line reporting the stable size is the signal to investigate filesystem-level free-page reclamation.
### Central query returns stale data after failover
After a site failover, the new active node's event log starts empty. Central queries will see no events until the new node generates them. This is by design — event logs are not replicated. Historical events from the previous active node return when that node comes back online and responds to queries.
## Related Documentation
- [Site Event Logging design specification](../requirements/Component-SiteEventLogging.md)
- [Audit Log](./AuditLog.md)
- [Site Call Audit](./SiteCallAudit.md)
- [Central–Site Communication](./Communication.md)
- [Site Runtime](./SiteRuntime.md)
- [Data Connection Layer](./DataConnectionLayer.md)
- [Store-and-Forward Engine](./StoreAndForward.md)
- [Host](./Host.md)
- [Commons](./Commons.md)
+312
View File
@@ -0,0 +1,312 @@
# Site Runtime
The Site Runtime component runs the site-side actor hierarchy that executes deployed machine instances: it owns script compilation, alarm evaluation, native alarm mirroring, and the site-wide Akka stream that carries attribute value and alarm state changes to every subscriber.
## Overview
Site Runtime (#3) operates exclusively on site clusters. Its entry point is the `DeploymentManagerActor` cluster singleton, which re-creates the full actor hierarchy on every site startup or failover. Each deployed enabled instance gets an `InstanceActor` child; each `InstanceActor` spawns `ScriptActor` and `AlarmActor` coordinator children, plus a `NativeAlarmActor` peer for every configured native alarm source. Script invocations spawn short-lived `ScriptExecutionActor` children; alarm on-trigger invocations spawn short-lived `AlarmExecutionActor` children.
The component code lives in `src/ZB.MOM.WW.ScadaBridge.SiteRuntime/`:
- `Actors/` — `DeploymentManagerActor`, `InstanceActor`, `ScriptActor`, `ScriptExecutionActor`, `AlarmActor`, `AlarmExecutionActor`, `NativeAlarmActor`, `SiteReplicationActor`.
- `Scripts/` — `ScriptCompilationService`, `ScriptExecutionScheduler`, `SharedScriptLibrary`, `ScriptRuntimeContext`, `ScopeAccessors`, `TriggerExpressionGlobals`.
- `Streaming/` — `SiteStreamManager` (the site-wide Akka broadcast stream).
- `Persistence/` — `SiteStorageService` (raw SQLite via `Microsoft.Data.Sqlite`), `SiteStorageInitializer`.
- `Repositories/` — `SiteExternalSystemRepository`, `SiteNotificationRepository`.
- `Tracking/` — `OperationTrackingStore`, `OperationTrackingOptions`.
`ServiceCollectionExtensions.AddSiteRuntime(connectionString)` registers all singletons; the `Host` calls it and wires the `DeploymentManagerActor` cluster singleton separately via `AkkaHostedService`.
## Key Concepts
### Cluster singleton as the single point of authority
`DeploymentManagerActor` runs as an Akka.NET cluster singleton — guaranteed to be active on exactly one site node at a time. On failover, Akka restarts the singleton on the surviving node. Because all deployment commands from central are routed through the singleton, there is never a split-brain dispute over which node owns instance lifecycle: the singleton is the only actor that calls `Context.ActorOf` for `InstanceActor` children.
### Staggered startup
The singleton reads all deployed configurations from SQLite in `PreStart`, compiles shared scripts off-thread, and then creates `InstanceActor` children in batches. The default batch size is 20, with a 100 ms delay between batches (`StartupBatchSize` and `StartupBatchDelayMs` in `SiteRuntimeOptions`). Without staggering, 500 instances each subscribing to OPC UA tags simultaneously would produce a reconnection storm that overwhelms the OPC UA server.
### Actor supervision and lifetimes
| Actor | Kind | Supervises children with | On exception |
|---|---|---|---|
| `DeploymentManagerActor` | long-lived singleton | `OneForOneStrategy` | Resume (coord) / Stop (init failure) |
| `InstanceActor` | long-lived per instance | `OneForOneStrategy` | Resume for all coordinator children |
| `ScriptActor` | long-lived coordinator | `OneForOneStrategy` | Stop execution child, keep self |
| `AlarmActor` | long-lived coordinator | `OneForOneStrategy` | Stop execution child, keep self |
| `NativeAlarmActor` | long-lived coordinator | — | Supervised by Instance Actor (Resume) |
| `ScriptExecutionActor` | short-lived per invocation | — | Stops itself; parent logs failure |
| `AlarmExecutionActor` | short-lived per invocation | — | Stops itself; parent logs failure |
Coordinator actors resume on exception because their in-memory state (trigger timers, last execution time, alarm level) must survive child crashes. Short-lived execution actors stop themselves on completion or exception — the coordinator remains available for the next trigger.
### Dedicated script-execution dispatcher
Script and alarm on-trigger bodies run on the `ScriptExecutionScheduler` (`SiteRuntime-009`): a custom `TaskScheduler` backed by a bounded set of dedicated threads (default 8, `ScriptExecutionThreadCount`). The script body is submitted to this scheduler via `Task.Factory.StartNew(..., scheduler)` inside `ScriptExecutionActor` and `AlarmExecutionActor`. Scripts that block on I/O (database connections, synchronous external system calls) block only the scheduler's threads, leaving the shared .NET thread pool and all Akka dispatchers unaffected.
### Tell vs. Ask
- **Tell**: tag value updates, `AttributeValueChanged` fan-out to child Script/Alarm actors, stream publishing. These are high-frequency or fire-and-forget paths.
- **Ask**: `Instance.CallScript()` (caller needs the return value), debug view snapshots, attribute reads from the Inbound API. Ask is reserved for cross-boundary, low-frequency calls.
### Attribute serialization through the Instance Actor
All in-memory state mutations (attribute values, qualities, alarm states) run inside `InstanceActor`'s mailbox. Multiple `ScriptExecutionActor` instances may run concurrently but all `SetAttribute` calls serialize through the `InstanceActor` mailbox, preventing race conditions. Concurrent script executions may interleave external side effects (HTTP calls, database writes, notifications); those are independent and intentionally not serialized.
## Architecture
### Actor hierarchy
```text
DeploymentManagerActor (Akka.NET cluster singleton)
└── InstanceActor "MachineA-001"
├── ScriptActor "MonitorSpeed" (coordinator)
│ └── ScriptExecutionActor (short-lived, per invocation)
├── ScriptActor "CalculateOEE" (coordinator)
│ └── ScriptExecutionActor (short-lived)
├── AlarmActor "OverTemp" (coordinator, computed)
│ └── AlarmExecutionActor (short-lived, on-trigger)
├── AlarmActor "LowPressure" (coordinator, computed)
└── NativeAlarmActor "OpcUaServer1" (read-only mirror, peer to AlarmActor)
```
`NativeAlarmActor` is a sibling of `AlarmActor` — a peer under the same `InstanceActor` parent. It is not a child of `AlarmActor` and has no relationship to the script engine.
### Deployment flow
Central sends a `DeployInstanceCommand` carrying a JSON `FlattenedConfiguration` to the site singleton. The singleton:
1. Calls `EnsureDclConnections` to push any new or changed connection definitions to the DCL manager (hash-guarded: unchanged configs are skipped).
2. Calls `CreateInstanceActor`, which does `Context.ActorOf(props, instanceName)`.
3. Runs an off-thread `Task` that calls `SiteStorageService.StoreDeployedConfigAsync`, clears static overrides and native alarm state, and — if `_replicationActor` is non-null (it is optional and null in isolated deployments/tests) — tells `SiteReplicationActor` to push to the peer node.
4. Pipes back a `DeployPersistenceResult`; only on success does it tell the deployer `DeploymentStatus.Success`. If persistence fails, the optimistically-created actor is stopped and the error is returned to central (`SiteRuntime-005`).
For redeployment (instance already running), the existing actor is stopped and watched:
```csharp
// DeploymentManagerActor.HandleDeploy
if (_instanceActors.TryGetValue(instanceName, out var existing))
{
_instanceActors.Remove(instanceName);
_pendingRedeploys[existing] = new PendingRedeploy(command, Sender);
_terminatingActorsByName[instanceName] = existing;
Context.Watch(existing);
Context.Stop(existing);
return;
}
```
The `Terminated` signal fires once the previous actor and its entire subtree have stopped (freeing the actor name), and only then does `ApplyDeployment` run for the replacement. A third deploy arriving mid-termination overwrites the buffered `PendingRedeploy` (last-write-wins) and tells the displaced sender a `Failed-superseded` response (`SiteRuntime-020`).
### Instance Actor initialization
On `PreStart`, `InstanceActor`:
1. Fires `SiteStorageService.GetStaticOverridesAsync` asynchronously and pipes the result to self as a `LoadOverridesResult` — this is non-blocking; the message arrives later in the mailbox.
2. Calls `CreateChildActors()` **immediately** (before the override message arrives). `CreateChildActors` snapshots `_attributes` (the live dictionary, seeded from flattened-config defaults) into `attributeSnapshot` before any child constructor runs. Each child's `Props` closure captures the immutable snapshot, not the live dictionary — preventing the race condition described in `SiteRuntime-017`. Because the override load is asynchronous, children are created from the un-overridden defaults; when the `LoadOverridesResult` message is subsequently processed, `HandleOverridesLoaded` applies the persisted overrides on top of the live `_attributes` dictionary.
3. Calls `SubscribeToDcl()`, grouping data-sourced attributes by connection name and sending `SubscribeTagsRequest` to the DCL manager. Tag paths are stored in `_tagPathToAttributes`, a `Dictionary<string, List<string>>`, because one physical tag can back more than one attribute canonical name.
Data-sourced attributes start with quality `Uncertain` until the first `TagValueUpdate` arrives; static attributes start with quality `Good`.
### Script compilation
`ScriptCompilationService.Compile(name, code)` first runs `ValidateTrustModel`, which uses Roslyn semantic analysis (not substring scanning) to detect references to forbidden namespaces: `System.IO`, `System.Diagnostics.Process`, `System.Threading` (except `Tasks`/`CancellationToken`), `System.Reflection`, `System.Net.Sockets`, and `System.Net.Http`. Only after passing trust validation does it call `CSharpScript.Create<object?>` with the restricted `ScriptOptions` (references capped to `object`, `Enumerable`, `Math`, `CSharpArgumentInfo`, and `DynamicJsonElement` assemblies).
```csharp
// ScriptCompilationService.CompileCore
var violations = ValidateTrustModel(code);
if (violations.Count > 0)
return ScriptCompilationResult.Failed(violations);
var script = CSharpScript.Create<object?>(
code,
BuildScriptOptions(),
globalsType: globalsType);
var diagnostics = script.Compile();
```
`CompileTriggerExpression(name, expression)` follows the same path but uses `TriggerExpressionGlobals` as the globals type instead of `ScriptGlobals` — trigger expressions are read-only and have no access to the script runtime API.
### Shared script library
`SharedScriptLibrary` holds a `Dictionary<string, Script<object?>>` under a `lock`. It is populated at startup (off-thread by the singleton, piped back as `SharedScriptsLoaded`) and updated live when artifact deployments arrive carrying new shared scripts. Calling `Scripts.CallShared("name", params)` inside a script calls `SharedScriptLibrary.ExecuteAsync`, which runs the compiled delegate inline (no actor message, no serialization); the caller awaits the result.
### Script Actor triggers
`ScriptActor` parses the `ResolvedScript.TriggerType` and `TriggerConfiguration` into a discriminated union (`IntervalTriggerConfig`, `ValueChangeTriggerConfig`, `ConditionalTriggerConfig`, `ExpressionTriggerConfig`). Interval triggers use Akka `ITimerScheduler`. Value-change and conditional triggers react to `AttributeValueChanged` messages forwarded by the Instance Actor. Expression triggers maintain an `_attributeSnapshot` dictionary kept current by every `AttributeValueChanged`, then evaluate the compiled `_compiledTriggerExpression` synchronously with a 2-second `CancellationTokenSource` timeout.
`WhileTrue` mode is handled by `HandleWhileTrueTransition`:
```csharp
// ScriptActor.HandleWhileTrueTransition
private void HandleWhileTrueTransition(bool nowTrue, bool wasTrue)
{
if (nowTrue && !wasTrue)
{
TrySpawnExecution(null);
StartWhileTrueTimer();
}
else if (!nowTrue && wasTrue)
{
StopWhileTrueTimer();
}
}
```
On the false→true edge the script fires once and a periodic re-fire timer starts at `MinTimeBetweenRuns` cadence; on the true→false edge the timer stops.
### Site-wide Akka stream
`SiteStreamManager` materializes a broadcast hub in `Initialize(ActorSystem)`, called by the Host after Akka starts. The hub is fed by a `Source.ActorRef<ISiteStreamEvent>` (bounded with `OverflowStrategy.DropHead`). `InstanceActor` publishes via `_streamManager?.PublishAttributeValueChanged(changed)` and `PublishAlarmStateChanged(changed)` — both are Tell calls; they never block the actor.
Each subscriber (typically a `StreamRelayActor` created by the Communication Layer's `SiteStreamGrpcServer`) gets its own materialized sub-graph with an independent `Buffer(_bufferSize, DropHead)` and a `KillSwitch`. A slow subscriber drops only its own events; it cannot stall other subscribers or the publishing Instance Actor.
```csharp
// SiteStreamManager.Subscribe
var killSwitch = _hubSource
.Where(ev => ev.InstanceUniqueName == capturedInstance)
.Buffer(_bufferSize, OverflowStrategy.DropHead)
.ViaMaterialized(KillSwitches.Single<ISiteStreamEvent>(), Keep.Right)
.To(Sink.ForEach<ISiteStreamEvent>(ev => capturedSubscriber.Tell(ev)))
.Run(_materializer);
```
### Native alarm mirror
`NativeAlarmActor` mirrors the condition state of one source binding — an OPC UA A&C server or MxAccess Gateway connection — without writing back to the source. Each condition is keyed by `SourceReference`.
On `PreStart` it concurrently kicks off two operations: the SQLite rehydration (`GetNativeAlarmsAsync`, piped back as `RehydrationCompleted`) and a `SubscribeAlarmsRequest` to the DCL manager — the subscribe is sent before the async rehydration completes and is not gated on it. The DCL forwards the subscribe request to the connection's `IAlarmSubscribableConnection` implementation.
Transition handling:
- `AlarmTransitionKind.Snapshot` accumulates into `_snapshotBuffer`.
- `AlarmTransitionKind.SnapshotComplete` atomically swaps `_alarms` with `_snapshotBuffer`. Conditions absent from the snapshot emit return-to-normal events and drop from the mirror — the mechanism that reconciles state after a reconnect.
- Live transitions (`Raise`, `Ack`, `Clear`, etc.) upsert by `SourceReference`, ignoring transitions older than the held `TransitionTime` (out-of-order protection).
Retention: a condition that is both inactive and acknowledged (`!Active && Acknowledged`) is dropped from the mirror and its SQLite row deleted. If the mirror exceeds `MirroredAlarmCapPerSource` (default 1000), the oldest condition is dropped and logged. Persistence is fire-and-forget — a write failure is logged but never blocks the actor or suppresses the upward `AlarmStateChanged` emit.
### Enriched `AlarmStateChanged`
Both `AlarmActor` and `NativeAlarmActor` tell the `InstanceActor` an `AlarmStateChanged`. The message was extended additively so existing computed-alarm consumers continue to work unchanged:
| Field | Computed alarm | Native alarm |
|---|---|---|
| `Kind` | `AlarmKind.Computed` | `AlarmKind.NativeOpcUa` or `NativeMxAccess` |
| `Condition` | computed default (auto-acknowledged, `Severity = Priority`) | mirrored `AlarmConditionState` from source |
| `SourceReference`, `AlarmTypeName`, `Category`, `Message`, `OperatorUser`, `OperatorComment`, `OriginalRaiseTime`, `CurrentValue`, `LimitValue` | empty/null | populated from source transition |
`InstanceActor` stores the latest enriched event per alarm name in `_latestAlarmEvents`. The Debug View snapshot uses this map so native alarm metadata reaches the central debug view.
### Local SQLite schema
`SiteStorageService` owns the site database (raw `Microsoft.Data.Sqlite`, not EF Core). Tables created by `InitializeAsync`:
| Table | Purpose | Reset on redeploy? |
|---|---|---|
| `deployed_configurations` | Persisted flattened configs (survives restart/failover) | No (replaced) |
| `static_attribute_overrides` | Runtime attribute writes (`SetAttribute` on static attrs) | Yes — cleared by `ClearStaticOverridesAsync` |
| `native_alarm_state` | Mirrored native alarm conditions (survives failover) | Yes — cleared by `ClearNativeAlarmsForInstanceAsync` |
| `shared_scripts` | Shared script code from artifact deployments | No |
| `external_systems` | External system definitions | No |
| `database_connections` | Database connection strings | No |
| `data_connection_definitions` | OPC UA / MxGateway endpoint definitions | No |
| `notification_lists` | Notification list definitions | No |
| `smtp_configurations` | SMTP configuration (from artifact deployment) | No |
### Standby replication
`SiteReplicationActor` runs on every site node (not a singleton). The active node's `DeploymentManagerActor` tells it `ReplicateConfigDeploy`, `ReplicateConfigRemove`, `ReplicateConfigSetEnabled`, `ReplicateArtifacts`, or `ReplicateStoreAndForward`. The replication actor tracks the peer node via Akka cluster membership events and forwards each command to `/user/site-replication` on the peer via `ActorSelection`. Replication is fire-and-forget (no ack wait per design), so a failed write to the standby is logged but does not fail the primary operation.
## Usage
### Lifecycle commands
Central sends commands to the site `DeploymentManagerActor` singleton over the Communication Layer:
| Command | Effect |
|---|---|
| `DeployInstanceCommand` | Create or replace the instance actor; persist config to SQLite; clear static overrides and native alarm state |
| `DisableInstanceCommand` | Stop the instance actor; set `is_enabled = 0` in SQLite; retain config for re-enable |
| `EnableInstanceCommand` | Create a new instance actor from the stored config |
| `DeleteInstanceCommand` | Stop the instance actor; remove config from SQLite; store-and-forward messages are NOT cleared |
| `DeployArtifactsCommand` | Persist shared scripts, external system definitions, database connections, notification lists, data connection definitions; recompile shared scripts; push data connections to DCL |
### Script API surface
Scripts run inside `ScriptExecutionActor` with a `ScriptGlobals` object as the Roslyn host object. The `Instance` global is a `ScriptRuntimeContext`. Convenience top-level aliases (`ExternalSystem`, `Database`, `Notify`, `Scripts`, `Attributes`, `Children`, `Parent`) delegate to context methods. Key calls:
- `Instance.GetAttribute("name")` / `Instance.SetAttribute("name", value)` — Ask to `InstanceActor` for write, in-process for read.
- `Instance.CallScript("scriptName", params)` — Ask from `ScriptExecutionActor` to sibling `ScriptActor`, which spawns a new `ScriptExecutionActor`.
- `Scripts.CallShared("name", params)` — `SharedScriptLibrary.ExecuteAsync`, inline on the current scheduler thread.
- `ExternalSystem.Call(...)` — synchronous HTTP call through `IExternalSystemClient`.
- `ExternalSystem.CachedCall(...)` / `Database.CachedWrite(...)` — store-and-forwarded; returns a `TrackedOperationId`.
- `Tracking.Status(id)` — reads the site-local `OperationTrackingStore` synchronously.
- `Notify.To("list").Send(...)` — enqueues a notification in the Store-and-Forward Engine for delivery to central.
Alarm on-trigger scripts run in `AlarmExecutionActor` with a **restricted** context: they receive an `Alarm` global (`AlarmContext` carrying `Name`, `Level`, `Priority`, `Message`) and have access to the instance/shared-script surface (`Instance.*`, `Scripts.CallShared`, `Instance.CallScript`), but **not** the external-system, database, notification, or audit integration APIs. `AlarmExecutionActor` builds its `ScriptRuntimeContext` without a `serviceProvider`, so `ExternalSystem`, `Database`, `Notify`, and audit writes are unavailable to alarm on-trigger scripts — those APIs are only resolved inside `ScriptExecutionActor` (instance scripts).
### Debug view
The Communication Layer sends `SubscribeDebugViewRequest` or `DebugSnapshotRequest` to the singleton, which forwards to the named `InstanceActor`. The Instance Actor replies with a `DebugViewSnapshot` built from the current `_attributes` dictionary and `_latestAlarmEvents` map. Ongoing changes reach the central debug view via the gRPC stream, not through the actor hierarchy.
## Configuration
All options live in the `ScadaBridge:SiteRuntime` section, bound to `SiteRuntimeOptions`:
| Key | Default | Description |
|---|---|---|
| `StartupBatchSize` | `20` | Instance Actors created per batch during staggered startup |
| `StartupBatchDelayMs` | `100` | Milliseconds between startup batches |
| `MaxScriptCallDepth` | `10` | Maximum `Instance.CallScript` / `Scripts.CallShared` recursion depth |
| `ScriptExecutionTimeoutSeconds` | `30` | Per-script body execution timeout; exceeding it cancels and logs an error |
| `StreamBufferSize` | `1000` | Per-subscriber drop-oldest buffer size for the Akka broadcast stream |
| `ScriptExecutionThreadCount` | `8` | Dedicated threads in the `ScriptExecutionScheduler` (covers both scripts and alarm on-trigger bodies) |
| `MirroredAlarmCapPerSource` | `1000` | Maximum mirrored conditions per `NativeAlarmActor` source binding before oldest is dropped and logged |
| `NativeAlarmRetryIntervalMs` | `5000` | Milliseconds before retrying a failed native alarm subscription |
The SQLite connection string is passed directly to `AddSiteRuntime(connectionString)` in the host composition root and is not part of `SiteRuntimeOptions`.
## Dependencies & Interactions
- [Data Connection Layer (#4)](./DataConnectionLayer.md) — supplies `TagValueUpdate` and `ConnectionQualityChanged` messages to `InstanceActor`; receives `SubscribeTagsRequest` and `WriteTagRequest`. Also supplies `NativeAlarmTransitionUpdate` and `NativeAlarmSourceUnavailable` to `NativeAlarmActor` via `SubscribeAlarmsRequest` (connections implementing `IAlarmSubscribableConnection`).
- [Central–Site Communication (#5)](./Communication.md) — routes `DeployInstanceCommand`, `DisableInstanceCommand`, `EnableInstanceCommand`, `DeleteInstanceCommand`, `DeployArtifactsCommand`, debug view requests, and Inbound API `RouteToCallRequest` / `RouteToGetAttributesRequest` / `RouteToSetAttributesRequest` to the singleton; receives `DeploymentStatusResponse` and `ArtifactDeploymentResponse` back. The `SiteStreamManager` implements `ISiteStreamSubscriber` so the Communication Layer's `SiteStreamGrpcServer` can subscribe `StreamRelayActor` instances to the broadcast hub.
- [Store-and-Forward Engine (#6)](./StoreAndForward.md) — `ScriptRuntimeContext` passes `StoreAndForwardService` (resolved from DI inside `ScriptExecutionActor`) for `ExternalSystem.CachedCall`, `Database.CachedWrite`, and `Notify.To().Send()`. Owns the site-local operation tracking table that `Tracking.Status(id)` reads.
- [External System Gateway (#7)](./ExternalSystemGateway.md) — `IExternalSystemClient` called by `ScriptRuntimeContext.ExternalSystemHelper` for synchronous and cached external system calls.
- [Site Event Logging (#12)](./SiteEventLogging.md) — `ISiteEventLogger` (resolved from DI inside execution actors) receives script error, alarm error, and script execution events.
- [Health Monitoring (#11)](./HealthMonitoring.md) — `ISiteHealthCollector` (injected into `DeploymentManagerActor`, `InstanceActor`, `ScriptActor`, `AlarmActor`) tracks instance counts (`SetInstanceCounts`), script errors (`IncrementScriptError`), and alarm errors (`IncrementAlarmError`); `DeploymentManagerActor` calls `SetActiveNode` in `PreStart`/`PostStop` so the health report reflects which node holds the singleton.
- [Audit Log (#23)](./AuditLog.md) — `IAuditWriter` (resolved from DI inside `ScriptExecutionActor`) receives one row per script-trust-boundary call; audit writes are best-effort and never abort the calling script.
- [Commons (#16)](./Commons.md) — owns all message contracts (`DeployInstanceCommand`, `AttributeValueChanged`, `AlarmStateChanged`, `ScriptCallRequest`, `NativeAlarmTransitionUpdate`, etc.), the `FlattenedConfiguration` / `ResolvedScript` / `ResolvedAlarm` / `ResolvedNativeAlarmSource` types, and the `AlarmKind` / `AlarmState` / `AlarmLevel` / `AlarmConditionState` / `AlarmTransitionKind` enums.
- Local SQLite — `SiteStorageService` owns the site database. Peer SQLite stores (Store-and-Forward buffer, AuditLog, operation tracking, site event log) are owned by their respective components but share the same SQLite file path convention.
- Design spec: [Component-SiteRuntime.md](../requirements/Component-SiteRuntime.md).
## Troubleshooting
### Instance stays in `Unknown` or `Failed` after deployment
`DeploymentManagerActor` only tells central `DeploymentStatus.Success` after `SiteStorageService.StoreDeployedConfigAsync` commits. If the site SQLite file is locked, full, or on a read-only volume, the persistence task throws, the optimistically-created actor is stopped, and central receives `Failed`. Check the site event log for `Failed to persist deployment` entries. The site SQLite path is configured in the host `appsettings.json`.
### Reconnection storm on failover
If many instances race to subscribe to OPC UA at once, the server may throttle or drop connections. Increase `StartupBatchDelayMs` or decrease `StartupBatchSize` in `SiteRuntimeOptions`. The current defaults (batch 20, delay 100 ms) mean a site with 200 instances starts in 10 batches and takes about 1 second to start all subscriptions, which is acceptable for most servers.
### Script execution actor backpressure
The `ScriptExecutionScheduler` has a fixed thread count (`ScriptExecutionThreadCount`, default 8). If all threads are blocked (a burst of scripts each waiting on a slow database or external system), new script invocations queue behind them. The queue is unbounded — memory usage can grow during a backlog. If this is observed, raise `ScriptExecutionThreadCount` or reduce the number of concurrent long-running scripts. Script execution timeout (`ScriptExecutionTimeoutSeconds`) bounds the worst case.
### Native alarm conditions not recovering after reconnect
`NativeAlarmActor` retains last-known conditions during a source outage (it does not clear them) and reconciles state via the reconnect snapshot swap. If the snapshot never arrives (the DCL connection was cleanly unsubscribed rather than failing), the actor may hold stale Active conditions indefinitely. A redeploy of the instance clears `native_alarm_state` in SQLite and forces fresh subscription. Failed subscription retries are logged at `Warning` level with the retry interval.
### `InvalidActorNameException` on rapid redeployment
If another `DeployInstanceCommand` arrives for the same instance while the previous actor is still terminating, the `_terminatingActorsByName` shadow index in `DeploymentManagerActor` detects the collision and buffers the new command, so the replacement actor is only created after the old actor has stopped and its name is free (`SiteRuntime-020`). If a deploy was already buffered, the newer command overwrites it (last-write-wins) and the displaced deploy receives a `Failed-superseded` response. This is expected behaviour — central should observe the `Failed` response and retry when the site is ready.
## Related Documentation
- [Site Runtime design specification](../requirements/Component-SiteRuntime.md)
- [Central–Site Communication](./Communication.md)
- [Commons](./Commons.md)
- [Host](./Host.md)
- [Audit Log](./AuditLog.md)
- [Cluster Infrastructure](./ClusterInfrastructure.md)
+312
View File
@@ -0,0 +1,312 @@
# Store-and-Forward Engine
The Store-and-Forward Engine buffers site-originated outbound messages when a target system or the central cluster is unreachable, retries them on a fixed interval, parks those that exhaust their retry budget, and persists the buffer in a local SQLite database that is asynchronously replicated to the standby node for failover continuity.
## Overview
The Store-and-Forward Engine (#6) is a site-only component. The central cluster has no equivalent buffer; it uses the Notification Outbox (#21) instead for its own queued delivery work. Every site node runs one `StoreAndForwardService` instance, backed by a `StoreAndForwardStorage` SQLite store and an optional `ReplicationService` that fans each buffer mutation to the standby.
The component code lives in `src/ZB.MOM.WW.ScadaBridge.StoreAndForward/`:
- `StoreAndForwardService` — the core buffer: enqueue, retry sweep, park/retry/discard, and the `ICachedCallLifecycleObserver` audit hook.
- `StoreAndForwardStorage` — the SQLite layer; all reads and writes against `sf_messages`.
- `ReplicationService` — fire-and-forget buffer replication to the standby.
- `ParkedMessageHandlerActor` — Akka actor bridge that exposes parked-message query/retry/discard to the `SiteCommunicationActor`.
- `NotificationForwarder` — the delivery handler for the `Notification` category; forwards buffered notifications to central via the ClusterClient transport and interprets the ack.
- `StoreAndForwardOptions` — options class bound from the `StoreAndForward` configuration section.
- `IStoreAndForwardSiteContext` — narrow interface through which the Host supplies the site identifier without creating a project-reference cycle with Health Monitoring.
DI registration is via `ServiceCollectionExtensions.AddStoreAndForward`. Actor bindings (`AddStoreAndForwardActors`) are a separate call resolved during Akka startup in the Host.
The operation tracking table that backs `Tracking.Status(id)` is **not** owned by this component; its implementation (`OperationTrackingStore`, `OperationTrackingOptions`) lives in `src/ZB.MOM.WW.ScadaBridge.SiteRuntime/Tracking/`. The engine carries the `TrackedOperationId` linking a buffered message to its tracking row and drives tracking updates through the `ICachedCallLifecycleObserver` hook. The tracking table is documented here because its lifecycle is coupled to the S&F retry loop.
## Key Concepts
### Three message categories
`StoreAndForwardCategory` has three values, each serviced by its own registered delivery handler:
| Category | Delivery target | Tracked? |
|---|---|---|
| `ExternalSystem` | External system API (HTTP) | Yes — `TrackedOperationId` |
| `Notification` | Central cluster (`NotificationForwarder`) | No — central `Notifications` table |
| `CachedDbWrite` | Database connection | Yes — `TrackedOperationId` |
Only `ExternalSystem` and `CachedDbWrite` generate cached-call audit telemetry through the `ICachedCallLifecycleObserver` hook. `Notification` has its own central-side audit pipeline (Notification Outbox / Audit Log) and is explicitly excluded from that hook.
### Transient vs. permanent failures
Only transient failures are buffered. The delivery handler contract is:
- Returns `true` — delivered. The message is removed from the buffer (or, on the immediate path, never buffered).
- Returns `false` — permanent failure. The message is not buffered on the immediate path; on a retry the row is parked immediately.
- Throws — transient failure. On the immediate path the message is buffered for retry; on a retry the retry count is incremented and the row is parked once `MaxRetries` is reached.
A permanent failure for a cached-call category additionally writes a terminal `Failed` row to the operation tracking table via the observer hook. The error is returned synchronously to the calling script; no buffer row is created for a permanent failure.
### Fixed retry interval and no max buffer size
The retry interval is fixed — not exponential. There is no maximum buffer size; messages accumulate until delivery succeeds or the retry budget is exhausted. The default interval is 30 seconds and the background sweep fires every 10 seconds (checking which rows are due via the `last_attempt_at` predicate). Both are configurable.
### Retry budget and parking
`StoreAndForwardMessage.MaxRetries` controls how many background-sweep attempts the engine makes before parking. `MaxRetries = 0` means **no limit** — the message retries on every sweep until delivered and is never parked for retry exhaustion. It is not a "never retry" value; callers that want unbounded retry pass `maxRetries: 0` explicitly. The `EnqueueAsync` `maxRetries` parameter defaults to `StoreAndForwardOptions.DefaultMaxRetries` (50).
### Messages not cleared on instance deletion
When an instance is deleted, its buffered S&F messages are not removed. `StoreAndForwardMessage.OriginInstanceName` records the originating instance at enqueue time so the buffer can continue to deliver and so the central UI can attribute parked messages even after the instance is gone.
### CachedCall idempotency is the caller's responsibility
`StoreAndForwardService` does not deduplicate. If the same message is enqueued twice it is delivered twice. Callers using `ExternalSystem.CachedCall()` or `Database.CachedWrite()` must design payloads to be idempotent, for example by including unique request IDs and relying on the remote end to handle duplicates.
## Architecture
### Buffer storage — `sf_messages`
`StoreAndForwardStorage.InitializeAsync` creates the `sf_messages` table and its indexes:
```sql
CREATE TABLE IF NOT EXISTS sf_messages (
id TEXT PRIMARY KEY,
category INTEGER NOT NULL,
target TEXT NOT NULL,
payload_json TEXT NOT NULL,
retry_count INTEGER NOT NULL DEFAULT 0,
max_retries INTEGER NOT NULL DEFAULT 50,
retry_interval_ms INTEGER NOT NULL DEFAULT 30000,
created_at TEXT NOT NULL,
last_attempt_at TEXT,
status INTEGER NOT NULL DEFAULT 0,
last_error TEXT,
origin_instance TEXT
);
CREATE INDEX IF NOT EXISTS idx_sf_messages_status ON sf_messages(status);
CREATE INDEX IF NOT EXISTS idx_sf_messages_category ON sf_messages(category);
```
Three nullable columns (`execution_id`, `source_script`, `parent_execution_id`) were added by additive migrations after initial rollout. SQLite lacks `ADD COLUMN IF NOT EXISTS`, so each column is probed via `PRAGMA table_info` before the `ALTER TABLE` is issued — making `InitializeAsync` idempotent.
`StoreAndForwardStorage` opens a fresh `SqliteConnection` per call and relies on the Microsoft.Data.Sqlite connection pool (keyed on the connection string) for acceptable performance on the retry sweep. If a pooled-open ever becomes a bottleneck the remedy is a batched sweep API that opens one connection per sweep.
The engine uses two status values from `StoreAndForwardMessageStatus`: `Pending` (0) and `Parked` (2). On successful delivery the row is deleted (`RemoveMessageAsync`) — there is no `Delivered` status written. The enum also declares `InFlight` (1) and `Delivered` (3) but neither is assigned anywhere in the engine; they are dead values. The retry sweep loads only `Pending` rows whose `last_attempt_at` is older than `retry_interval_ms`.
### Retry sweep
`StoreAndForwardService.RetryPendingMessagesAsync` is the background sweep, fired by `_retryTimer` on `RetryTimerInterval` (default 10 s). An `Interlocked` flag prevents overlapping sweeps. `StopAsync` stops the timer, then awaits any in-flight sweep up to `SweepShutdownWaitTimeout` (10 s) before returning so the host can safely dispose `_storage` and `_replication`.
Each `RetryMessageAsync` call invokes the registered delivery handler for the message's category. A conditional `UpdateMessageIfStatusAsync` is used for every state-changing write so a concurrent operator action (retry, discard) is not silently overwritten by the sweep:
```csharp
// Transient failure — increment retry, check budget.
message.RetryCount++;
message.LastAttemptAt = DateTimeOffset.UtcNow;
message.LastError = ex.Message;
if (message.MaxRetries > 0 && message.RetryCount >= message.MaxRetries)
{
message.Status = StoreAndForwardMessageStatus.Parked;
var parked = await _storage.UpdateMessageIfStatusAsync(
message, StoreAndForwardMessageStatus.Pending);
if (!parked) return; // operator action won the race
Interlocked.Decrement(ref _bufferedCount);
_replication?.ReplicatePark(message);
// … observer notification …
}
else
{
if (!await _storage.UpdateMessageIfStatusAsync(
message, StoreAndForwardMessageStatus.Pending))
return; // operator action won the race
// … observer notification (TransientFailure) …
}
```
### Queue-depth gauge
`StoreAndForwardService` maintains a `long _bufferedCount` in-process gauge seeded from a `COUNT(*)` at startup. `BufferAsync` increments it; successful delivery and `Pending→Parked` transitions decrement it; operator requeue (`Parked→Pending`) increments it. `ScadaBridgeTelemetry.SetQueueDepthProvider` registers a sync, non-blocking read callback so the OpenTelemetry/Prometheus collector never needs to run an async query. The gauge is approximate: it is eventually consistent with the store, and standby replication applies to the standby's own counter separately.
### Async replication to standby
`ReplicationService` wraps each buffer mutation — add, remove, park, requeue — in a `Task.Run` fire-and-forget. The active node does not wait for standby acknowledgment. The standby applies each `ReplicationOperation` via `ApplyReplicatedOperationAsync`, which calls the same `StoreAndForwardStorage` methods. Replication failures are logged at Debug and discarded; the standby may be slightly behind the active at any moment, producing at most a few duplicate deliveries or missed retries after a failover — an accepted trade-off for zero added latency on the enqueue path.
The four `ReplicationOperationType` values are `Add`, `Remove`, `Park`, and `Requeue` (requeue was added to cover the operator-initiated `Parked→Pending` transition so the standby preserves retry intent after failover).
### Notification delivery
`NotificationForwarder` is the delivery handler for `StoreAndForwardCategory.Notification`. It deserializes the buffered `PayloadJson` as a `NotificationSubmit`, re-stamps `SourceSiteId` and `SourceInstanceId` from the forwarder's own context (the site is authoritative for these), and sends the submit to the `SiteCommunicationActor` via Akka's `Ask` with a configurable timeout. A `NotificationSubmitAck` with `Accepted = true` returns `true`; any other ack or a timeout throws `NotificationForwardException`, which the engine treats as transient. A payload that cannot be deserialized is logged at Warning and discarded (returns `true`) rather than parked — a corrupt payload cannot be fixed by retrying.
Notification messages are subject to the same retry budget as every other category. The notification enqueue call passes no explicit `maxRetries`, so it inherits `StoreAndForwardOptions.DefaultMaxRetries` (50). Under a sustained central outage that exhausts all 50 retry attempts, the buffered notification is parked and surfaces in the parked-message UI exactly like any other parked message. Callers that require unbounded retry must pass `maxRetries: 0` to `EnqueueAsync`.
### Parked message management
`ParkedMessageHandlerActor` is the Akka bridge between `SiteCommunicationActor` and `StoreAndForwardService`. It handles five message types from central:
| Message | Action |
|---|---|
| `ParkedMessageQueryRequest` | Paginated list of parked rows, all categories |
| `ParkedMessageRetryRequest` | Move a parked row back to `Pending` |
| `ParkedMessageDiscardRequest` | Delete a parked row |
| `RetryParkedOperation` | Retry a parked cached call (keyed by `TrackedOperationId`) |
| `DiscardParkedOperation` | Discard a parked cached call (keyed by `TrackedOperationId`) |
All five use `PipeTo` for idiomatic Akka async reply. `RetryParkedMessageAsync` resets `retry_count = 0` and `last_attempt_at = NULL` so the requeued message is due on the next sweep, and replicates a `Requeue` operation to the standby. `DiscardParkedMessageAsync` deletes the row and replicates a `Remove`.
### Operation tracking table
The operation tracking table (`OperationTracking`) is a SQLite table in `site-tracking.db`, owned by `OperationTrackingStore` in `src/ZB.MOM.WW.ScadaBridge.SiteRuntime/Tracking/`. Its schema:
```sql
CREATE TABLE IF NOT EXISTS OperationTracking (
TrackedOperationId TEXT NOT NULL PRIMARY KEY,
Kind TEXT NOT NULL,
TargetSummary TEXT NULL,
Status TEXT NOT NULL,
RetryCount INTEGER NOT NULL DEFAULT 0,
LastError TEXT NULL,
HttpStatus INTEGER NULL,
CreatedAtUtc TEXT NOT NULL,
UpdatedAtUtc TEXT NOT NULL,
TerminalAtUtc TEXT NULL,
SourceInstanceId TEXT NULL,
SourceScript TEXT NULL,
SourceNode TEXT NULL
);
CREATE INDEX IF NOT EXISTS IX_OperationTracking_Status_Updated
ON OperationTracking (Status, UpdatedAtUtc);
```
One row per `TrackedOperationId`; lifecycle `Submitted → Retrying → Delivered / Parked / Failed / Discarded`. Writes are serialised through a `SemaphoreSlim` on a single owned `SqliteConnection`. Reads open a fresh connection to avoid blocking status queries behind in-flight writes.
`Tracking.Status(id)` reads this table **site-locally and authoritatively** — the answer never round-trips to central, even when central is unreachable. Terminal rows are purged after `OperationTrackingOptions.RetentionDays` (default 7 days). The `PurgeTerminalAsync` call only removes rows where `TerminalAtUtc IS NOT NULL` and `TerminalAtUtc < threshold`; non-terminal (in-flight) rows are never purged.
The S&F engine connects to this table only through the `ICachedCallLifecycleObserver` hook, not directly. `OperationTrackingStore` is wired in `SiteRuntime` and injected into the observer implementation; `StoreAndForward` carries the `TrackedOperationId` on the buffered message and passes it to the observer on each attempt.
### Cached-call observer hook
`ICachedCallLifecycleObserver.OnAttemptCompletedAsync` is called by the retry sweep after every `ExternalSystem` or `CachedDbWrite` delivery attempt with a `CachedCallAttemptContext` record:
```csharp
context = new CachedCallAttemptContext(
TrackedOperationId: trackedId,
Channel: channel, // "ApiOutbound" or "DbOutbound"
Target: message.Target,
SourceSite: _siteId,
Outcome: outcome, // Delivered / TransientFailure / PermanentFailure / ParkedMaxRetries
RetryCount: message.RetryCount,
LastError: lastError,
HttpStatus: httpStatus,
CreatedAtUtc: message.CreatedAt.UtcDateTime,
OccurredAtUtc: attemptStartUtc,
DurationMs: (int)attemptStopwatch.ElapsedMilliseconds,
SourceInstanceId: message.OriginInstanceName,
ExecutionId: message.ExecutionId,
SourceScript: message.SourceScript,
ParentExecutionId: message.ParentExecutionId);
```
The observer implementation (in `ZB.MOM.WW.ScadaBridge.AuditLog`) maps the outcome to `OperationTrackingStore` writes and builds the `CachedCallTelemetry` packet for the central Site Call Audit component. Observer failures are swallowed — a failing audit pipeline must never corrupt S&F retry bookkeeping or be misclassified as a transient delivery failure.
The `_siteId` stamped onto every context is sourced from the optional `IStoreAndForwardSiteContext` binding resolved at construction time. A null or whitespace site id is normalised to `UnknownSiteSentinel` (`$unknown-site`) so a misconfigured host produces a distinctive marker in the central audit log rather than silently merging multiple sites into an empty-string bucket.
## Usage
### Registering the service
```csharp
// In the Host composition root (site node only):
services.AddStoreAndForward();
services.AddStoreAndForwardActors();
services.Configure<StoreAndForwardOptions>(
configuration.GetSection("StoreAndForward"));
```
### Enqueueing a message
```csharp
public async Task<StoreAndForwardResult> EnqueueAsync(
StoreAndForwardCategory category,
string target,
string payloadJson,
string? originInstanceName = null,
int? maxRetries = null,
TimeSpan? retryInterval = null,
bool attemptImmediateDelivery = true,
string? messageId = null,
Guid? executionId = null,
string? sourceScript = null,
Guid? parentExecutionId = null)
```
Pass `attemptImmediateDelivery: false` when the caller has already attempted delivery itself — the message is placed directly into the buffer for the background sweep without invoking the handler again. The Notification Outbox uses the `messageId` overload to pin the script-generated `NotificationId` as the buffer row's id (the single idempotency key from script through central ingest).
`StoreAndForwardResult` carries `Accepted` (true if delivered or buffered), `MessageId`, and `WasBuffered`.
### Registering a delivery handler
```csharp
_storeAndForwardService.RegisterDeliveryHandler(
StoreAndForwardCategory.ExternalSystem,
async message => await _externalSystemGateway.DeliverAsync(message));
```
Handlers are registered by the component that owns the delivery channel (External System Gateway, database adapter, `NotificationForwarder`) during startup before `StartAsync` is called.
## Configuration
Options class: `StoreAndForwardOptions`, bound from the `StoreAndForward` configuration section.
| Key | Default | Description |
|---|---|---|
| `SqliteDbPath` | `./data/store-and-forward.db` | Path to the SQLite buffer database. The directory is created on startup if absent. |
| `ReplicationEnabled` | `true` | Whether to replicate buffer operations to the standby node. |
| `DefaultRetryInterval` | `00:00:30` | Fixed retry interval applied when `EnqueueAsync` is called without an explicit `retryInterval`. |
| `DefaultMaxRetries` | `50` | Max background-sweep attempts before parking. Applied when `EnqueueAsync` is called without an explicit `maxRetries`. `0` = no limit. |
| `RetryTimerInterval` | `00:00:10` | Cadence of the background retry sweep timer. |
Operation tracking options live separately under `OperationTrackingOptions` (bound in Site Runtime):
| Key | Default | Description |
|---|---|---|
| `ConnectionString` | `Data Source=site-tracking.db` | ADO.NET connection string for the tracking SQLite database. |
| `RetentionDays` | `7` | Terminal rows older than this many days are deleted by the nightly purge. |
## Dependencies & Interactions
- [Commons (#16)](./Commons.md) — owns `StoreAndForwardCategory`, `StoreAndForwardMessageStatus`, `TrackedOperationId`, `TrackingStatusSnapshot`, `ICachedCallLifecycleObserver` / `CachedCallAttemptContext` / `CachedCallAttemptOutcome`, `IOperationTrackingStore`, and the `RemoteQuery` message contracts (`ParkedMessageQueryRequest/Response`, `ParkedMessageRetryRequest/Response`, `ParkedMessageDiscardRequest/Response`, `RetryParkedOperation`, `DiscardParkedOperation`, `ParkedOperationActionAck`).
- [Central–Site Communication (#5)](./Communication.md) — carries `ParkedMessageQueryRequest/Response` and operator Retry/Discard commands between the central UI and `ParkedMessageHandlerActor`. Also carries buffered notifications (`NotificationSubmit` / `NotificationSubmitAck`) from `NotificationForwarder` to the Notification Outbox, and `CachedCallTelemetry` from the observer implementation to Site Call Audit.
- [Notification Outbox (#21)](./NotificationOutbox.md) — the central destination for the `Notification` category. Central ingests each forwarded `NotificationSubmit` into the `Notifications` table and replies with `NotificationSubmitAck`; on `Accepted = true` the engine clears the buffered row. The S&F engine is the site half of the outbox handoff.
- [Site Call Audit (#22)](./SiteCallAudit.md) — the central mirror for cached-call status. Receives `CachedCallTelemetry` (audit rows + operational tracking snapshot) emitted by the observer on each S&F attempt outcome. Relays `RetryParkedOperation` / `DiscardParkedOperation` commands to the site when an operator acts on a parked cached call via the central UI.
- [Audit Log (#23)](./AuditLog.md) — the observer implementation (`ICachedCallLifecycleObserver`) lives in the Audit Log component. It maps `CachedCallAttemptContext` onto `AuditLog` rows and drives the `CachedCallTelemetry` packet to central.
- [Site Runtime (#3)](../requirements/Component-SiteRuntime.md) — owns the `OperationTrackingStore` and `OperationTrackingOptions` that back `Tracking.Status(id)`. Script Actors submit messages to `StoreAndForwardService.EnqueueAsync` on the buffered-call paths.
- [Health Monitoring (#11)](../requirements/Component-HealthMonitoring.md) — `ScadaBridgeTelemetry.SetQueueDepthProvider` registers the `_bufferedCount` gauge read by the OpenTelemetry/Prometheus collector. The `scadabridge.store_and_forward.queue.depth` gauge surfaces on the site health report.
- [Site Event Logging (#12)](../requirements/Component-SiteEventLogging.md) — the `OnActivity` event on `StoreAndForwardService` posts activity strings (Queued, Delivered, Retried, Parked, Retry, Discard) to the site event log.
- Design spec: [Component-StoreAndForward.md](../requirements/Component-StoreAndForward.md).
## Troubleshooting
### A message stays in the Pending queue and is never delivered
The retry sweep only picks up rows where `status = Pending AND (last_attempt_at IS NULL OR elapsed >= retry_interval_ms)`. If a row never appears in the sweep output, check that the delivery handler for the category is registered before `StartAsync` is called. A missing handler causes the sweep to log a Warning at category level and skip the row; the row stays `Pending` indefinitely rather than being parked.
### A parked cached call does not respond to Retry from the central UI
`RetryParkedOperation` and `DiscardParkedOperation` are keyed by `TrackedOperationId`, which is the S&F buffer message's `Id`. The buffer row's `Id` is the GUID string of the `TrackedOperationId` in `"N"` (no-hyphens) format for engine-minted ids, or `"D"` (hyphenated) format when the caller supplies one. `TrackedOperationId.TryParse` accepts both; confirm that the id in the command matches the stored row id.
### Standby has duplicate or stale rows after failover
Replication is best-effort and fire-and-forget. A message delivered just before failover may still appear in the standby's buffer (the `Remove` replication did not arrive in time) and will be re-delivered. A message buffered just before failover may not appear (the `Add` replication did not arrive in time) and will be silently skipped. Both are accepted trade-offs; the expected rate is a handful of events per failover, not a systematic backlog.
### `$unknown-site` appears in central audit rows for a site's cached calls
`StoreAndForwardService` was constructed without an `IStoreAndForwardSiteContext` binding, so the site id could not be resolved. Ensure the Host calls `services.AddSingleton<IStoreAndForwardSiteContext>(…)` with an adapter that forwards to the same `NodeOptions.SiteId` read by `ISiteIdentityProvider`.
## Related Documentation
- [Store-and-Forward design specification](../requirements/Component-StoreAndForward.md)
- [Notification Outbox](./NotificationOutbox.md)
- [Site Call Audit](./SiteCallAudit.md)
- [Audit Log](./AuditLog.md)
- [Central–Site Communication](./Communication.md)
- [Commons](./Commons.md)
+284
View File
@@ -0,0 +1,284 @@
# Template Engine
The Template Engine models the machine blueprints — templates — from which all deployed instances are created. It enforces inheritance, composition, locking, naming rules, and acyclicity at authoring time, then flattens the resulting graph plus instance overrides into a concrete, revision-hashed `FlattenedConfiguration` that the Deployment Manager sends to sites.
## Overview
Template Engine (#1) runs on the central cluster only. Sites receive flattened output and have no awareness of template structure. The component code lives in `src/ZB.MOM.WW.ScadaBridge.TemplateEngine/`, organized as follows:
- Root — `TemplateService`, `SharedScriptService`, `TemplateResolver`, `CycleDetector`, `CollisionDetector`, `LockEnforcer`, `TemplateNaming` — core authoring operations and graph invariant enforcement.
- `Flattening/` — `FlatteningService`, `RevisionHashService`, `DiffService` — produce and compare the deployment-ready representation.
- `Validation/` — `ValidationService`, `SemanticValidator`, `ScriptCompiler`, `CSharpDelimiterScanner` — pre-deployment and on-demand correctness checks.
- `Services/` — `InstanceService`, `SiteService`, `AreaService`, `TemplateFolderService`, `TemplateDeletionService` — domain operations that depend on the scoped `ITemplateEngineRepository`.
The single DI entry point is `ServiceCollectionExtensions.AddTemplateEngine`. `TemplateService` and `SharedScriptService` are scoped; the flattening and validation utilities are transient; static helpers (`CycleDetector`, `CollisionDetector`, `LockEnforcer`, `TemplateResolver`) are not registered.
## Key Concepts
### Template graph
The full set of templates forms a directed graph with two independent edge types:
- **Inheritance** — `Template.ParentTemplateId` (nullable `int?`). A null value means no parent; a non-null value sets the defining ancestor. The parent is set at creation time and is immutable thereafter; `UpdateTemplateAsync` rejects any attempt to change it.
- **Composition** — `TemplateComposition` rows, each pointing from an owner `TemplateId` to a `ComposedTemplateId` with a slot name (`InstanceName`). Only base (non-derived) templates may be composed.
Both edge types are enforced acyclic on every mutating call. `CycleDetector` provides three checks:
- `DetectInheritanceCycle` — walks the proposed parent chain upward looking for the template being modified.
- `DetectCompositionCycle` — BFS from the proposed composed template through its own compositions.
- `DetectCrossGraphCycle` — BFS across both inheritance and composition edges simultaneously, catching cycles that neither pure check alone would find.
`AddCompositionAsync` runs `DetectCompositionCycle` and `DetectCrossGraphCycle` before any composition is written — `DetectInheritanceCycle` does not run on the composition path. Because the graph can contain not-yet-saved templates (Id = 0), `CycleDetector.BuildLookup` uses `TryAdd` rather than `ToDictionary` so duplicate Ids do not throw.
### Derived templates and the composition model
When `AddCompositionAsync` composes template B into template A under slot name `Pump`, the engine calls `CreateCascadedCompositionAsync`, which:
1. Creates a derived `Template` (`IsDerived = true`, `ParentTemplateId = B.Id`, `Name = "Pump"`) as the slot-owned backing record.
2. Copies B's attributes, alarms, and scripts onto the derived template with `IsInherited = true`.
3. Creates a `TemplateComposition` row linking A to the derived template.
4. Sets `derived.OwnerCompositionId` so the slot can be deleted as a unit.
5. Recurses into B's own compositions to replicate them under the new derived template.
Derived templates are hidden from the main template tree and cannot be directly composed or deleted by name — removal goes through `DeleteCompositionAsync`, which calls `CascadeDeleteDerivedAsync` to tear the whole subtree down.
### Canonical naming and path qualification
Members of composed modules are addressed by **path-qualified canonical names**: `[ModuleInstanceName].[MemberName]`. For deeper nesting the dot chain extends: `Pump.Motor.Speed`. Direct members of the owning template carry no prefix.
`TemplateResolver.ResolveAllMembers` builds a `Dictionary<string, ResolvedTemplateMember>` in inheritance-chain order (root first) then adds composed-module members with their prefix. The last writer on a given canonical name wins — child overrides shadow parent definitions. `FindMemberByCanonicalName` is the entry point for lock checks during `UpdateAttribute/Alarm/ScriptAsync`.
`TemplateNaming.QualifiedName` computes the full dotted path of a derived template at read time by walking the `OwnerCompositionId` chain — the derived template stores only its contained name (`InstanceName`), so the full path is never stored and cannot drift.
### Locking and override granularity
`LockEnforcer` enforces three classes of rules for attributes, alarms, and scripts:
| Rule | Mechanism |
|------|-----------|
| `IsLocked` = true blocks all downstream overrides | `ValidateLockChange`: once set, cannot be cleared (one-way ratchet). |
| `LockedInDerived` = true blocks derived-template overrides of that specific member | `ValidateLockedInDerivedChange`: also a one-way ratchet — cannot be cleared on a base template. |
| Fixed fields cannot change at any level | `ValidateAttributeOverride` / `ValidateAlarmOverride` / `ValidateScriptOverride`. |
**Attribute fixed fields**: `DataType`, `DataSourceReference`. Overridable: `Value`, `Description`.
**Alarm fixed fields**: `Name`, `TriggerType`. Overridable: `PriorityLevel`, `TriggerConfiguration`, `Description`, `OnTriggerScriptId`.
**Script fixed fields**: `Name`. Overridable: `Code`, `TriggerType`, `TriggerConfiguration`, `MinTimeBetweenRuns`, `ParameterDefinitions`, `ReturnDefinition`.
Intermediate locking is permitted: a child template can lock an unlocked member inherited from its parent. Unlocking is never permitted at any level.
### Naming collisions
Adding a member (attribute, alarm, script, or composition) triggers `CollisionDetector.DetectCollisions` on a speculative clone of the template. The detector collects all canonical names — direct members plus path-qualified composed-module members plus inherited members — groups them by canonical name, and reports any group where two entries come from different origin descriptions. A collision is a design-time error and blocks the operation.
Because each composition slot has a unique `InstanceName` prefix, members from different slots can never collide by canonical name. Collisions arise only when a directly defined member shares an unqualified name with an inherited or composed member under the same owner.
### Flattening
`FlatteningService.Flatten` takes the instance, its template inheritance chain (most-derived first), a composition map, per-composed-template chains, and available data connections, and produces a `FlattenedConfiguration`. The resolution order is:
1. Instance overrides (highest priority, respects locks).
2. Most-derived template in the inheritance chain.
3. Parent templates, walking to the root.
4. Composed module members, path-qualified.
The nine steps in order:
1. Validate `LockedInDerived` is not violated across each chain.
2. Resolve attributes from the inheritance chain (base-to-derived; `IsInherited` placeholders never shadow live base values).
3. Resolve composed-module attributes with path-qualified canonical names, recursively.
4. Apply `InstanceAttributeOverride` records (locked attributes are silently skipped).
5. Apply `InstanceConnectionBinding` records (data-sourced attributes only).
6. Resolve alarms; for `HiLo` trigger type, merge setpoints key-by-key so a derived template can override just `hi` while inheriting `loLo`.
7. Resolve scripts; wire `ScriptScope` (self- and parent-path) into each composed script so `Attributes["X"]` resolves to the right path-prefix at runtime. Then resolve alarm `OnTriggerScriptId` FKs to canonical script names (`ResolveAlarmScriptReferences`).
8. Resolve native alarm source bindings (`TemplateNativeAlarmSource`), apply `InstanceNativeAlarmSourceOverride`.
9. Collect connection configurations — iterate resolved attributes, and for each attribute that has a bound data connection, populate `FlattenedConfiguration.Connections` with the corresponding `ConnectionConfig` (protocol, primary and backup JSON, failover retry count).
### Revision hash
`RevisionHashService.ComputeHash` produces a deterministic `sha256:<hex>` string over the canonical JSON serialization of the `FlattenedConfiguration`. Volatile fields (`GeneratedAtUtc`) are excluded. Collections are sorted by `CanonicalName` before hashing. Internal `Hashable*` records declare their properties in alphabetical order because `System.Text.Json` emits them in declaration order — out-of-order additions would silently break determinism. The hash covers resolved attributes, alarms, scripts, and connection configurations. **`NativeAlarmSources` is not included in the hash**: changes to native alarm source bindings do not alter the revision hash and therefore do not trigger a re-deploy on their own. The hash is included in the deployment identity and lets the Deployment Manager detect whether a re-flatten has changed anything before pushing to sites.
### Diff
`DiffService.ComputeDiff` compares two `FlattenedConfiguration` snapshots by canonical name, producing `Added`, `Removed`, and `Changed` entries for attributes, alarms, and scripts. `ComputeConnectionsDiff` produces the same shape for data-connection configurations. The diff is used by the Deployment Manager to decide whether a full redeploy is needed.
### Concurrent editing
Template edits use **last-write-wins** — there is no optimistic concurrency token on `Template` or its member rows. Two simultaneous edits to the same template produce one winner. This is by design. The same policy applies to instances and is documented in the `InstanceService` class comment: "Concurrent edits are last-write-wins — there is no version token or conflict detection on instance state." Optimistic concurrency (`RowVersion`) applies to deployment status records in the Deployment Manager, not to template or instance authoring.
## Architecture
### Service map
```text
AddTemplateEngine()
├── TemplateService (scoped) — template + member CRUD, collision/acyclicity pre-checks
├── SharedScriptService (scoped) — system-wide shared script CRUD + syntax validation
├── InstanceService (scoped) — instance CRUD, overrides, connection bindings
├── SiteService (scoped) — site CRUD
├── AreaService (scoped) — area CRUD
├── TemplateFolderService (scoped) — folder hierarchy, sibling-name uniqueness, acyclicity on move
├── TemplateDeletionService(scoped) — deletion constraints; called from TemplateService.DeleteTemplateAsync
├── FlatteningService (transient)
├── RevisionHashService (transient)
├── DiffService (transient)
├── ValidationService (transient) — full pipeline: 8 stages merged into one ValidationResult
├── SemanticValidator (transient) — call-target, argument-count, operand-type, cross-call rules
└── ScriptCompiler (transient) — advisory forbidden-API scan + delimiter balance check
```
Static helpers — `CycleDetector`, `CollisionDetector`, `LockEnforcer`, `TemplateResolver`, `TemplateNaming` — are not registered with DI.
### Validation pipeline
`ValidationService.Validate` runs eight stages in sequence and merges results via `ValidationResult.Merge`:
| Stage | Category | Outcome |
|-------|----------|---------|
| `ValidateFlatteningSuccess` | `FlatteningFailure` | Error on missing name; warning on empty configuration |
| `ValidateNamingCollisions` | `NamingCollision` | Error per duplicate canonical name within entity type |
| `ValidateScriptCompilation` | `ScriptCompilation` | Error per script that fails `ScriptCompiler.TryCompile` |
| `ValidateAlarmTriggerReferences` | `AlarmTriggerReference` | Error when `attributeName` / `attribute` key not in flattened attributes |
| `ValidateScriptTriggerReferences` | `ScriptTriggerReference` | Same check for script triggers |
| `ValidateExpressionTriggers` | `ScriptTriggerReference` / `AlarmTriggerReference` | Blank warning, syntax error, or missing `Attributes["X"]` reference |
| `ValidateConnectionBindingCompleteness` | `ConnectionBinding` | Warning per data-sourced attribute with no binding |
| `SemanticValidator.Validate` | Multiple | Call targets, argument counts, `RangeViolation`/`HiLo` operand types, on-trigger script existence, cross-call violations, native alarm source completeness |
`ValidationResult` is defined in Commons: `IsValid` is true when `Errors` is empty; `Warnings` do not block deployment. Each `ValidationEntry` carries a `ValidationCategory`, a human-readable `Message`, and an optional `EntityName` (canonical name of the offending entity).
### Key entity types (defined in Commons)
| Type | Namespace | Role |
|------|-----------|------|
| `Template` | `Commons.Entities.Templates` | Base and derived template rows; `IsDerived` distinguishes slot-owned derived templates |
| `TemplateAttribute` | same | Attribute definition with `IsInherited`, `LockedInDerived`, `DataType`, `DataSourceReference` |
| `TemplateAlarm` | same | Alarm definition; `TriggerType` and `Name` are fixed fields |
| `TemplateScript` | same | Script definition; `Name` is a fixed field |
| `TemplateComposition` | same | Slot row linking owner to composed (or derived) template by `InstanceName` |
| `TemplateNativeAlarmSource` | same | Read-only native alarm binding; `SourceReference` is a raw connection address |
| `FlattenedConfiguration` | `Commons.Types.Flattening` | Deployment-ready snapshot; fields `Attributes`, `Alarms`, `Scripts`, `NativeAlarmSources`, `Connections` |
| `ResolvedAttribute` / `ResolvedAlarm` / `ResolvedScript` / `ResolvedNativeAlarmSource` | same | Flattened member records carrying `CanonicalName` and `Source` provenance |
| `ConfigurationDiff` / `DiffEntry<T>` | same | Diff output keyed by canonical name |
| `ValidationResult` / `ValidationEntry` / `ValidationCategory` | same | Validation output |
## Usage
### Authoring a template with inheritance and composition
The normal flow goes through `TemplateService` methods; each call validates graph invariants before persisting and audits after:
```csharp
// Create a base template
var motorBase = await templateService.CreateTemplateAsync(
"MotorBase", description: null, parentTemplateId: null, user: "alice");
// Add an attribute to the base
await templateService.AddAttributeAsync(motorBase.Value.Id, new TemplateAttribute("Speed")
{
DataType = DataType.Float,
DataSourceReference = "/Motor/Speed"
}, user: "alice");
// Create a child that inherits from the base
var child = await templateService.CreateTemplateAsync(
"PumpMotor", description: null, parentTemplateId: motorBase.Value.Id, user: "alice");
// Compose a feature module (AlarmsModule) into the base — acyclicity and collision
// checks run; a derived template is auto-created to back the slot
await templateService.AddCompositionAsync(
templateId: motorBase.Value.Id,
composedTemplateId: alarmsModule.Id,
instanceName: "Alarms",
user: "alice");
```
After composition, `Alarms.HighTemp` is the canonical name of `HighTemp` from `AlarmsModule` as it appears in the flattened output.
### Resolving members and checking overrides
```csharp
// Returns all effective members with canonical names for templateId
var members = await templateService.ResolveTemplateMembersAsync(templateId);
// Returns TemplateResolver.ResolvedTemplateMember with CanonicalName, MemberType,
// IsLocked, and ModulePath (null for direct members, slot prefix for composed members).
foreach (var m in members)
{
Console.WriteLine($"{m.CanonicalName} ({m.MemberType}) locked={m.IsLocked}");
}
```
### Flattening and hashing
`FlatteningService` and `RevisionHashService` are transient services called by the Deployment Manager, not directly from the Central UI. The caller builds the template chain (most-derived first) from the repository and passes it:
```csharp
var flatResult = flatteningService.Flatten(
instance,
templateChain, // IReadOnlyList<Template>, index 0 = instance's template
compositionMap, // Dictionary<int, IReadOnlyList<TemplateComposition>>
composedTemplateChains, // Dictionary<int, IReadOnlyList<Template>>
dataConnections); // Dictionary<int, DataConnection>
if (flatResult.IsSuccess)
{
var hash = revisionHashService.ComputeHash(flatResult.Value);
// hash: "sha256:3a7f..."
}
```
### Validating before deployment
```csharp
var validationResult = validationService.Validate(flatResult.Value, sharedScripts);
if (!validationResult.IsValid)
{
foreach (var err in validationResult.Errors)
logger.LogError("{Category} {Entity}: {Msg}", err.Category, err.EntityName, err.Message);
}
```
## Dependencies & Interactions
- [Commons (#16)](./Commons.md) — owns all entity types (`Template`, `TemplateAttribute`, `TemplateAlarm`, `TemplateScript`, `TemplateComposition`, `TemplateNativeAlarmSource`), the flattening types (`FlattenedConfiguration`, `ResolvedAttribute`, `ResolvedAlarm`, `ResolvedScript`, `ResolvedNativeAlarmSource`, `ConfigurationDiff`), the `ValidationResult`/`ValidationEntry`/`ValidationCategory` hierarchy, the `ITemplateEngineRepository` interface, and the `IAuditService` interface. Template Engine imports from Commons; it never holds a direct EF Core dependency.
- [Configuration Database (#17)](./ConfigurationDatabase.md) — provides the `ITemplateEngineRepository` implementation backed by the central MS SQL database. `TemplateService`, `InstanceService`, and all `Services/` classes resolve this via constructor injection. EF Core migrations for template tables live in this project.
- [Deployment Manager (#2)](./DeploymentManager.md) — consumes `FlatteningService`, `RevisionHashService`, `DiffService`, and `ValidationService` to prepare deployment packages. It also calls `TemplateDeletionService.CanDeleteTemplateAsync` to check constraints before removing a template that has active deployments.
- [Site Runtime (#3)](./SiteRuntime.md) — receives the `FlattenedConfiguration` (via the Deployment Manager) and uses `ResolvedScript.Scope` to set the path-prefix context for script attribute accessors. `ResolvedNativeAlarmSource` records drive the site's `NativeAlarmActor` bindings.
- [Central UI (#9)](./CentralUI.md) — the primary authoring surface. All template CRUD, instance management, shared script editing, and folder organization go through the Management Service, which delegates to `TemplateService`, `InstanceService`, and `SharedScriptService`. The Central UI calls `ValidateAsync` on-demand so designers see errors before deployment.
- [Management Service (#18)](./ManagementService.md) — the Akka.NET actor that exposes template operations over the cluster boundary. `TemplateService` and related services are injected into its DI scope per request.
- [Transport (#24)](./Transport.md) — exports and imports templates as encrypted bundles. On import, the Transport component calls `TemplateService.CreateTemplateAsync` (and member-add methods) for each template in the bundle; acyclicity and collision checks run identically to manual authoring.
- Design spec: [Component-TemplateEngine.md](../requirements/Component-TemplateEngine.md).
## Troubleshooting
### Composition fails with a cycle error
`CycleDetector.DetectCrossGraphCycle` rejects edges that would create a cycle across either inheritance or composition edges. The most common trigger is composing a template that already (transitively) includes the owner — for example, A composes B, and then trying to compose A into B. The error message identifies the template by name. Remove or restructure the graph to break the circular dependency.
### `LockedInDerived` cannot be cleared
`LockEnforcer.ValidateLockedInDerivedChange` enforces a one-way ratchet: once a base template sets `LockedInDerived = true` on a member, the flag cannot be cleared. This is intentional — clearing it retroactively would make previously blocked derived overrides legal without any visible signal to derived-template authors. The only remediation is to create a new base template without the flag.
### Revision hash changes unexpectedly between deploys
The SHA-256 hash covers resolved attributes, alarms, scripts, and connection configurations. Changes to any of those members anywhere in the inheritance or composition chain — including in parent templates or feature modules the instance does not directly own — will change the hash. Note that `NativeAlarmSources` is not part of the hash, so native alarm source binding changes alone do not change the revision hash. Use `DiffService.ComputeDiff` to identify exactly which canonical names changed and why.
### Naming collision on composed member add
`CollisionDetector.DetectCollisions` fires when a member's unqualified name collides with a direct or inherited member on the same owner template. Because composed members carry a slot-name prefix, a collision can arise between a directly defined member (`Speed`) and a composed member that resolves to the same unqualified name in an ancestor. The fix is to rename one of the conflicting members before adding the composition.
### Semantic validation: `CallScript` target not found
`SemanticValidator.ExtractCallTargets` uses a substring scan for `CallScript("name", ...)` and `CallShared("name", ...)`. If the target name does not match any resolved script canonical name, the error `CallTargetNotFound` is reported. Check that the call uses the full canonical name, including any composition prefix (e.g., `Alarms.HandleFault`, not just `HandleFault`).
## Related Documentation
- [Template Engine design specification](../requirements/Component-TemplateEngine.md)
- [Commons](./Commons.md)
- [Configuration Database](./ConfigurationDatabase.md)
- [Deployment Manager](./DeploymentManager.md)
- [Site Runtime](./SiteRuntime.md)
- [Central UI](./CentralUI.md)
- [Management Service](./ManagementService.md)
- [Transport](./Transport.md)
+192
View File
@@ -0,0 +1,192 @@
# Traefik Proxy
The Traefik Proxy is the reverse proxy and load balancer that fronts the central cluster's two web servers. It exposes a single stable entrypoint for all central traffic — Central UI, Management API, Inbound API — and routes exclusively to whichever central node is currently the Akka.NET cluster leader, using a health-check on each node's `/health/active` endpoint to make that determination. When the active node changes, Traefik detects the change on its next poll cycle and redirects traffic automatically, with no operator intervention.
## Overview
The proxy runs as the `scadabridge-traefik` Docker container in the main compose stack (`docker/docker-compose.yml`). It is a third-party infrastructure component (Traefik; the image tag is pinned in `docker/docker-compose.yml`) — there is no C# project for it. Its entire configuration is two YAML files mounted read-only into the container:
- `docker/traefik/traefik.yml` — static config: entrypoints, API dashboard, and file provider declaration.
- `docker/traefik/dynamic.yml` — routing rules: the router that catches all traffic, the `central` load-balancer service listing both backend nodes, and the `/health/active` health-check settings.
The proxy sits on the `scadabridge-net` Docker bridge network alongside both central nodes (`scadabridge-central-a`, `scadabridge-central-b`) and all site containers, so it can reach the central backends by container name.
## Key Concepts
### Active-node routing via `/health/active`
Traefik does not know which central node is the Akka.NET cluster leader — it discovers this by polling `/health/active` on both backends. The Host registers `ActiveNodeHealthCheck` under the `Active` health tag; `app.MapZbHealth()` serves it at `/health/active`. The check returns HTTP 200 on the leader and HTTP 503 on the standby (or when the actor system has not yet reached `MemberStatus.Up`):
```csharp
public bool IsActiveNode
{
get
{
var system = _akkaService.ActorSystem;
if (system == null)
return false;
var cluster = Cluster.Get(system);
var self = cluster.SelfMember;
if (self.Status != MemberStatus.Up)
return false;
var leader = cluster.State.Leader;
return leader != null && leader == self.Address;
}
}
```
The identical leadership check backs `ActiveNodeGate` — the `IActiveNodeGate` implementation the Inbound API endpoint filter consults before executing method scripts. Both surfaces agree on which node is active because they share the same Akka cluster state.
### Automatic failover
When the active central node goes down, the Akka cluster's keep-oldest split-brain resolver promotes the surviving node to leader (roughly 25 seconds: 10-second heartbeat threshold plus a 15-second stable-after period). Once the surviving node's `ActiveNodeHealthCheck` starts returning 200, Traefik's next poll cycle — within the 5-second interval — removes the failed backend from the pool and routes all subsequent requests to the new active node. No config change or restart is required on the Traefik side.
## Architecture
### Docker topology
```text
Clients (CLI, browser, external API)
host:9000 (HTTP)
┌───────▼──────────────────┐
│ scadabridge-traefik │ (Traefik container)
│ entrypoint :80 │
└──────┬──────────┬─────────┘
│ /health/active poll (5s)
▼ ▼
scadabridge- scadabridge-
central-a:5000 central-b:5000
(ACTIVE → 200) (STANDBY → 503)
```
Clients always connect to `http://localhost:9000`. The two central nodes are also reachable directly — `central-a` on host port 9001, `central-b` on host port 9002 — but these bypass the load balancer and should be used only for direct debugging. The Traefik dashboard is accessible at `http://localhost:8180`.
### Request flow
Every incoming request on the `web` entrypoint hits the `central` router, which matches all paths (`PathPrefix("/")`) and forwards to the `central` load-balancer service. The load balancer only includes servers that are currently passing the health check, so in normal operation all traffic goes to the single healthy (active) backend.
## Usage
Traefik starts automatically with the cluster compose stack:
```bash
# Start full cluster (includes Traefik)
docker compose -f docker/docker-compose.yml up -d
# Check Traefik dashboard (shows backend health status)
open http://localhost:8180
# Verify routing — reaches the active node
curl http://localhost:9000/health/active
# Direct node access (bypasses Traefik — use for debugging only)
curl http://localhost:9001/health/active # central-a
curl http://localhost:9002/health/active # central-b
```
The Traefik container's `restart: unless-stopped` policy means it recovers automatically after a Docker host restart.
## Configuration
### Static config (`docker/traefik/traefik.yml`)
```yaml
entryPoints:
web:
address: ":80"
api:
dashboard: true
insecure: true
providers:
file:
filename: /etc/traefik/dynamic.yml
```
| Key | Value | Effect |
|-----|-------|--------|
| `entryPoints.web.address` | `:80` | Listens on container port 80, mapped to host port 9000. |
| `api.dashboard` | `true` | Enables the Traefik web dashboard. |
| `api.insecure` | `true` | Serves the dashboard on port 8080 without auth (development only). |
| `providers.file.filename` | `/etc/traefik/dynamic.yml` | Loads routing rules from the mounted dynamic config; no Docker socket required. |
### Dynamic config (`docker/traefik/dynamic.yml`)
```yaml
http:
routers:
central:
rule: "PathPrefix(`/`)"
service: central
entryPoints:
- web
services:
central:
loadBalancer:
healthCheck:
path: /health/active
interval: 5s
timeout: 3s
servers:
- url: "http://scadabridge-central-a:5000"
- url: "http://scadabridge-central-b:5000"
```
| Setting | Value | Effect |
|---------|-------|--------|
| `routers.central.rule` | `PathPrefix("/")` | Catches every request on the `web` entrypoint. |
| `services.central.loadBalancer.healthCheck.path` | `/health/active` | The endpoint Traefik polls on each backend. |
| `services.central.loadBalancer.healthCheck.interval` | `5s` | Poll cadence; a backend failing the check is removed within one interval. |
| `services.central.loadBalancer.healthCheck.timeout` | `3s` | Per-poll timeout; a non-responding backend counts as unhealthy. |
| `servers[0].url` | `http://scadabridge-central-a:5000` | `central-a` backend, reachable by container name on `scadabridge-net`. |
| `servers[1].url` | `http://scadabridge-central-b:5000` | `central-b` backend, reachable by container name on `scadabridge-net`. |
### Port mapping
| Host port | Container port | Purpose |
|-----------|---------------|---------|
| `9000` | `80` | Load-balanced entrypoint — all central traffic (Central UI, Management API, Inbound API). |
| `8180` | `8080` | Traefik dashboard. |
| `9001` | `5000` | Direct access to `central-a` (bypasses Traefik). |
| `9002` | `5000` | Direct access to `central-b` (bypasses Traefik). |
## Dependencies & Interactions
- [Host (#15)](./Host.md) — implements and serves `/health/active` via `ActiveNodeHealthCheck` (tagged `Active`, mounted by `app.MapZbHealth()`). Also implements `ActiveNodeGate`, which enforces the same active-node contract at the Inbound API filter level, providing a defence-in-depth layer if traffic reaches the standby directly.
- [Cluster Infrastructure (#13)](./ClusterInfrastructure.md) — the underlying Akka.NET cluster determines which node is the leader. Traefik's routing decision is derived entirely from cluster leadership state via the health-check poll; Traefik has no Akka dependency of its own.
- [Central UI (#9)](./CentralUI.md) — Blazor Server (SignalR/WebSocket circuits) is proxied through Traefik. Traefik proxies WebSocket connections natively with no additional config. On failover, active SignalR circuits on the failed node are lost; the browser's reconnection logic re-establishes the circuit on the new active node. Session continuity is preserved because authentication uses a cookie-embedded JWT with Data Protection keys shared across both central nodes.
- [Inbound API (#14)](./InboundAPI.md) — external API consumers target `http://localhost:9000/api/{methodName}`. Traefik routes each request to the active node; if a request reaches the standby directly (bypassing Traefik), `ActiveNodeGate` responds with HTTP 503.
- [CLI (#19)](./CLI.md) — the CLI connects to the Management API via `http://localhost:9000` (the Traefik entrypoint) by default, so it always reaches the active central node without needing to know which node is active.
## Troubleshooting
### Both backends show unhealthy on the dashboard
If both `central-a` and `central-b` appear red on the Traefik dashboard, neither node's `ActiveNodeHealthCheck` is returning 200. Common causes:
1. **Akka cluster has not formed yet** — both nodes are still starting. Wait for the cluster to stabilise (typically 10–15 seconds after both containers are up). Check the central node logs for `Cluster is now ready`.
2. **Split-brain resolver has downed both nodes** — a network partition followed by a split-brain condition. Restart the cluster via `bash docker/deploy.sh`.
3. **Traefik cannot reach the backends** — the `scadabridge-net` Docker network may not exist. Create it: `docker network create scadabridge-net`.
### Traffic reaches a standby node
If a client receives HTTP 503 with `X-ScadaBridge-Active: false`, the request reached a standby node — either because Traefik has not yet completed its health-check poll after a failover (up to 5 seconds), or because the client is connecting directly to port 9001/9002 instead of port 9000. Use `http://localhost:9000` for all normal access. The 503 is transient during the Traefik poll window; the client should retry.
### Health check succeeds but `/health/ready` returns degraded
`/health/active` and `/health/ready` are independent. A node can pass the active check (it is the leader) but fail the readiness check (database or Akka cluster health probe failed). Traefik only uses `/health/active`; readiness gating is for orchestration and monitoring. Check the node's structured logs for `database` or `akka-cluster` check failures.
## Related Documentation
- [Traefik Proxy design specification](../requirements/Component-TraefikProxy.md)
- [Host](./Host.md)
- [Cluster Infrastructure](./ClusterInfrastructure.md)
- [Central UI](./CentralUI.md)
- [Inbound API](./InboundAPI.md)
- [CLI](./CLI.md)
+257
View File
@@ -0,0 +1,257 @@
# Transport
The Transport component provides file-based, encrypted bundle export and import of central configuration artifacts between ScadaBridge environments via the Central UI. It is purely central — no site nodes are touched, no runtime state moves, and no site-scoped artifacts travel in a bundle.
## Overview
Transport (#24) is a central-only component that lives in `src/ZB.MOM.WW.ScadaBridge.Transport/`, split into three functional areas:
- `Export/` — `BundleExporter`, `DependencyResolver`, `ResolvedExport`. The export pipeline resolves artifact dependencies, serializes entities to wire-shaped DTOs, optionally encrypts the content, and produces a ZIP-formatted `.scadabundle` stream.
- `Import/` — `BundleImporter`, `BundleSessionStore`, `BundleSessionEvictionService`, `ArtifactDiff`, `BundleUnlockRateLimiter`. The import pipeline validates the bundle envelope, decrypts it, diffs against the target environment, and applies operator-chosen conflict resolutions in a single EF transaction.
- `Serialization/` and `Encryption/` — `BundleSerializer`, `EntitySerializer`, `ManifestBuilder`, `ManifestValidator`, `BundleSecretEncryptor`, `BundleManifestAad`. Stateless helpers that handle ZIP packing/unpacking, DTO projection, SHA-256 hashing, and AES-256-GCM authenticated encryption.
The single DI entry point is `ServiceCollectionExtensions.AddTransport`, registered by `Host` for central roles only. Stateless helpers are singletons; the exporter and importer are scoped because they reach into per-request EF Core scopes and audited repositories.
## Key Concepts
### Bundle format
A `.scadabundle` file is a ZIP archive with exactly two entries:
```text
bundle.scadabundle
├── manifest.json # always plaintext; never encrypted
└── content.json # plaintext artifact data (no passphrase)
OR content.enc # AES-256-GCM ciphertext (passphrase supplied)
```
`manifest.json` is always plaintext so the import wizard can display source provenance and artifact counts before the operator supplies a passphrase. `BundleManifest` carries: `BundleFormatVersion`, `SchemaVersion`, `CreatedAtUtc`, `SourceEnvironment`, `ExportedBy`, `ScadaBridgeVersion`, `ContentHash` (`sha256:<hex>` of the raw content bytes), optional `Encryption` metadata, a `Summary` (artifact counts by type), and a `Contents` list (one `ManifestContentEntry` per artifact with its `dependsOn` edges).
`BundleFormatVersion` is an integer gate: the importer requires `BundleFormatVersion` to equal `ManifestBuilder.CurrentBundleFormatVersion` (currently `1`) and rejects any other value — higher or lower — with `ManifestValidationResult.UnsupportedFormatVersion`. `TransportOptions.SchemaVersionMajor` is not read during import. Unknown entity types in `Contents` produce a preview-row classification of "unsupported" rather than aborting the whole import.
### Encrypted vs plaintext bundles
When a passphrase is supplied, `BundleSecretEncryptor` derives a 256-bit key via PBKDF2-SHA256 (default 600,000 iterations, configurable) from a fresh 16-byte random salt, then encrypts the UTF-8 JSON content using AES-256-GCM with a fresh 12-byte nonce. Output format is `ciphertext ‖ GCM-tag` (16-byte tag appended). The salt and nonce are stored in `manifest.json`; the passphrase is never persisted. An unencrypted export is permitted but produces an `UnencryptedBundleExport` audit event rather than `BundleExported`.
### AAD binding (T-005)
`BundleManifestAad.Compute` produces the AES-GCM additional authenticated data (AAD) by SHA-256-hashing a canonicalized form of the manifest with `ContentHash` zeroed and `Encryption` nulled. This binds `SourceEnvironment`, `ExportedBy`, `Summary`, and `Contents` to the GCM authentication tag. Tampering with any of those fields on a stolen bundle yields an `AuthenticationTagMismatchException` on decryption, making the Step-4 "type the source environment to confirm" gate tamper-evident.
### BundleSession and session lifecycle
After `LoadAsync` validates and decrypts a bundle, the plaintext content is stored in a `BundleSession` held by the singleton `BundleSessionStore` (a `ConcurrentDictionary<Guid, BundleSession>`). Sessions have a 30-minute TTL (`BundleSessionTtlMinutes`). `BundleSessionEvictionService` sweeps the store every minute so abandoned sessions — and the secrets they carry — are released without waiting for the next `Get` call. `ApplyAsync` explicitly zeros and removes the session on both success and failure (T-007).
### Conflict resolution and `BundleImportId` correlation
`PreviewAsync` compares each bundle artifact against the target environment using `ArtifactDiff`, classifying items as `Identical`, `Modified`, `New`, or `Blocker`. The operator assigns a `ResolutionAction` (`Add`, `Overwrite`, `Skip`, `Rename`) per item. `ApplyAsync` honours those resolutions in a single EF transaction and threads a new `BundleImportId` GUID through every per-entity audit row via the scoped `IAuditCorrelationContext`. This makes every configuration row written by a bundle import queryable as a group from the audit log.
## Architecture
### Export pipeline
`BundleExporter.ExportAsync` orchestrates these steps in sequence:
1. `DependencyResolver.ResolveAsync` — expands the operator's `ExportSelection` to the full transitive closure, then topologically sorts templates (base-before-derived via Kahn's algorithm).
2. `EntitySerializer.ToBundleContent` — projects EF entity POCOs to wire-shaped DTOs (`BundleContentDto`), carving secret fields (connection strings, credentials, OAuth tokens) into per-entity `SecretsBlock` records.
3. `ManifestBuilder.Build` — stamps `BundleFormatVersion`, `SchemaVersion`, SHA-256 `ContentHash`, `Summary`, and `Contents` into a `BundleManifest`.
4. `BundleSerializer.Pack` — serializes the manifest and content into a ZIP stream. When a passphrase is present, `BundleSecretEncryptor.Encrypt` runs with a fresh salt and nonce; `Pack` re-stamps `ContentHash` and `Encryption` in the manifest against the ciphertext it actually writes.
5. Audit — `IAuditService.LogAsync` writes one `BundleExported` (or `UnencryptedBundleExport`) row with the SHA-256 of the full ZIP stream as `EntityId`.
```csharp
public sealed class BundleExporter : IBundleExporter
{
public async Task<Stream> ExportAsync(
ExportSelection selection,
string user,
string sourceEnvironment,
string? passphrase,
CancellationToken cancellationToken = default)
{
var resolved = await _resolver.ResolveAsync(selection, cancellationToken);
var aggregate = new EntityAggregate(/* resolved collections */);
var contentDto = _entitySerializer.ToBundleContent(aggregate);
var summary = new BundleSummary(/* counts from resolved */);
EncryptionMetadata? encryptionSeed = passphrase is null ? null
: new EncryptionMetadata("AES-256-GCM", "PBKDF2-SHA256",
_options.Value.Pbkdf2Iterations, string.Empty, string.Empty);
var manifest = _manifestBuilder.Build(sourceEnvironment, user, assemblyVersion,
encryptionSeed, summary, resolved.ContentManifest,
_bundleSerializer.SerializeContentBytes(contentDto));
var zipStream = _bundleSerializer.Pack(contentDto, manifest, passphrase, _encryptor);
var bundleHash = ComputeStreamSha256(zipStream);
await _auditService.LogAsync(user,
passphrase is null ? "UnencryptedBundleExport" : "BundleExported",
"Bundle", bundleHash, sourceEnvironment, /* afterState */, cancellationToken);
zipStream.Position = 0;
return zipStream;
}
}
```
### Dependency expansion
`DependencyResolver` walks five dependency edge types. The first four are followed only when `ExportSelection.IncludeDependencies` is true; the template folder ancestor chain is always included:
| Edge | Mechanism |
|------|-----------|
| Template composes Template | `TemplateComposition.ComposedTemplateId` (BFS over composition graph) |
| Template references SharedScript | Name-scan of `TemplateScript.Code`, `TemplateAttribute.Value`, and `TemplateAttribute.DataSourceReference` |
| Template references ExternalSystem | Name-scan of `TemplateScript.Code`, `TemplateAttribute.DataSourceReference`, and `TemplateAttribute.Value` |
| ApiMethod references SharedScript | Name-scan of `ApiMethod.Script` |
| Template folder ancestor chain | Always included regardless of `IncludeDependencies` |
`ExternalSystemMethod` records always travel with their parent `ExternalSystemDefinition`. Templates are emitted in topological order (base-before-derived) so the importer can `Apply*` them in sequence without forward-reference gaps.
Inbound API keys are explicitly excluded — per re-architecture C4, keys are environment-specific and must be re-issued on the target cluster. `ApiMethod` definitions travel without key bindings.
### Import pipeline
`BundleImporter` is a three-phase service:
**Phase 1 — `LoadAsync`**: copies the upload stream to a seekable `MemoryStream`, enforces the bundle size cap (`MaxBundleSizeMb`), validates the ZIP envelope (entry count, per-entry decompressed size, compression ratio — all before decompression), reads and validates `manifest.json` (format version, SHA-256 content hash), and decrypts `content.enc` when `Encryption` is present. The decrypted bytes are stored in a new `BundleSession`.
Passphrase lockout operates at two levels: per-bundle (3-strike counter in `BundleSessionStore`, keyed by `ContentHash` so a second browser tab sharing the same bundle bytes cannot reset the counter) and per-IP-per-hour (`BundleUnlockRateLimiter`, default 10 attempts). A successful decrypt clears the per-bundle counter.
```csharp
// From BundleImporter.LoadAsync — decrypt path (simplified)
var aad = Encryption.BundleManifestAad.Compute(manifest);
try
{
decryptedContent = _encryptor.Decrypt(contentBytes, manifest.Encryption, passphrase, aad);
}
catch (CryptographicException)
{
var newCount = _sessionStore.IncrementUnlockFailureCount(manifest.ContentHash);
if (newCount >= maxAttempts)
throw new BundleLockedException(manifest.ContentHash, newCount);
throw;
}
_sessionStore.ClearUnlockFailures(manifest.ContentHash);
```
**Phase 2 — `PreviewAsync`**: deserializes the session's plaintext bytes to `BundleContentDto` and calls `ArtifactDiff.Compare*` methods for each entity type. Diff results use `ConflictKind` (`Identical`, `Modified`, `New`, `Blocker`). `Modified` items carry a `FieldDiffJson` payload with changed-field names and old/new values; script bodies record a line-count delta rather than full text to keep the diff compact. `DetectBlockersAsync` scans script bodies for unresolvable `SharedScript` or `ExternalSystem` name references.
**Phase 3 — `ApplyAsync`**: runs semantic validation first (a name-resolution scan plus the full `SemanticValidator` from `TemplateEngine`), then applies all resolutions inside one EF transaction. The correlation GUID is set on `IAuditCorrelationContext.BundleImportId` before any writes so that every `IAuditService.LogAsync` call during the apply picks it up automatically. Three `SaveChangesAsync` calls handle forward references: an intermediate flush inside `ApplyTemplatesAsync` materializes folder identity values so that template `FolderId` foreign keys can be wired correctly; a second flush after all `Apply*` helpers materializes row identities before `ResolveAlarmScriptLinksAsync` and `ResolveCompositionEdgesAsync` run; a third flush commits the `BundleImported` audit row just before `CommitAsync`. All three flushes operate inside the same outer transaction. On failure, the transaction rolls back, `BundleImportId` is cleared, and a `BundleImportFailed` row is written outside the rolled-back transaction before the exception propagates.
```csharp
// From BundleImporter.ApplyAsync — correlation + transaction pattern
_correlationContext.BundleImportId = bundleImportId;
await using var tx = await _dbContext.Database.BeginTransactionAsync(ct);
try
{
var errors = await RunSemanticValidationAsync(content, resolutionMap, ct);
if (errors.Count > 0) throw new SemanticValidationException(errors);
await ApplyTemplateFoldersAsync(/* ... */);
await ApplyTemplatesAsync(/* ... */);
// ... other entity types ...
await _dbContext.SaveChangesAsync(ct); // flush for FK resolution
await ResolveAlarmScriptLinksAsync(/* ... */);
await ResolveCompositionEdgesAsync(/* ... */);
await _auditService.LogAsync(user, "BundleImported", "Bundle",
bundleImportId.ToString(), session.Manifest.SourceEnvironment, /* afterState */, ct);
await _dbContext.SaveChangesAsync(ct);
await tx.CommitAsync(ct);
ZeroDecryptedContent(session);
_sessionStore.Remove(sessionId);
return new ImportResult(BundleImportId: bundleImportId, /* counts */);
}
catch
{
await tx.RollbackAsync(ct);
_correlationContext.BundleImportId = null;
await _auditService.LogAsync(user, "BundleImportFailed", /* ... */);
ZeroDecryptedContent(session);
_sessionStore.Remove(sessionId);
throw;
}
```
## Usage
The Central UI surfaces Transport through two wizard pages (see [Central UI](./CentralUI.md)):
- **Export** (`/design/transport/export`, Design role): a 4-step wizard — select artifacts, review resolved dependencies, set a passphrase, download the `.scadabundle` file.
- **Import** (`/design/transport/import`, Admin role): a 5-step wizard — upload bundle, enter passphrase, review the diff and set per-artifact resolutions, confirm (operator types the source environment name), view the result and navigate to the Deployments page for any newly stale instances.
The same operations are available via the CLI:
```bash
scadabridge bundle export --output FILE --passphrase X [--templates A,B] \
[--include-dependencies] [--source-environment NAME]
scadabridge bundle preview --input FILE --passphrase X
scadabridge bundle import --input FILE --passphrase X [--on-conflict skip|overwrite|rename]
```
CLI commands route through `ManagementActor` handlers (`ExportBundleCommand`, `PreviewBundleCommand`, `ImportBundleCommand`), which delegate to the same `IBundleExporter` / `IBundleImporter` scoped services. Bundle bytes ride the existing `/management` JSON envelope as base64.
After import, template changes propagate to deployed instances through revision-hash drift detection in `DeploymentService.CompareAsync`. Transport does not write a stale marker — the existing Deployments page surfaces affected instances automatically.
## Configuration
`TransportOptions` is bound from the `ScadaBridge:Transport` section.
| Key | Default | Description |
|-----|---------|-------------|
| `SourceEnvironment` | `"scadabridge"` | Environment label stamped in `manifest.json` and used in export filenames. |
| `SchemaVersionMajor` | `1` | Major schema version stamped in exported manifests. Not read by the importer; import version-gating uses `ManifestBuilder.CurrentBundleFormatVersion` directly. |
| `BundleSessionTtlMinutes` | `30` | TTL for an in-progress import session. |
| `MaxBundleSizeMb` | `100` | Upload size cap; enforced before any decompression. |
| `MaxBundleEntryCount` | `4` | Maximum ZIP entries (a valid bundle has exactly 2). |
| `MaxBundleEntryDecompressedMb` | `200` | Per-entry decompressed size cap (ZIP-bomb defence). |
| `MaxBundleEntryCompressionRatio` | `50` | Per-entry compression ratio cap (ZIP-bomb defence). |
| `MaxUnlockAttemptsPerSession` | `3` | Per-bundle passphrase strike limit. |
| `MaxUnlockAttemptsPerIpPerHour` | `10` | Per-IP trailing-hour unlock attempts. |
| `Pbkdf2Iterations` | `600000` | PBKDF2-SHA256 iteration count for key derivation. |
`SourceEnvironment` should be set per environment (e.g., `dev-cluster`, `prod-cluster`) so the import wizard's confirmation gate works correctly.
## Dependencies & Interactions
- [Commons (#16)](./Commons.md) — owns `BundleManifest`, `ExportSelection`, `ImportPreview`, `ImportResolution`, `ImportResult`, `BundleSession`, `EncryptionMetadata`, `BundleSummary`, `ManifestContentEntry`, `ConflictKind`, and the `IBundleExporter`, `IBundleImporter`, `IBundleSessionStore`, `IAuditCorrelationContext` interfaces. Transport implementations bind to these contracts; Commons defines nothing Transport-specific beyond the DTOs and interfaces.
- [Configuration Database (#17)](./ConfigurationDatabase.md) — supplies all repository implementations (`ITemplateEngineRepository`, `IExternalSystemRepository`, `INotificationRepository`, `IInboundApiRepository`), `IAuditService` for per-entity audit rows, the `IAuditCorrelationContext` implementation (`AuditCorrelationContext`) registered as scoped, the `ScadaBridgeDbContext` used for the import transaction, and the EF migration that adds `BundleImportId uniqueidentifier NULL` (with index `IX_AuditLogEntries_BundleImportId`) to `AuditLogEntries`.
- [Template Engine (#1)](./TemplateEngine.md) — provides `SemanticValidator`, invoked inside `ApplyAsync` before the transaction commits. The importer feeds each imported `TemplateDto` through the validator alongside the combined in-bundle + pre-existing `SharedScript` catalog; validation errors surface as `SemanticValidationException` and roll back the entire import.
- [Audit Log / Configuration Audit](./AuditLog.md) — every export produces a `BundleExported` or `UnencryptedBundleExport` row; every import produces a `BundleImported` summary row (or `BundleImportFailed` on rollback). Per-entity rows written by `Apply*` helpers carry `BundleImportId` so operators can query all configuration changes from a single import as a group. `BundleImportUnlockFailed` rows are written on passphrase failures. Warning rows `BundleImportAlarmScriptUnresolved` and `BundleImportCompositionUnresolved` are written when second-pass FK rewire cannot resolve a name.
- [Central UI](./CentralUI.md) — hosts the Export Bundle page under the Design nav group and the Import Bundle page under the Admin nav group; the import result page links to the Deployments page and to the filtered Configuration Audit Log Viewer pre-populated with the completed `BundleImportId`.
- [Security & Auth (#10)](./Security.md) — enforces `RequireDesign` on export and `RequireAdmin` on import, both at the Razor page layer and inside the `IBundleExporter` / `IBundleImporter` service entrypoints (defense in depth).
- [Deployment Manager (#2)](./DeploymentManager.md) — not directly called by Transport; template overwrites naturally change the flattened-config hash that `DeploymentService.CompareAsync` reads, causing affected instances to surface as stale on the Deployments page.
## Troubleshooting
### Bundle upload rejected at format version check
`LoadAsync` throws `NotSupportedException` when `manifest.json` carries a `bundleFormatVersion` that does not equal `ManifestBuilder.CurrentBundleFormatVersion` (currently `1`). Any non-matching value — whether higher or lower — is rejected. Upgrade the target cluster or re-export from a version that produces format version `1`.
### Content hash mismatch on upload
`LoadAsync` throws `InvalidDataException("Bundle content hash does not match manifest — file may be corrupt.")`. The ZIP was corrupted in transit. Compare the SHA-256 shown in the export wizard's Step 4 against the downloaded file and re-export if they differ.
### Session expired between diff and apply
`PreviewAsync` or `ApplyAsync` throws when the `BundleSession` is not found. The 30-minute TTL elapsed while the operator was reviewing the diff. Re-upload the bundle to start a new session.
### Apply rolls back with SemanticValidationException
A template's scripts reference a `SharedScript` or `ExternalSystem` that exists neither in the bundle nor in the target environment, or a type mismatch exists in a call argument. The exception lists per-template errors. Either re-export with the missing dependency included, or pre-create the missing artifact in the target environment before importing.
### Passphrase lockout
After 3 wrong passphrase attempts against the same bundle (keyed by `ContentHash`), `BundleImporter.LoadAsync` throws `BundleLockedException`. The session is unusable. Re-upload the bundle file to get a new session with a fresh counter. A `BundleImportUnlockFailed` audit row is written on each failed attempt.
### BundleImportAlarmScriptUnresolved / BundleImportCompositionUnresolved warnings
These audit rows appear when the second-pass rewire (`ResolveAlarmScriptLinksAsync` / `ResolveCompositionEdgesAsync`) cannot match a name to a persisted row. The import commits — the FK is left null / the composition row is skipped — but the warning signals an incomplete import. Re-examine the bundle's dependency graph and re-export with the missing artifacts included.
## Related Documentation
- [Transport design specification](../requirements/Component-Transport.md)
- [Central UI](./CentralUI.md)
- [Audit Log](./AuditLog.md)
- [Template Engine](./TemplateEngine.md)
- [Configuration Database](./ConfigurationDatabase.md)
- [Commons](./Commons.md)
- [Security](./Security.md)
- [Deployment Manager](./DeploymentManager.md)
+197
View File
@@ -0,0 +1,197 @@
# Tree View
`TreeView<TItem>` is a generic, reusable Blazor Server component that renders any tree-shaped data as an expandable/collapsible hierarchy with ARIA roles, optional guide lines, single or checkbox selection, and session-persistent expansion state.
## Overview
The component lives at `src/ZB.MOM.WW.ScadaBridge.CentralUI/Components/Shared/TreeView.razor` alongside its scoped stylesheet `TreeView.razor.css`. It is data-agnostic: the caller supplies the data source, a children accessor, a key function, and a `RenderFragment<TItem>` for node content — the component owns only the structural chrome (indentation, toggle, guide lines, ARIA attributes, selection highlight, context menu positioning).
Active uses within the Central UI:
- **Data Connections page** (`DataConnections.razor`) — two-level Site → Connection tree with a kebab action menu per node and search-based dimming.
- **Topology page** (`Topology.razor`) — three-level Site → Area → Instance tree with inline rename, context menus, and `StorageKey = "topology-tree"` expansion persistence.
- **`TemplateFolderTree`** (`TemplateFolderTree.razor`) — a domain-specific wrapper that projects `TemplateFolder` / `Template` entities into `TemplateTreeNode` items and delegates to `TreeView<TemplateTreeNode>`. Consumed by the Templates browser page (Single mode, click-to-navigate) and the Transport Export wizard (Checkbox mode, bulk template selection).
## Key Concepts
### Generic node typing
`@typeparam TItem` means the component imposes no node base class. The consumer brings its own model — a record, a class, an interface — and wires four `[EditorRequired]` delegates that teach the component how to navigate it. The `TemplateTreeNode` class in `Components/Shared/TemplateTreeNode.cs` is the shared adapter type for template/folder hierarchies; the Data Connections and Topology pages use local `record` types.
### Expansion state
Expanded-node keys are stored in `_expandedKeys` (`HashSet<string>`). Keys are stringified via `KeyStr(object) => key.ToString()!` for consistency with `sessionStorage`, which stores them as a JSON array when `StorageKey` is set. On first render the component reads `sessionStorage` via `treeviewStorage.load`; on every toggle it writes back via `treeviewStorage.save`. `InitiallyExpanded` applies only when no persisted state exists (or `StorageKey` is null). The two sources are always unioned — a `RevealNode` call before the async storage read completes is not clobbered.
### Selection modes
`TreeViewSelectionMode` (defined in `TreeViewSelectionMode.cs`) controls how nodes are selected:
```csharp
public enum TreeViewSelectionMode
{
Single, // Default. Clicking node content fires SelectedKeyChanged with the node key.
Checkbox, // Renders a tri-state checkbox per node. Folder check state is aggregated
// from descendant leaves; only leaf keys enter SelectedKeys.
}
```
In `Single` mode the component uses `SelectedKey` / `SelectedKeyChanged` (two-way binding on a single `object?`). In `Checkbox` mode it uses `SelectedKeys` / `SelectedKeysChanged` (`HashSet<object>`). Checking a folder selects or deselects all of its descendant leaf keys. The `indeterminate` checkbox property is set via JS interop (`treeviewStorage.setIndeterminate`) after every render because Blazor does not bind `input.indeterminate` natively.
### Context menu
When `ContextMenu` is non-null, right-clicking any row suppresses the browser default and positions a Bootstrap `dropdown-menu show` div at the cursor coordinates using `position: fixed`. An invisible overlay behind the menu dismisses it on click-outside; Escape also dismisses it. The menu receives the `TItem` of the right-clicked node, so the consumer's fragment can branch on node type.
## Architecture
The component is a single `@typeparam` `.razor` file with a private `void RenderNode(TItem item, int depth)` local function that recurses the tree at render time — no intermediate view model is built inside the component. Every `<li>` carries `@key="key"` so Blazor can diff the list efficiently.
`IJSRuntime` is injected for two purposes: reading/writing `sessionStorage` for expansion persistence, and setting `input.indeterminate` for tri-state checkboxes. Both call sites guard `JSDisconnectedException` so a disconnected circuit never throws out of the lifecycle methods.
The public surface the caller can invoke via `@ref`:
```csharp
public bool IsExpanded(object key); // Whether the given key is currently expanded.
public void ExpandAll(); // Expand every branch node; persists if StorageKey set.
public void CollapseAll(); // Collapse every node; clears persisted state.
public async Task RevealNode(object key, bool select = false);
// Expands all ancestors of the given key. Optionally selects the node.
// No-op if the key does not exist in the current tree.
```
## Usage
The Data Connections page binds a two-level Site → Connection tree with `StorageKey`, single selection, and a context menu for Edit and Delete actions on connection nodes:
```razor
<TreeView @ref="_tree" TItem="DcTreeNode" Items="_treeRoots"
ChildrenSelector="n => n.Children"
HasChildrenSelector="n => n.Children.Count > 0"
KeySelector="n => (object)n.Key"
StorageKey="data-connections-tree"
Selectable="true"
SelectedKey="_selectedKey"
SelectedKeyChanged="OnTreeNodeSelected">
<NodeContent Context="node">
@if (node.Kind == DcNodeKind.Site)
{
<span class="tv-label fw-semibold">@node.Label</span>
<span class="badge bg-secondary ms-1">@node.Children.Count</span>
}
else
{
<span class="tv-label">@node.Label</span>
<span class="badge bg-info ms-2">@node.Connection!.Protocol</span>
}
</NodeContent>
<ContextMenu Context="node">
@if (node.Kind == DcNodeKind.Site)
{
<button class="dropdown-item"
@onclick="() => AddConnectionForSite(node.SiteId!.Value)">
Add Connection here
</button>
}
else
{
<button class="dropdown-item"
@onclick='() => NavigationManager.NavigateTo($"/design/connections/{node.Connection!.Id}/edit")'>
Edit
</button>
<div class="dropdown-divider"></div>
<button class="dropdown-item text-danger"
@onclick="() => DeleteConnection(node.Connection!)">
Delete
</button>
}
</ContextMenu>
<EmptyContent>
<span class="text-muted fst-italic">No sites configured. Add sites under Admin → Sites.</span>
</EmptyContent>
</TreeView>
@code {
record DcTreeNode(string Key, string Label, DcNodeKind Kind, List<DcTreeNode> Children,
int? SiteId = null, DataConnection? Connection = null);
enum DcNodeKind { Site, DataConnection }
private TreeView<DcTreeNode>? _tree;
private object? _selectedKey;
private void OnTreeNodeSelected(object? key) => _selectedKey = key;
}
```
The `TemplateFolderTree` wrapper demonstrates `Checkbox` mode, where `SelectedKeys` / `SelectedKeysChanged` drive bulk template selection in the Transport Export wizard:
```razor
<TreeView @ref="_tree" TItem="TemplateTreeNode"
Items="_visibleRoots"
ChildrenSelector="n => n.Children"
HasChildrenSelector="n => n.Children.Count > 0"
KeySelector="n => (object)n.Key"
Selectable="@(SelectionMode == TreeViewSelectionMode.Single)"
SelectionMode="SelectionMode"
SelectedKeys="SelectedKeys"
SelectedKeysChanged="SelectedKeysChanged"
InitiallyExpanded="@(_initiallyExpanded)"
StorageKey="@StorageKey">
<NodeContent Context="node">
<span class="tv-glyph"><i class="bi @(NodeGlyph(node))"></i></span>
<span class="tv-label @(node.Children.Count > 0 ? "fw-semibold" : "")"
title="@node.Name">@node.Name</span>
</NodeContent>
</TreeView>
```
## Configuration
All `[Parameter]` properties on `TreeView<TItem>`. Parameters marked **required** carry `[EditorRequired]` and must be supplied; omitting them produces a build warning.
| Parameter | Type | Default | Description |
|---|---|---|---|
| `Items` | `IReadOnlyList<TItem>` | — | **Required.** Root-level nodes to render. |
| `ChildrenSelector` | `Func<TItem, IReadOnlyList<TItem>>` | — | **Required.** Returns the ordered children of a node. |
| `HasChildrenSelector` | `Func<TItem, bool>` | — | **Required.** Returns `true` for branch nodes. Determines whether the expand toggle is rendered. |
| `KeySelector` | `Func<TItem, object>` | — | **Required.** Unique stable key per node. Used for expansion tracking, selection, and `@key` diffing. |
| `NodeContent` | `RenderFragment<TItem>` | — | **Required.** Render fragment for node label content. Receives the `TItem`; responsible for all domain-specific markup (glyphs, labels, badges). |
| `EmptyContent` | `RenderFragment?` | `null` | Shown when `Items` is empty or null. |
| `ContextMenu` | `RenderFragment<TItem>?` | `null` | Right-click menu content. Receives the right-clicked node. If null, right-click is not intercepted and the browser default is preserved. If non-null, `@oncontextmenu:preventDefault` is always active — the browser default is suppressed for every node regardless of whether the fragment renders any items for that node type. |
| `IndentPx` | `int` | `24` | Pixels of left padding added per depth level via inline `style`. |
| `ShowGuideLines` | `bool` | `true` | Adds `tv-guides` CSS class to the root `<ul>`, enabling the depth guide lines drawn by a `linear-gradient` pseudo-element in `TreeView.razor.css`. |
| `InitiallyExpanded` | `Func<TItem, bool>?` | `null` | Predicate used to expand matching nodes on first load. When `StorageKey` is null it is applied immediately (synchronously in `OnParametersSet`). When `StorageKey` is set, the predicate is applied only after the async storage read completes and returns empty — persisted state takes precedence and `InitiallyExpanded` is a fallback for first-ever loads. |
| `StorageKey` | `string?` | `null` | Browser `sessionStorage` key for expansion persistence (`treeview:{StorageKey}`). When null, expansion is in-memory only. |
| `Selectable` | `bool` | `false` | Enables click-to-select on node content. Clicking the expand toggle never changes selection. |
| `SelectedKey` | `object?` | `null` | Currently selected node key for `Single` mode. Supports two-way binding (`@bind-SelectedKey`). |
| `SelectedKeyChanged` | `EventCallback<object?>` | — | Fires when selection changes in `Single` mode. Also fires on `RevealNode(..., select: true)`. Fires with `null` when the selected key disappears from the tree. |
| `SelectedCssClass` | `string` | `"bg-primary bg-opacity-10"` | CSS class(es) applied to the selected node's row div in addition to `tv-selected`. |
| `SelectionMode` | `TreeViewSelectionMode` | `Single` | Switches between single-key selection and tri-state checkbox selection. |
| `SelectedKeys` | `HashSet<object>?` | `null` | Set of currently selected leaf keys for `Checkbox` mode. |
| `SelectedKeysChanged` | `EventCallback<HashSet<object>>` | — | Fires with the updated set when any checkbox is toggled in `Checkbox` mode. Always fires with a fresh `HashSet` reference. |
### CSS utility classes in `NodeContent`
The scoped stylesheet defines layout slots that `NodeContent` fragments should use for consistent alignment:
| Class | Purpose |
|---|---|
| `tv-glyph` | 20 px flex slot for a Bootstrap Icon (`<i class="bi bi-…">`). |
| `tv-label` | `flex: 1 1 auto; min-width: 0` — primary text with ellipsis overflow. |
| `tv-meta` | `margin-left: auto` — right-aligned badges or trailing controls. |
| `tv-kebab` | Opt-in hidden-by-default "more actions" slot; revealed on row hover. |
## Dependencies & Interactions
- **Bootstrap 5** — all state visuals use Bootstrap utility classes and CSS variables (`--bs-tertiary-bg`, `--bs-border-color`, `--bs-primary-rgb`). No third-party Blazor component frameworks.
- **Bootstrap Icons** — static files served from `wwwroot/lib/bootstrap-icons/`; referenced once in `MainLayout.razor`. `NodeContent` fragments use `<i class="bi bi-…">` inside the `tv-glyph` slot.
- **`IJSRuntime`** — injected for `treeviewStorage.load` / `treeviewStorage.save` (expansion persistence) and `treeviewStorage.setIndeterminate` (checkbox tri-state). The JS helpers live in the CentralUI's shared JS bundle.
- **`TemplateFolderTree`** (`Components/Shared/TemplateFolderTree.razor`) — a domain-specific wrapper around `TreeView<TemplateTreeNode>` that handles folder/template tree construction, text filtering, and `ExtraTemplateChildren` injection. Consumers that need the template hierarchy use `TemplateFolderTree`; they do not wire `TreeView<TemplateTreeNode>` directly.
- **`TemplateTreeNode` / `TemplateTreeNodeKind`** (`Components/Shared/TemplateTreeNode.cs`) — the shared node model used by `TemplateFolderTree` and its callers. Folder keys are prefixed `f:`, template keys `t:`, composition keys `c:`.
- **Data Connections page** (`Components/Pages/Design/DataConnections.razor`) — binds `TreeView<DcTreeNode>` directly with a local two-level record type.
- **Topology page** (`Components/Pages/Deployment/Topology.razor`) — binds `TreeView` for the Site → Area → Instance hierarchy; calls `ExpandAll` and `CollapseAll` via `@ref`.
- **Central UI component** — see [./CentralUI.md](./CentralUI.md) for the broader Blazor Server application context.
## Related Documentation
- [Tree View design specification](../requirements/Component-TreeView.md)
- [Central UI](./CentralUI.md)
- [Template Engine](./TemplateEngine.md)
+4 -3
View File
@@ -18,9 +18,10 @@
- [ ] EF Core migrations have been applied (SQL script reviewed and executed)
- [ ] `ScadaBridge:Security:JwtSigningKey` is at least 32 characters, randomly generated
- [ ] **Both central nodes use the same JwtSigningKey** (required for JWT failover)
- [ ] `ScadaBridge:Security:LdapServer` points to the production LDAP/AD server
- [ ] `ScadaBridge:Security:LdapUseTls` is `true` (LDAPS required in production)
- [ ] `ScadaBridge:Security:AllowInsecureLdap` is `false`
- [ ] `ScadaBridge:Security:Ldap:Server` points to the production LDAP/AD server
- [ ] `ScadaBridge:Security:Ldap:Transport` is `Ldaps` (LDAPS required in production)
- [ ] `ScadaBridge:Security:Ldap:AllowInsecure` is `false`
- [ ] LDAP service-account password supplied via env var `ScadaBridge__Security__Ldap__ServiceAccountPassword` (renamed from `ScadaBridge__Security__LdapServiceAccountPassword` in the Task 1.4 nested-config cutover)
- [ ] LDAP search base DN is correct for the organization
- [ ] LDAP group-to-role mappings are configured
- [ ] Load balancer is configured in front of central UI (sticky sessions not required)
@@ -0,0 +1,225 @@
# Central report pages hang ~30s — NotificationOutbox / SiteCallAudit singleton query Asks never reply
**Status:** FIXED — verified 2026-06-05 (pending commit) · **Severity:** High (real users see 30s page loads) · **Found:** 2026-06-05
**Components:** Notification Outbox (#21), Site Call Audit (#22), Central UI (#9), Host/cluster (#15/#13)
## FIX APPLIED & VERIFIED (2026-06-05)
`HOST-021`. The Akka `ActorSystem` DI bridge was changed from `AddTransient` to a **singleton**
routed through a new lazy, idempotent, thread-safe `AkkaHostedService.GetOrCreateActorSystem()`
(creates the system once on first call from either `StartAsync` or the DI factory). A singleton
is resolved from the root provider and is never disposed by a per-probe health-check child
scope, so the `ActorSystem.Dispose()` → `Terminate()` no longer fires; routing through the
creator (rather than a plain `AddSingleton(sp => …ActorSystem)` factory) avoids caching a
`null` if a probe wins the startup race.
Files:
- `src/ZB.MOM.WW.ScadaBridge.Host/Actors/AkkaHostedService.cs` — new `GetOrCreateActorSystem()`
+ `_actorSystemLock`; `StartAsync` calls it instead of creating the system inline.
- `src/ZB.MOM.WW.ScadaBridge.Host/Program.cs` (central) and `SiteServiceRegistration.cs` (site) —
`AddTransient<ActorSystem>` → `AddSingleton<ActorSystem>(sp => …GetOrCreateActorSystem())`.
Verification after `bash docker/deploy.sh`:
- `ActorSystemTerminateReason` post-startup occurrences: **0** on both central nodes (was 1/boot).
- `/health/active`: central-a **Healthy "Active node (cluster leader)"**, central-b **"Up but
not the cluster leader"** — correct active/standby (was both Standby Exiting/Removed).
- Page render: `/notifications/report` **0.069s**, `/notifications/kpis` **0.091s**,
`/site-calls/report` **0.026s**, `/monitoring/health` **0.058s** (all were ~30s+).
- Playwright E2E: **68 passed / 0 failed / 0 skipped** (was 62/6/0).
## ROOT CAUSE (confirmed 2026-06-05 — supersedes the hypotheses below)
The Akka `ActorSystem` is a **process singleton owned by `AkkaHostedService`**, but it is
registered into DI as a **`Transient`** via a factory:
```csharp
// Program.cs:211 (central) and SiteServiceRegistration.cs:82 (site)
builder.Services.AddTransient<Akka.Actor.ActorSystem>(sp =>
sp.GetRequiredService<AkkaHostedService>().ActorSystem!);
```
`ActorSystem` is `IDisposable`. In Microsoft.Extensions.DependencyInjection, an `IDisposable`
produced by a `Transient`/`Scoped` factory is **captured for disposal by the scope that
resolved it**. The shared `ZB.MOM.WW.Health.Akka` checks (`AkkaClusterHealthCheck`,
`ActiveNodeHealthCheck`) are registered with `AddTypeActivatedCheck` and resolve the system
**lazily per probe** — `_serviceProvider.GetService<ActorSystem>()`
(`AkkaClusterHealthCheck.cs:42`, `ActiveNodeHealthCheck.cs:102`). `HealthCheckService` runs
each probe in its **own child scope**, so every `/health/ready` and `/health/active` probe:
1. resolves the live `ActorSystem` (a `Transient`) into the probe's child scope,
2. the probe completes and `HealthCheckService` disposes the scope,
3. the container disposes the captured `ActorSystem` → `ActorSystem.Dispose()` →
`CoordinatedShutdown.Run(ActorSystemTerminateReason)` → the node Leaves → Exiting → the
actor system terminates.
The ASP.NET host process keeps running (only a DI-tracked transient was disposed; the
hosted service's `StopAsync`/`ClrExitReason` path never runs), so the node is left
**permanently dead** — member status frozen at `Exiting` (central-a) / `Removed` (central-b),
no `Up` member, the cluster singletons have no host, and every Central UI page that `Ask`s a
singleton proxy buffers the message until the 30s `QueryTimeout`. The health checks meant to
*report* cluster status are what *kill* the cluster.
**Evidence (clean redeploy, 2026-06-05 11:43):** central-a forms its cluster, goes `Up`, the
singletons start + are identified (11:43:10.77); the first `GET /health/active` lands at
11:43:14; `CoordinatedShutdown … ActorSystemTerminateReason … ExitCode:0` fires immediately
(11:43:14.801); node Leaves → Exiting → "Successfully shut down" (11:43:24); process stays up
serving HTTP. central-b shows the identical pattern at 11:43:17. `/health/ready` then = 503 on
central-b, and `/health/active` = `Standby: node is not Up (status: Exiting/Removed)` on both.
No application code calls `.Terminate()` (grep), confirming the disposal path.
**Why earlier analysis missed it:** the prior hypotheses examined the actor handler, proxy
wiring, singleton lifecycle, and DB — all of which are correct. They are irrelevant because
the `ActorSystem` is simply **dead** by the time a page queries it. "Deterministic, survives
restart and full redeploy" is fully explained: it is a DI-lifetime code defect that
re-triggers on the first post-`Up` health probe every boot.
**Fix (pending, task #48):** stop the container from disposing the externally-owned
`ActorSystem`. It must be resolvable from DI as the live instance (the kit calls
`GetService<ActorSystem>()`), re-readable (must not cache `null` during warmup), and never
disposed by a child scope. A `Transient`/`Scoped` factory returning the `IDisposable` system
is always captured by the resolving scope, and a plain `AddSingleton(factory)` caches whatever
the first resolve sees (→ permanent `null` if a probe wins the warmup race). The chosen fix is
a lazy, idempotent, thread-safe `AkkaHostedService.GetOrCreateActorSystem()` (creates the
system once on first call from either `StartAsync` or the DI factory) registered as
`AddSingleton<ActorSystem>(sp => sp.GetRequiredService<AkkaHostedService>().GetOrCreateActorSystem())`
— a process singleton, so child-scope disposal never touches it, and never `null` because the
first resolve creates it. Apply in **both** the central (`Program.cs`) and site
(`SiteServiceRegistration.cs`) registrations.
## Summary
The Central UI pages that query the central **cluster singletons** — `/notifications/report`,
`/notifications/kpis`, `/site-calls/report`, and the `/monitoring/health` KPI tiles — hang for
**exactly ~30 seconds** during server render, then render an empty/error state. The hang is the
Akka `Ask` to the `notification-outbox` / `site-call-audit` singletons timing out at
`CommunicationOptions.QueryTimeout` (30s): **the singleton never replies**. Every other page is
fast. This is **deterministic** (it survived a clean cluster restart) and is **not** a test
problem — the E2E tests that load these pages are correctly failing. The root cause was *not*
pinned by static analysis + a restart; the remaining step is runtime instrumentation (below).
## Affected surface
| Page | Server render time | Path |
|------|--------------------|------|
| `/admin/sites` | 0.026s | DB (`ISiteRepository`) |
| `/audit/log` | 0.044s | DB |
| `/deployment/deployments` | 0.018s | `CentralCommunicationActor` (local actor) Ask |
| `/design/templates` | 0.013s | DB |
| **`/notifications/report`** | **30.01s** | `GetNotificationOutbox()` singleton-proxy Ask |
| **`/notifications/kpis`** | **30.05s** | `GetNotificationOutbox()` singleton-proxy Ask |
| **`/site-calls/report`** | **30.02s** | `GetSiteCallAudit()` singleton-proxy Ask |
| **`/monitoring/health`** | **>35s** | both singleton KPIs |
Measured with an authenticated `curl` against the live cluster (`http://localhost:9000`), so it
is a **server-side prerender** hang, independent of the browser.
## Trigger path
`NotificationReport.OnInitializedAsync` → `RefreshAll()` → `FetchPage()` →
`CommunicationService.QueryNotificationOutboxAsync(request)` →
`GetNotificationOutbox().Ask<NotificationOutboxQueryResponse>(request, _options.QueryTimeout)`.
The page auto-queries the singleton on init (during prerender); the `Ask` times out at 30s.
`SiteCallsReport` does the analogous thing for the `site-call-audit` singleton.
- `CommunicationService.QueryNotificationOutboxAsync` — `src/ZB.MOM.WW.ScadaBridge.Communication/CommunicationService.cs:456`
- `GetNotificationOutbox()` returns the cached `_notificationOutboxProxy` — `CommunicationService.cs:100`
## What was verified correct / ruled out (with evidence)
1. **The CentralCommunicationActor path is healthy.** `/deployment/deployments`
(`GetActor().Ask`, a node-local actor) returns in 0.018s. The cluster/ClusterClient transport
and the Ask machinery work. Only the **singleton-proxy** Asks hang. This is the key asymmetry.
2. **The singletons start and are reachable by their proxies.** On the current boot the active
node (central-a) logged `NotificationOutbox singleton created and registered…`,
`SiteCallAuditActor singleton created and registered…`,
`ClusterSingletonManager state change [Start -> Oldest]`,
`Singleton manager started singleton actor [.../notification-outbox]`, and the proxy logged
`Singleton identified at [.../notification-outbox-singleton/notification-outbox]`.
3. **Not cluster state — neither a restart nor a full redeploy fixes it.** Restarting the central
nodes (sequentially, then both together) re-formed a healthy active/standby cluster (central-a
active leader, central-b standby) with the singletons started + identified on the active node
(so the Ask is **local**), and the pages **still hung exactly 30s**. A subsequent **full
`docker/deploy.sh`** (fresh image rebuild + recreation of all containers) *also* left the pages
hanging exactly 30s. This rules out a stale-proxy / wedged-singleton cluster-state explanation,
a stale binary, and cross-node serialization (a local Ask is not serialized) — the defect is
**deterministic**.
4. **The query handlers are correct.** Both `PipeTo` the async query to the captured `Sender`
with a **failure projection on every path** (a faulted query replies `Success:false`, it does
not hang):
- `NotificationOutboxActor.HandleQuery` — `src/ZB.MOM.WW.ScadaBridge.NotificationOutbox/NotificationOutboxActor.cs:760` (PipeTo at :765, failure arm :768)
- `SiteCallAuditActor.HandleQuery` — `src/ZB.MOM.WW.ScadaBridge.SiteCallAudit/SiteCallAuditActor.cs:224` (PipeTo at :229, failure arm :231)
5. **The DB query would be instant.** `Notifications` and `SiteCalls` are **empty (0 rows, 0 ms)**
in the live DB, and the repository query is a plain EF `Where`/paginate
(`NotificationOutboxRepository.QueryAsync` — `…/Repositories/NotificationOutboxRepository.cs:132`).
So a query that actually executes returns in well under a second.
6. **The proxy wiring is textbook.** `notification-outbox-proxy` is a standard
`ClusterSingletonProxy` for `/user/notification-outbox-singleton`, handed to
`CommunicationService.SetNotificationOutbox(...)` —
`src/ZB.MOM.WW.ScadaBridge.Host/Actors/AkkaHostedService.cs:367-379` (and `:515-524` for site-call-audit).
`GetNotificationOutbox()` is non-null (a null ref would throw fast, not hang).
7. **No relevant exception/serialization error is logged** on either central node at query time
(`ActorInitializationException`, restart loop, `cannot be serialized`, `no serializer` — none).
8. **Singleton-agnostic to the dispatch loop.** `NotificationOutbox` has a 5s dispatcher loop;
`SiteCallAudit` has **no** periodic loop at all (deferred) — yet both hang identically, so the
dispatch loop is not the shared cause. The loop is also fire-and-forget
(`RunDispatchPass(...).PipeTo(Self)`), so it cannot starve the mailbox.
Net: handler, proxy wiring, singleton lifecycle, and DB query are all correct, and the table is
empty — yet a **local** Ask to the singleton never replies within 30s. The defect is at the
singleton **activation / message-processing boundary** on the live node, not in the visible code.
## Leading hypotheses (not yet confirmed — need runtime instrumentation)
1. **The singleton instance is not draining its mailbox** even though the
`ClusterSingletonManager` reports it started (a half-activated / perpetually-restarting / stuck
instance). The manager holds the name and the proxy "identifies" it, but messages are buffered
and never processed → 30s timeout with no per-query exception. Both singletons share
construction via `Props.Create(() => new …(_serviceProvider, …))`
(`AkkaHostedService.cs:357`, `:471`) and the `_serviceProvider.CreateScope()` +
`ScadaBridgeDbContext` pattern in their handlers — a shared activation-time defect would hit
both.
2. **DB scope/connection acquisition from the actor's root-provider scope hangs** (e.g. a leaked-
connection / pool-wait specific to `_serviceProvider.CreateScope()` in the actor, vs the
request-scoped DbContext that `/admin/sites` uses successfully). The 30s is *exactly* the Ask
timeout, so any handler-side hang ≥30s presents identically.
3. **The reply cannot be delivered back to the Ask's temporary actor** (less likely for a local
Ask, but not disproven).
## How to confirm (next step)
Bisect "message never reaches the singleton (routing)" vs "singleton receives but never replies
(handler/DB)":
- Turn on Akka receive logging for the run — `akka.loglevel = DEBUG` and
`akka.actor.debug.receive = on` in the Host's HOCON (`AkkaHostedService.BuildHocon`,
~`AkkaHostedService.cs:171-216`) — or add a single `_log.Info("HandleQuery received …")` line at
the top of `NotificationOutboxActor.HandleQuery`, then `bash docker/deploy.sh` and hit
`/notifications/report` once.
- If the log line **does not** appear → the message isn't reaching the singleton (routing /
proxy / mailbox-stuck) → investigate the singleton activation + proxy delivery.
- If it **does** appear → the handler/`QueryOutboxAsync` is hanging → wrap with timing around
`CreateScope()`, `GetRequiredService`, and `await repository.QueryAsync(...)` to find which
await hangs.
## Blocked tests
All currently-failing Playwright tests are blocked solely by this hang (they load the affected
pages):
- `Audit.AuditLogPageTests.NotificationsPage_RendersAuditDrillInLinkPattern` (loads `/notifications/report`)
- All `SiteCalls.SiteCallsPageTests` page tests (load `/site-calls/report`): `PageLoads_ForDeploymentUser`,
`FilterNarrowing_ChannelFilterShrinksGrid`, `RetryClickThrough_OnParkedRow_ConfirmsRelayAndShowsOutcomeToast`,
`RetryDiscard_VisibleOnlyOnParkedRows`, `DrillIn_ViewAuditHistory_NavigatesToPreFilteredAuditLog`.
The rest of the suite is green (the Audit grid/drawer tests pass after the `AuditDataSeeder`
canonical-schema fix landed in the same session).
## Notes
- Pre-existing: the hang was present before any test-suite or cluster-restart work this session,
and the restarts did not cause it (the cluster is healthy active/standby afterward).
- Timeframe correlation only (not proven causal): this surfaced around the audit subsystem
re-architecture (`CollapseAuditLogToCanonical`) — but the NotificationOutbox/SiteCallAudit
handlers and repositories read the unchanged-and-empty `Notifications`/`SiteCalls` tables and
are themselves correct, so the defect is at the singleton hosting/messaging layer rather than in
the audit-table change.
+175
View File
@@ -0,0 +1,175 @@
# Inbound API Key Re-issue Runbook
**Status:** BREAKING change — action required on every environment that uses the
inbound API (`POST /api/{methodName}`).
**Date:** 2026-06-02
**Migration:** `RetireInboundApiKeyStore`
This runbook covers the migration of inbound API authentication from the legacy SQL
Server `X-API-Key` scheme to the shared `ZB.MOM.WW.Auth.ApiKeys` store. After this
change **all existing inbound API keys are invalidated** and every API client must be
re-issued a new credential.
---
## 1. What changed and why
| | Before | After |
|---|---|---|
| Header | `X-API-Key: <key>` | `Authorization: Bearer sbk_<keyId>_<secret>` |
| Verification | Deterministic HMAC hash, looked up in SQL Server | Peppered, constant-time HMAC compare in the shared `ZB.MOM.WW.Auth.ApiKeys` verifier |
| Storage | SQL Server `ApiKeys` table (config DB) | `ZB.MOM.WW.Auth.ApiKeys` SQLite store |
| Authorization | `ApiMethod.ApprovedApiKeyIds` CSV linking methods to key IDs | Per-key **scopes**, where each scope string is an allowed method name (ordinal, case-sensitive) |
**Why:** the inbound credential path now reuses the shared auth library that the rest
of the `ZB.MOM.WW.*` family uses, with a single, tested, peppered verifier and a
proper one-time-token issuance model. The deterministic SQL Server hash table and its
method-link CSV are retired. The legacy `ApiKeyHasher` / `IApiKeyHasher` and the
in-repo `ApiKeyValidator` are gone — inbound auth runs through `IApiKeyVerifier`.
> The old `X-API-Key` credentials are **not migrated**. There is no automated
> conversion: the stored hashes are not reversible, and the new tokens have a
> different shape (`sbk_<keyId>_<secret>`). Every key must be re-issued.
---
## 2. Required configuration (per environment)
Set these under the ScadaBridge configuration for each environment (appsettings,
environment variables, or your secret store):
| Key | Value | Notes |
|---|---|---|
| `ScadaBridge:InboundApi:ApiKeyStore:SqlitePath` | Filesystem path to the SQLite key store | Defaults to `<content-root>/data/inbound-api-keys.sqlite` if unset. Choose a durable, backed-up path on a writable volume. |
| `ScadaBridge:InboundApi:ApiKeyPepper` | A strong, random string, **≥ 16 characters** | **DIFFERENT per environment.** Keep it secret (secret store, not source control). This is the HMAC pepper that binds every stored key to this deployment; it is also the verifier's pepper secret. |
Notes:
- The pepper must be present and at least 16 characters or the host fails fast at
startup (`AddZbApiKeyAuth`).
- Changing the pepper after keys are issued invalidates all keys in that environment
(they would no longer verify). Set it once, per environment, and keep it stable.
- The token prefix is `sbk` and migrations run on startup by default
(`ScadaBridge:InboundApi:ApiKeyStore:RunMigrationsOnStartup = true`); these are
wired by the Host and normally need no operator change.
---
## 3. Database migration step
Apply the EF Core migration `RetireInboundApiKeyStore` to the SQL Server
configuration database. It:
- drops the `ApiKeys` table, and
- drops the `ApprovedApiKeyIds` column from `ApiMethods`.
If migrations are applied automatically on deploy (the default for the central node),
this happens as part of the rollout. To apply manually:
```bash
dotnet ef database update RetireInboundApiKeyStore \
--project src/ZB.MOM.WW.ScadaBridge.ConfigurationDatabase \
--startup-project src/ZB.MOM.WW.ScadaBridge.Host
```
> Applying this migration **permanently drops** the old key data. Take a database
> backup first if you need a record of the prior `ApiKeys` rows for audit purposes
> (the hashes are not usable credentials, but the names/enabled flags may be of
> record-keeping value).
The new inbound keys live in the **SQLite** store (section 2), not in SQL Server.
---
## 4. Operator re-issue procedure
Re-issue one key per client. Each key is created with the exact method names it is
allowed to call (its scopes).
### Option A — Admin UI
1. Navigate to **`/admin/api-keys`** in the central UI.
2. **Create** a new key: enter a display name and select the allowed method(s).
3. The one-time token `sbk_<keyId>_<secret>` is shown **exactly once** — copy it now.
It cannot be retrieved later.
4. Distribute the token securely to the owning client.
### Option B — CLI
```bash
scadabridge --url <central-url> security api-key create \
--name <client-name> \
--methods <method1,method2>
```
- `--methods` is a comma-separated list of allowed method names — these become the
key's scopes. A method name must match the registered `ApiMethod.Name` **exactly**
(case-sensitive).
- The command prints `API key created. KeyId: <id>` and then the one-time token on
stdout (the "save this now — it will not be shown again" advisory goes to stderr, so
piping stdout captures only the token).
Capture the `sbk_…` token at issue time; it is the only moment the secret is available.
To later change which methods a key may call:
```bash
scadabridge --url <central-url> security api-key set-methods --key-id <id> --methods <m1,m2>
```
---
## 5. Client change
Each API client must replace its header:
- **Remove:** `X-API-Key: <old-key>`
- **Add:** `Authorization: Bearer sbk_<keyId>_<secret>`
Example:
```http
POST /api/CreateOrder HTTP/1.1
Host: scadabridge.example.com
Authorization: Bearer sbk_7f3a...._9c1e....
Content-Type: application/json
{ "orderId": "..." }
```
The token is the full `sbk_<keyId>_<secret>` string exactly as issued — do not split
or transform it.
---
## 6. Verification
1. **Authn (valid key):** call an allowed method with the new Bearer token → `200`
(or the method's normal result).
2. **Authn (no/old credential):** call with no `Authorization` header, or with the old
`X-API-Key` header only → `401` with `{"error":"Invalid or missing API key"}`.
3. **Authz (out of scope):** call a method the key is **not** scoped for → `403` with
`{"error":"API key not approved for this method"}`. A non-existent method name
returns the identical `403` body (enumeration-safe — by design).
4. **Audit:** a successful call records the verified key's display name as the audit
actor; an auth failure records `Actor=null`. Confirm via the audit log.
5. Confirm no client is still sending `X-API-Key` (those requests now fail `401`).
---
## 7. Rollback
The migration `Down` recreates the `ApiKeys` table and the `ApprovedApiKeyIds` column,
**but the dropped key rows are not restored** — `Down` only rebuilds empty structures.
Rolling the migration back does **not** recover any credential.
Therefore "rollback" means **reverting the deployment** to the prior build (which still
speaks `X-API-Key`), not reverting the keys:
1. Redeploy the previous ScadaBridge build.
2. If you took a SQL Server backup before section 3, restore the `ApiKeys` table from
it so the old keys verify again.
3. Without that backup, the old keys are gone and must be re-created under the legacy
scheme as well.
Because rollback is costly and lossy, prefer rolling **forward**: complete the re-issue
in section 4 and fix any straggler clients rather than reverting.
+9 -7
View File
@@ -246,13 +246,15 @@ These are clones of `docker/central-node-a/appsettings.Central.json` and `docker
"MachineDataDb": "Server=scadabridge-mssql,1433;Database=ScadaBridgeMachineData2;User Id=scadabridge_app;Password=ScadaBridge_Dev1#;TrustServerCertificate=true"
},
"Security": {
"LdapServer": "scadabridge-ldap",
"LdapPort": 3893,
"LdapUseTls": false,
"AllowInsecureLdap": true,
"LdapSearchBase": "dc=scadabridge,dc=local",
"LdapServiceAccountDn": "cn=admin,dc=scadabridge,dc=local",
"LdapServiceAccountPassword": "password",
"Ldap": {
"Server": "scadabridge-ldap",
"Port": 3893,
"Transport": "None",
"AllowInsecure": true,
"SearchBase": "dc=scadabridge,dc=local",
"ServiceAccountDn": "cn=admin,dc=scadabridge,dc=local",
"ServiceAccountPassword": "password"
},
"JwtSigningKey": "scadabridge-env2-dev-jwt-signing-key-must-be-at-least-32-characters-long",
"JwtExpiryMinutes": 15,
"IdleTimeoutMinutes": 30,
@@ -0,0 +1,127 @@
# Component Reference Documentation — Design
Generate a complete set of per-component developer-reference documents in
`docs/components/`, derived from the actual `src/` code and written to
`StyleGuide.md`. This is the design produced by the documentation-audit
brainstorming session; the goal is documentation that is accurate (matches the
code) and complete (one doc per component).
## Goal
Produce 25 new reference docs — one per component — that describe **how the code
works** for a developer, with real code examples. These complement (do not
replace) the existing `docs/requirements/Component-*.md` design specs: the specs
say *what the component should do and why*; the new reference docs say *how the
shipped code does it*.
The requirements specs, the `src/` code, and the XML doc comments are **not**
modified by this work.
## Scope
One reference doc per component, PascalCase, 1:1 with the existing spec set (minus
the `Component-` prefix), under a new `docs/components/` folder:
```
docs/components/
README.md # index linking all 25, one-line description each
AuditLog.md CentralUI.md CLI.md ClusterInfrastructure.md Commons.md
Communication.md ConfigurationDatabase.md DataConnectionLayer.md
DeploymentManager.md ExternalSystemGateway.md HealthMonitoring.md Host.md
InboundAPI.md ManagementService.md NotificationOutbox.md NotificationService.md
Security.md SiteCallAudit.md SiteEventLogging.md SiteRuntime.md
StoreAndForward.md TemplateEngine.md TraefikProxy.md Transport.md TreeView.md
```
23 of the 25 map 1:1 to a `src/ZB.MOM.WW.ScadaBridge.<Name>` project. The other
two are documented in full (decision: keep `docs/components/` parallel to the
spec set):
- `TraefikProxy.md` — no C# project; documented from the `docker/` / `infra/`
Traefik configuration and the Host `/health/active` active-node endpoint.
Examples in `yaml` / `bash` / `json`.
- `TreeView.md` — a Blazor component inside the CentralUI project; documented from
the actual `.razor` / `.cs` component code. Examples in `razor` / `csharp` / `css`.
A link from the root `README.md` points to the new reference set.
## Per-Document Structure
Each doc follows the StyleGuide section organization (general to specific),
adapted to a code-reference doc. Sections scale to the component — simple
components omit sections they do not need.
1. `#` H1 title (Title Case, matching the spec) + 1–2 sentence purpose.
2. `## Overview` — what it is, where it runs (central / site), its role and the "why".
3. `## Key Concepts` — domain terms a developer must know (only if needed).
4. `## Architecture` — main types (actors / services / entities), data and message
flow, with real `csharp` snippets (5–25 lines, with class context) taken from
the actual source.
5. `## Usage` — primary entry points and how the component is invoked, with real code.
6. `## Configuration` — `appsettings` sections / options classes as a table
(Option | Default | Description); only keys that exist in code.
7. `## Dependencies & Interactions` — what it depends on and talks to, cross-linked.
8. `## Troubleshooting` — common failure modes / health signals (where applicable).
9. `## Related Documentation` — link to the spec (`../requirements/Component-<Name>.md`),
related component reference docs, and relevant `AkkaDotNet/` notes.
## Generation Workflow
Approach: pilot exemplar, then parallel fan-out, then verification.
### Phase 1 — Pilot
Author `AuditLog.md` by reading the `src/ZB.MOM.WW.ScadaBridge.AuditLog` project
and its existing spec, fully StyleGuide-conformant. AuditLog spans central and
site, actors, database, telemetry, and configuration, so it exercises every
section and makes a strong template. The pilot is reviewed and approved before
fan-out.
### Phase 2 — Fan-out
One subagent per remaining component, dispatched in batches of roughly five to
six concurrently (`model: sonnet`). Each subagent receives:
- The approved exemplar as the literal template to mirror.
- A condensed StyleGuide checklist (tone, present tense, real code only, names
match code exactly, language-tagged code blocks, relative links, a
Related Documentation section, no marketing words, no dates or version numbers).
- Its `src/` project path, to read the real code.
- Its existing `docs/requirements/Component-<Name>.md` spec, for domain
cross-check (where code and spec disagree, the code wins).
- The fixed section template and the output path.
Each subagent returns the written doc and the list of source files it drew from.
### Phase 3 — Verification
For each generated doc, a reviewer pass checks:
- Every type, method, file, and config key referenced exists in the source
(grep-verifiable). No invented APIs.
- StyleGuide conformance: heading casing, language-tagged code blocks, present
tense, no banned marketing words, no dates / version numbers, links resolve.
- Required sections present (at minimum Overview, Dependencies & Interactions,
Related Documentation).
Flagged issues are fixed. Then the `docs/components/README.md` index and the root
README link are added, and all internal links are confirmed to resolve.
## Acceptance Criteria
- **Accuracy** — no invented code; every snippet, type, method, and config key
verifiably exists in `src/`; cross-links resolve; described behavior matches code.
- **Completeness** — 25 docs, none missing; each has at least Overview,
Dependencies & Interactions, and Related Documentation; index and README updated.
- **Conformance** — passes the StyleGuide checklist.
## Out of Scope
- No changes to `src/` code, XML doc comments, or the `docs/requirements/` specs.
- Nothing is committed mid-generation; the full set is reviewed before any push.
## Related Documentation
- [Documentation Style Guide](../../StyleGuide.md)
- [docs/requirements/](../requirements/) — the component design specs the code implements
- [README.md](../../README.md) — master component index
@@ -0,0 +1,312 @@
# Component Reference Documentation Implementation Plan
> **For Claude:** REQUIRED SUB-SKILL: Use superpowers-extended-cc:executing-plans to implement this plan task-by-task.
**Goal:** Generate 25 StyleGuide-conformant developer-reference docs in `docs/components/` (one per component), derived from the actual `src/` code, accurate and complete.
**Architecture:** Pilot exemplar (`AuditLog.md`) sets the template and voice; once approved it is the literal pattern for a parallel fan-out (one subagent per remaining component, each reading its real source + spec); a verification pass then confirms code-accuracy, StyleGuide conformance, and resolving links before an index + README link are added.
**Tech Stack:** Markdown docs; source is C#/.NET (Akka.NET) under `src/`; `StyleGuide.md` defines the writing rules; the design is in `docs/plans/2026-06-03-component-reference-docs-design.md`.
**The "test" for a doc:** unlike code TDD, the pass/fail gate is the **verification checklist** (Task 27): every referenced type/method/file/config-key exists in `src/`; StyleGuide rules hold; required sections present; relative links resolve. Treat that checklist as the acceptance test each doc must pass.
---
## Reference: the StyleGuide checklist (every doc must satisfy)
Every generated doc is written to these rules (condensed from `StyleGuide.md`). This block is handed verbatim to each fan-out subagent.
- **Tone:** technical, direct, present tense ("The actor validates…", not "will validate"). Explain *why*, not only *what*. No marketing words (`powerful`, `robust`, `seamless`, `blazing`, `cutting-edge`, `efficient`, etc.).
- **Real code only:** every `csharp`/`razor`/`yaml`/`json` snippet must be copied or directly derived from actual source in the component. **Never invent** types, methods, or config keys. Snippets are 5–25 lines with enough class/method context to locate them.
- **Names match code exactly:** `ScadaGatewayActor`, `IRequiredActor<T>`, `appsettings.json`, `ScadaBridge:Timeout` — in backticks, exact casing.
- **Headings:** `#` = Title Case document title; `##` = Title Case sections; `###` = Sentence case.
- **Code blocks:** always language-tagged (`csharp`, `json`, `bash`, `xml`, `sql`, `yaml`, `html`, `css`, `javascript`, `razor`).
- **Links:** relative paths, descriptive link text (no "click here"). Spec link target is `../requirements/Component-<Name>.md`.
- **No temporary info:** no dates, version numbers, or "coming soon".
- **Don't document the obvious** or duplicate XML doc comments; reference the file instead.
- **Required sections (minimum):** Overview, Dependencies & Interactions, Related Documentation. Others (Key Concepts, Architecture, Usage, Configuration, Troubleshooting) included only when the component warrants them.
## Reference: per-doc section template
```
# <Title Case Name>
<1–2 sentence purpose>
## Overview (always)
## Key Concepts (if the component has domain terms a dev must know)
## Architecture (main types, data/message flow, real code snippets)
## Usage (primary entry points / how it is invoked, real code)
## Configuration (appsettings/options as a table; real keys only)
## Dependencies & Interactions (always; cross-linked to other component docs)
## Troubleshooting (failure modes / health signals, where applicable)
## Related Documentation (always; spec link + related docs)
```
## Reference: component → source map
| Component (doc) | Source to read |
|---|---|
| `AuditLog.md` | `src/ZB.MOM.WW.ScadaBridge.AuditLog/` |
| `CentralUI.md` | `src/ZB.MOM.WW.ScadaBridge.CentralUI/` (exclude the TreeView component) |
| `CLI.md` | `src/ZB.MOM.WW.ScadaBridge.CLI/` |
| `ClusterInfrastructure.md` | `src/ZB.MOM.WW.ScadaBridge.ClusterInfrastructure/` |
| `Commons.md` | `src/ZB.MOM.WW.ScadaBridge.Commons/` |
| `Communication.md` | `src/ZB.MOM.WW.ScadaBridge.Communication/` |
| `ConfigurationDatabase.md` | `src/ZB.MOM.WW.ScadaBridge.ConfigurationDatabase/` |
| `DataConnectionLayer.md` | `src/ZB.MOM.WW.ScadaBridge.DataConnectionLayer/` |
| `DeploymentManager.md` | `src/ZB.MOM.WW.ScadaBridge.DeploymentManager/` |
| `ExternalSystemGateway.md` | `src/ZB.MOM.WW.ScadaBridge.ExternalSystemGateway/` |
| `HealthMonitoring.md` | `src/ZB.MOM.WW.ScadaBridge.HealthMonitoring/` |
| `Host.md` | `src/ZB.MOM.WW.ScadaBridge.Host/` |
| `InboundAPI.md` | `src/ZB.MOM.WW.ScadaBridge.InboundAPI/` |
| `ManagementService.md` | `src/ZB.MOM.WW.ScadaBridge.ManagementService/` |
| `NotificationOutbox.md` | `src/ZB.MOM.WW.ScadaBridge.NotificationOutbox/` |
| `NotificationService.md` | `src/ZB.MOM.WW.ScadaBridge.NotificationService/` |
| `Security.md` | `src/ZB.MOM.WW.ScadaBridge.Security/` |
| `SiteCallAudit.md` | `src/ZB.MOM.WW.ScadaBridge.SiteCallAudit/` |
| `SiteEventLogging.md` | `src/ZB.MOM.WW.ScadaBridge.SiteEventLogging/` |
| `SiteRuntime.md` | `src/ZB.MOM.WW.ScadaBridge.SiteRuntime/` |
| `StoreAndForward.md` | `src/ZB.MOM.WW.ScadaBridge.StoreAndForward/` |
| `TemplateEngine.md` | `src/ZB.MOM.WW.ScadaBridge.TemplateEngine/` |
| `Transport.md` | `src/ZB.MOM.WW.ScadaBridge.Transport/` |
| `TraefikProxy.md` | `docker/traefik/traefik.yml` + Host `/health/active` in `src/ZB.MOM.WW.ScadaBridge.Host/Program.cs` (examples in `yaml`/`bash`/`json`) |
| `TreeView.md` | `src/ZB.MOM.WW.ScadaBridge.CentralUI/Components/Shared/TreeView.razor`, `TreeView.razor.css`, `TreeViewSelectionMode.cs` (examples in `razor`/`csharp`/`css`) |
Each doc also reads its existing spec `docs/requirements/Component-<Name>.md` for domain cross-check. **Where code and spec disagree, the code wins.**
---
## Task 0: Scaffold + shared assets
**Classification:** small
**Estimated implement time:** ~3 min
**Parallelizable with:** none (prerequisite for all)
**Files:**
- Create: `docs/components/` (folder)
- Create: `tools/check-doc-links.sh`
**Step 1: Create the docs folder**
```bash
mkdir -p docs/components
```
**Step 2: Write a relative-link checker**
`tools/check-doc-links.sh` — verifies every relative markdown link in `docs/components/*.md` resolves to a real file (used by Task 27).
```bash
#!/usr/bin/env bash
# Check that all relative markdown links under docs/components resolve.
set -uo pipefail
cd "$(dirname "$0")/.."
fail=0
for f in docs/components/*.md; do
[ -e "$f" ] || continue
# extract ](target) links, ignore http(s):, anchors, and mailto
grep -oE '\]\([^)]+\)' "$f" | sed -E 's/^\]\(//; s/\)$//' | while read -r link; do
case "$link" in
http://*|https://*|mailto:*|\#*) continue ;;
esac
target="${link%%#*}" # strip #anchor
[ -z "$target" ] && continue
resolved="$(cd "$(dirname "$f")" && cd "$(dirname "$target")" 2>/dev/null && pwd)/$(basename "$target")"
if [ ! -e "$resolved" ]; then
echo "BROKEN: $f -> $link"
fi
done
done
echo "link check done"
```
**Step 3: Make it executable and run on the (empty) folder**
Run: `chmod +x tools/check-doc-links.sh && bash tools/check-doc-links.sh`
Expected: `link check done` with no `BROKEN:` lines.
**Step 4: Commit**
```bash
git add docs/components tools/check-doc-links.sh
git commit -m "docs(components): scaffold reference-docs folder + link checker"
```
---
## Task 1: Pilot exemplar — `AuditLog.md` ⛔ approval gate
**Classification:** standard
**Estimated implement time:** ~6 min
**Parallelizable with:** none (blocks all fan-out tasks)
**Files:**
- Create: `docs/components/AuditLog.md`
**Read for context (do not edit):**
- `src/ZB.MOM.WW.ScadaBridge.AuditLog/` (all `.cs`)
- `docs/requirements/Component-AuditLog.md`
- `StyleGuide.md`
**Step 1: Read the source and spec**
Read every `.cs` under `src/ZB.MOM.WW.ScadaBridge.AuditLog/` and the spec. Note the real type names (`AuditLogIngestActor`, `SiteAuditTelemetryActor`, `SiteAuditReconciliationActor`, `AuditLogPurgeActor`, the `AuditLog` entity/table, options classes) and config keys.
**Step 2: Write `docs/components/AuditLog.md`**
Follow the section template and the StyleGuide checklist above. Every `csharp` snippet must be real code from the project. Include: Overview (central+site role, the "why"); Key Concepts (script trust boundary, `ExecutionId` vs `CorrelationId`); Architecture (the singleton actors + ingest paths + the table, with real snippets); Usage (how rows are written on the hot path + central direct-write); Configuration (payload caps, retention, redaction — real keys as a table); Dependencies & Interactions (Site Call Audit, Notification Outbox, ConfigurationDatabase, Communication — cross-linked); Troubleshooting (telemetry loss → reconciliation, audit-write-never-aborts-action); Related Documentation (`../requirements/Component-AuditLog.md` + related component docs).
**Step 3: Self-check against the checklist**
Re-read the doc against the StyleGuide checklist. Confirm: no invented APIs, present tense, language-tagged blocks, no marketing words, no dates/versions, relative spec link present.
**Step 4: STOP — request user approval of the exemplar**
This doc is the template for all 24 fan-out tasks. Present it to the user and get explicit approval (adjust voice/depth/sections per feedback) **before** any fan-out task runs. Do not proceed past this gate automatically.
**Step 5: Commit**
```bash
git add docs/components/AuditLog.md
git commit -m "docs(components): AuditLog reference doc (pilot exemplar)"
```
---
## Tasks 2–25: Fan-out — one reference doc per remaining component
**These 24 tasks are identical in shape; they differ only by component.** Each is dispatched as its own subagent. **All 24 are mutually parallelizable** (each writes a distinct file) and **all depend on Task 1** (they use the approved `AuditLog.md` as the literal template). The executor caps concurrency at ~5–6.
**Canonical task contract (applies to every Task 2–25):**
**Classification:** standard
**Estimated implement time:** ~5 min
**Parallelizable with:** every other fan-out task (Tasks 2–25)
**Files:**
- Create: `docs/components/<Name>.md`
**Read for context (do not edit):**
- The component's source (see the component → source map above)
- `docs/requirements/Component-<Name>.md`
- `docs/components/AuditLog.md` (the approved template)
- `StyleGuide.md`
**Steps:**
1. Read the component's real source (all relevant `.cs`/`.razor`/config) and its spec. Inventory the real type names, entry points, and config keys.
2. Write `docs/components/<Name>.md` mirroring the structure, voice, and depth of the approved `AuditLog.md`, following the section template and the StyleGuide checklist. Include only sections the component warrants (always Overview, Dependencies & Interactions, Related Documentation).
3. Every code snippet must be real code from this component — no invented APIs. Cross-link dependencies to sibling `docs/components/*.md` and the spec to `../requirements/Component-<Name>.md`.
4. Return the list of source files the doc drew from.
5. Commit: `git add docs/components/<Name>.md && git commit -m "docs(components): <Name> reference doc"`.
**Per-task instances:**
| Task | Doc | Notes |
|---|---|---|
| 2 | `Commons.md` | Shared POCO entities, interfaces, message contracts; Types/Interfaces/Entities/Messages layout |
| 3 | `ConfigurationDatabase.md` | EF Core repos, unit-of-work, `IAuditService`, migrations |
| 4 | `Communication.md` | ClusterClient command/control + gRPC streaming; Central/Site comm actors |
| 5 | `ClusterInfrastructure.md` | Akka cluster setup, active/standby, SBR, singletons |
| 6 | `Host.md` | Single binary, role-based component registration, Akka bootstrap, `/health/ready` |
| 7 | `Security.md` | LDAP bind, cookie+JWT sessions, role/site-scoped authz |
| 8 | `TemplateEngine.md` | Modeling, inheritance, composition, validation, flattening, diffs (large — read broadly) |
| 9 | `DeploymentManager.md` | Central deploy pipeline, instance lifecycle |
| 10 | `SiteRuntime.md` | Site actor hierarchy, script compilation, alarms, site stream (large) |
| 11 | `DataConnectionLayer.md` | Protocol abstraction, Become/Stash state machine, native alarm seam |
| 12 | `StoreAndForward.md` | Buffering, retry, parking, SQLite, replication |
| 13 | `ExternalSystemGateway.md` | HTTP/REST, Call/CachedCall, error classification |
| 14 | `NotificationService.md` | SMTP/OAuth2 delivery adapters, central-only delivery |
| 15 | `NotificationOutbox.md` | Central S&F singleton, `Notifications` table, dispatcher, KPIs |
| 16 | `SiteCallAudit.md` | `SiteCalls` table, telemetry, reconciliation, Retry/Discard relay |
| 17 | `HealthMonitoring.md` | Site metrics collection + central reporting |
| 18 | `SiteEventLogging.md` | Local event logs, retention/cap, central query |
| 19 | `InboundAPI.md` | `POST /api/{method}`, API-key auth, extended type system |
| 20 | `ManagementService.md` | Management actor, receptionist registration, admin ops |
| 21 | `CLI.md` | System.CommandLine tool over HTTP management API |
| 22 | `Transport.md` | Encrypted bundle export/import, conflict resolution, `BundleImportId` |
| 23 | `CentralUI.md` | Blazor Server, custom components, nav, real-time push (exclude TreeView) |
| 24 | `TraefikProxy.md` | From `docker/traefik/traefik.yml` + Host `/health/active`; `yaml`/`bash`/`json` examples |
| 25 | `TreeView.md` | From CentralUI `TreeView.razor`/`.css`/`TreeViewSelectionMode.cs`; `razor`/`csharp`/`css` |
---
## Task 26: Index + README link
**Classification:** small
**Estimated implement time:** ~4 min
**Parallelizable with:** none (needs all docs present)
**Files:**
- Create: `docs/components/README.md`
- Modify: `README.md` (add a link to the new reference set)
**Step 1: Write `docs/components/README.md`**
An index: H1 + 1–2 sentence purpose, then a table of all 25 docs with a one-line description each (mirror the README component table ordering / CLAUDE.md component list). Relative links to each `*.md`.
**Step 2: Link from the root README**
Add a short subsection under the Repository Layout / docs description pointing to `docs/components/` as the developer reference set (distinct from the `docs/requirements/` specs).
**Step 3: Commit**
```bash
git add docs/components/README.md README.md
git commit -m "docs(components): index + link from README"
```
---
## Task 27: Verification & fix pass
**Classification:** standard
**Estimated implement time:** ~6 min (fans out one reviewer per doc; fixes are small follow-ups)
**Files:**
- Modify: any `docs/components/*.md` flagged by review
**Step 1: Link check (scripted)**
Run: `bash tools/check-doc-links.sh`
Expected: no `BROKEN:` lines. Fix any broken relative links.
**Step 2: StyleGuide lint (scripted spot-checks)**
```bash
# banned marketing words
grep -rniE 'powerful|robust|seamless|blazing|cutting-edge|world-class' docs/components/ || echo "no marketing words"
# untagged code fences (a bare ``` opening a block)
grep -rnE '^```$' docs/components/ || echo "all fences tagged"
# stray dates / version numbers
grep -rnE '\b20[0-9]{2}-[0-9]{2}-[0-9]{2}\b|\bv[0-9]+\.[0-9]+' docs/components/ || echo "no dates/versions"
```
Fix anything these surface (note: a closing fence also matches `^```$`; verify each hit is genuinely an untagged opener).
**Step 3: Accuracy review (agent fan-out, one per doc)**
Dispatch a reviewer subagent per doc: given `docs/components/<Name>.md` + the component's source, verify **every** referenced type, method, file, and config key actually exists in `src/` (no invented APIs), and that described behavior matches the code. Each reviewer returns a list of inaccuracies (or "clean").
**Step 4: Apply fixes**
Correct every inaccuracy flagged in Step 3. Re-run Steps 1–2 after edits.
**Step 5: Final acceptance check**
Confirm: 25 docs present in `docs/components/` + index; each has Overview, Dependencies & Interactions, Related Documentation; link check clean; no flagged inaccuracies remain.
**Step 6: Commit**
```bash
git add docs/components
git commit -m "docs(components): verification pass — fix accuracy/conformance/links"
```
---
## Notes
- **Do not push.** The user runs `/pushit` as a separate, explicit step after reviewing the full set.
- **Untracked siblings** (`StyleGuide.md`, `ScadaBridge-docs-*.md`) are out of scope for these commits; stage only the files each task names.
- The pilot approval gate (Task 1, Step 4) is mandatory — fan-out must not begin until the exemplar is approved.
@@ -0,0 +1,34 @@
{
"planPath": "docs/plans/2026-06-03-component-reference-docs.md",
"tasks": [
{"id": 19, "subject": "Task 0: Scaffold + shared assets", "status": "pending"},
{"id": 20, "subject": "Task 1: Pilot exemplar AuditLog.md (approval gate)", "status": "pending", "blockedBy": [19]},
{"id": 21, "subject": "Task 2: Commons.md reference doc", "status": "pending", "blockedBy": [20]},
{"id": 22, "subject": "Task 3: ConfigurationDatabase.md reference doc", "status": "pending", "blockedBy": [20]},
{"id": 23, "subject": "Task 4: Communication.md reference doc", "status": "pending", "blockedBy": [20]},
{"id": 24, "subject": "Task 5: ClusterInfrastructure.md reference doc", "status": "pending", "blockedBy": [20]},
{"id": 25, "subject": "Task 6: Host.md reference doc", "status": "pending", "blockedBy": [20]},
{"id": 26, "subject": "Task 7: Security.md reference doc", "status": "pending", "blockedBy": [20]},
{"id": 27, "subject": "Task 8: TemplateEngine.md reference doc", "status": "pending", "blockedBy": [20]},
{"id": 28, "subject": "Task 9: DeploymentManager.md reference doc", "status": "pending", "blockedBy": [20]},
{"id": 29, "subject": "Task 10: SiteRuntime.md reference doc", "status": "pending", "blockedBy": [20]},
{"id": 30, "subject": "Task 11: DataConnectionLayer.md reference doc", "status": "pending", "blockedBy": [20]},
{"id": 31, "subject": "Task 12: StoreAndForward.md reference doc", "status": "pending", "blockedBy": [20]},
{"id": 32, "subject": "Task 13: ExternalSystemGateway.md reference doc", "status": "pending", "blockedBy": [20]},
{"id": 33, "subject": "Task 14: NotificationService.md reference doc", "status": "pending", "blockedBy": [20]},
{"id": 34, "subject": "Task 15: NotificationOutbox.md reference doc", "status": "pending", "blockedBy": [20]},
{"id": 35, "subject": "Task 16: SiteCallAudit.md reference doc", "status": "pending", "blockedBy": [20]},
{"id": 36, "subject": "Task 17: HealthMonitoring.md reference doc", "status": "pending", "blockedBy": [20]},
{"id": 37, "subject": "Task 18: SiteEventLogging.md reference doc", "status": "pending", "blockedBy": [20]},
{"id": 38, "subject": "Task 19: InboundAPI.md reference doc", "status": "pending", "blockedBy": [20]},
{"id": 39, "subject": "Task 20: ManagementService.md reference doc", "status": "pending", "blockedBy": [20]},
{"id": 40, "subject": "Task 21: CLI.md reference doc", "status": "pending", "blockedBy": [20]},
{"id": 41, "subject": "Task 22: Transport.md reference doc", "status": "pending", "blockedBy": [20]},
{"id": 42, "subject": "Task 23: CentralUI.md reference doc", "status": "pending", "blockedBy": [20]},
{"id": 43, "subject": "Task 24: TraefikProxy.md reference doc", "status": "pending", "blockedBy": [20]},
{"id": 44, "subject": "Task 25: TreeView.md reference doc", "status": "pending", "blockedBy": [20]},
{"id": 45, "subject": "Task 26: Index + README link", "status": "pending", "blockedBy": [20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44]},
{"id": 46, "subject": "Task 27: Verification & fix pass", "status": "pending", "blockedBy": [45]}
],
"lastUpdated": "2026-06-03"
}
@@ -0,0 +1,156 @@
# Playwright Coverage Expansion — Design
**Date:** 2026-06-05
**Status:** Approved (brainstorming complete) → ready for writing-plans
**Component:** #9 Central UI — `tests/ZB.MOM.WW.ScadaBridge.CentralUI.PlaywrightTests`
## Goal
Close the functional-coverage gaps found in the 2026-06-05 Playwright coverage audit by
implementing all 7 audit recommendations: add ~15–18 functional E2E tests, upgrade the
shallow navigation tests, and standardize the skip policy — all against the live 8-node
docker cluster, inside the existing xunit + `PlaywrightFixture` structure.
## Background — the audit
The suite (~68 tests) is **bimodal**: a deep, well-built audit/site-calls core wrapped in
a thin shell of navigation + nav-visibility tests, with a large blind spot over the app's
**mutating actions** (deploy, import, retry/discard, all CRUD writes).
Key gaps the 7 recommendations target:
1. **Topology instance lifecycle** (Deploy/Enable/Disable/Delete) — Akka-singleton relays,
the exact surface the recent report-page hang lived in — untested beyond URL change.
2. **Parked-message / notification Retry/Discard** relays — untested (Site-Calls relay *is*
tested; it is the pattern to copy).
3. **Transport Import → Apply** — bulk writes across all central config, Admin-only — zero
coverage.
4. **Navigation tests assert URL only** — never that the destination rendered; a route
could 500 after navigation and stay green.
5. **No Health-dashboard load test** — the page that fans out to three singleton `Ask`s
every 10s has no assertion its KPI tiles resolve vs. hang/degrade.
6. **No successful persisted write through the UI** anywhere — the entire create/edit/delete
surface is functionally unverified end-to-end (`SiteCrudTests` only covers validation
failure; audit/site-calls "writes" are direct SQL seeds).
7. **Silent coverage cliffs** — DB-dependent tests are inconsistent (`AuditLogPageTests`
*throw* when MSSQL is down; Site-Calls/grid tests *skip*), and skips aren't surfaced.
## Decisions (settled during brainstorming)
| # | Decision | Choice | Rationale |
|---|---|---|---|
| D1 | Mutation fidelity for state-changing tests | **Ephemeral fixtures + outcome-tolerant** | High fidelity, isolated; matches the existing `SiteCallsPageTests` real-relay test that asserts the round-trip happened (toast: `Applied`/`NotParked`/`SiteUnreachable`), not a deep cluster side-effect. |
| D2 | How fixtures are created/torn down | **Shell out to the `scadabridge` CLI** | CLAUDE.md prefers the CLI for state setup; the CLI exposes every needed verb with `--format json` + clean `0`/`1` exit codes. |
| D3 | Skip vs fail when DB/cluster unavailable | **Standardize on Skip + log** | Consistent `SkippableFact` everywhere; a logged skipped-summary prevents a downed dependency from masquerading as full green. Local dev without the cluster still gets green on the rest. |
### CLI surface confirmed (host → `http://localhost:9000`, `multi-role`/`password`)
- `site create --name --identifier [--description]`, `site delete --id`, `site list/get`,
`site area create --site-id --name`, `site area delete --id`, `site deploy-artifacts`.
- `template create --name [--description] [--parent-id]`, `template delete --id`,
`template attribute add --template-id --name --data-type`, `template validate --id`.
- `instance create --name --template-id --site-id [--area-id]`, `instance deploy --id`,
`instance enable --id`, `instance disable --id`, `instance delete --id`.
- `bundle export --output --passphrase [--all|--include-dependencies]`, `bundle preview`,
`bundle import --input --passphrase --on-conflict`.
## Design
### Section 1 — Shared infrastructure (underpins recs 1, 2, 3, 6, 7)
**`CliRunner`** (new helper):
- Resolves the CLI via the built DLL of `src/ZB.MOM.WW.ScadaBridge.CLI` (invoked as
`dotnet <cli>.dll …`), falling back to a `scadabridge` on PATH. The test `.csproj` adds a
build-order `ProjectReference` to the CLI project so the binary always exists.
- Fixed args: `--url http://localhost:9000 --username multi-role --password password --format json`.
- `Task<JsonDocument> RunAsync(params string[] args)` — runs the subprocess, captures
stdout/stderr, throws on non-zero exit (stderr in the message), parses JSON stdout.
- Typed helpers: `CreateSiteAsync`, `CreateAreaAsync`, `CreateTemplateAsync` +
`AddAttributeAsync` (so the template validates), `CreateInstanceAsync`,
`DeleteInstanceAsync` / `DeleteTemplateAsync` / `DeleteSiteAsync`,
`BundleExportAsync(path, templateId, passphrase)`.
**Naming + teardown convention:** every provisioned entity is named `zztest-<8charguid>`
(sorts last; unambiguous for `LIKE 'zztest%'` safety-net deletes). Teardown is best-effort
(swallow errors), mirroring `AuditDataSeeder`/`SiteCallDataSeeder`.
**`ClusterAvailability` gate + skip logging (rec 7):**
- One shared probe (CLI `site list` succeeds *and* the existing DB `IsAvailableAsync`) →
`Skip.IfNot(...)` used uniformly across all DB/cluster tests.
- Convert `AuditLogPageTests`' 11 throw-on-unavailable tests to `SkippableFact`.
- A collection fixture's `DisposeAsync` writes one summary line
(`SKIPPED N tests — cluster/DB unavailable`) so skips are visible.
### Section 2 — Mutating action suites (recs 1, 2, 3)
**`DeploymentActionTests` (rec 1):** a `DeploymentFixture` (collection fixture) provisions
**one** `zztest` site + valid template once; each test creates its **own** throwaway
instance on it (cheap), acts via the Topology UI, deletes it:
- `Deploy_Instance_ConfirmsAndShowsOutcome` — Topology → context-menu Deploy → confirm
dialog → assert exactly one outcome toast (tolerating `Deployed`/`SiteUnreachable`) and
the status badge transitions off "not deployed".
- `Enable_Instance_ShowsOutcome`, `Disable_Instance_ShowsOutcome` — same shape.
- `Delete_Instance_RemovesFromTree` — UI delete → confirm → node disappears.
**`RetryDiscardActionTests` (rec 2):** reuse the existing direct-SQL seeders to seed a
`Parked` row, then drive the UI relay outcome-tolerantly (the `SiteCallsPageTests` pattern):
- Parked Messages: `Retry_ParkedMessage_ShowsOutcomeToast`,
`Discard_ParkedMessage_ShowsOutcomeToast` (seed a parked S&F message for a `zztest`
target on `site-a`).
- Notification Report: `Retry_ParkedNotification_ShowsOutcome`,
`Discard_ParkedNotification_ShowsOutcome` (seed a parked `Notifications` row).
- Each: confirm dialog → exactly one outcome toast (`Applied`/`NotParked`/`SiteUnreachable`);
best-effort row teardown.
**`TransportImportTests` (rec 3):** round-trip via CLI export + UI import:
1. CLI creates a `zztest` template → `bundle export --output /tmp/zztest-<guid>.bundle
--passphrase <p>` (1-template synthetic bundle).
2. UI Import wizard: upload the exported file → passphrase → diff/resolve (Add) →
type-env-name confirm → **Apply**.
3. Assert the result screen shows success, the imported template appears, and the audit
drill-in `?bundleImportId=` link is present.
4. CLI deletes the imported template(s).
- **Impl risk to de-risk first:** remote file upload — Playwright `SetInputFiles` streams the
host file to the container browser; fine for a tiny bundle, but the plan's first transport
step verifies the upload end-to-end before building the rest.
### Section 3 — Happy-path CRUD round-trips (rec 6)
Three pure-UI create→edit→delete tests (the suite currently verifies *no* successful
persisted write). Each ends by deleting what it made; a `zztest`-name safety-net teardown
guards mid-test failure:
- **Site** (extend `SiteCrudTests`): create via `/admin/sites/create` (name + identifier +
node addresses) → appears in list → edit description → delete → gone.
- **Template**: `/design/templates/create` → add an attribute on `/design/templates/{id}`
→ delete.
- **LDAP mapping**: `/admin/ldap-mappings/create` (group + role) → edit role → delete.
### Section 4 — Shallow-coverage hardening (recs 4, 5)
**Nav render assertions (rec 4):** upgrade the 16 `NavigationTests` theory cases from "URL
changed" to *also* assert the destination's heading/content renders (a per-route
expected-heading map). No new tests — a strengthened helper.
**Health load test (rec 5):** `HealthDashboardTests.KpiTiles_ResolveToValues` — load
`/monitoring/health`, assert the three KPI tile groups (Notification-Outbox, Site-Call,
Audit) resolve to numeric values (not the em-dash degrade) within a timeout. A direct
regression guard for the singleton-hang class of bug.
## Verification
- Each new suite runs green against the live cluster; the full run stays at **0 failed**,
with any skips logged.
- Mutating tests leave **no residual** `zztest-*` entities — verified by a post-run
`site list` / `template list` check.
- Build: `dotnet build` the test project (picks up the new CLI `ProjectReference`), then
`dotnet test`.
## Scope guard (YAGNI)
No new page-object framework, no CI wiring, no parallelization changes. Everything slots
into the existing xunit + `PlaywrightFixture` structure.
## Native tasks
Brainstorming checklist tasks #51–#56 track this design through to writing-plans. The
implementation plan (produced next by the writing-plans skill) will carry its own task set.
@@ -0,0 +1,433 @@
# Playwright Coverage Expansion Implementation Plan
> **For Claude:** REQUIRED SUB-SKILL: Use superpowers-extended-cc:executing-plans (or subagent-driven-development) to implement this plan task-by-task.
**Goal:** Add ~15 functional Playwright E2E tests + a shared CLI fixture + a standardized skip policy to close the gaps from the 2026-06-05 coverage audit, against the live 8-node docker cluster.
**Architecture:** New tests live in the existing `tests/ZB.MOM.WW.ScadaBridge.CentralUI.PlaywrightTests` project, share the existing serial `[Collection("Playwright")]` + `PlaywrightFixture` (remote Chromium at `ws://localhost:3000`, app at `http://scadabridge-traefik`). State-changing tests provision ephemeral `zztest-*` fixtures by shelling out to the `scadabridge` CLI (host → `http://localhost:9000`, `multi-role`/`password`) and assert **outcome-tolerantly** (a confirm-dialog → a single `.toast` appears; the relay outcome may be success or a fast error). Every DB/cluster-dependent test uses `SkippableFact` + a shared availability probe and a logged skip summary.
**Tech Stack:** C# net10.0, xunit + `Xunit.SkippableFact`, Microsoft.Playwright, Microsoft.Data.SqlClient, the `scadabridge` CLI (`src/ZB.MOM.WW.ScadaBridge.CLI`). `TreatWarningsAsErrors=true` — all new code must be warning-clean and nullable-correct.
---
## Shared reference (used by many tasks — read once)
**Project paths**
- Test project dir: `tests/ZB.MOM.WW.ScadaBridge.CentralUI.PlaywrightTests/` (abbrev. `TESTS/`).
- CLI project: `src/ZB.MOM.WW.ScadaBridge.CLI/ZB.MOM.WW.ScadaBridge.CLI.csproj`.
- Existing seeders to mirror: `TESTS/Audit/AuditDataSeeder.cs`, `TESTS/SiteCalls/SiteCallDataSeeder.cs`.
- Existing canonical mutating test to copy: `TESTS/SiteCalls/SiteCallsPageTests.cs` (`RetryClickThrough_OnParkedRow_ConfirmsRelayAndShowsOutcomeToast`).
- Fixture/collection: `TESTS/PlaywrightFixture.cs` (`[CollectionDefinition("Playwright")]`, `BaseUrl`, `NewAuthenticatedPageAsync()`, `ExpandAllNavSectionsAsync()`).
**Global UI selectors (from a full source sweep — use these literally)**
- Confirm dialog (global `DialogHost`): modal `.modal.show.d-block`; footer `.modal-footer`; confirm button = `.modal-footer .btn-danger` (text `Delete`, danger) or `.modal-footer .btn-primary` (text `Confirm`, non-danger); cancel = `.modal-footer .btn-outline-secondary`.
- Toast (`ToastNotification`): `.toast` (each `.toast.show[role=alert]`); body `.toast-body`. Assert `.toast` visible (Timeout 15_000) and `CountAsync()==1`.
- Tree (Topology/Templates): nodes are `li[role=treeitem]` containing `div.tv-row`; label in `span.tv-label`; **right-click** the row to open the context menu `.dropdown-menu.show`; items are `button.dropdown-item` (delete is `button.dropdown-item.text-danger`).
- Card/table kebab (Sites/LdapMappings): click `button[aria-label^="More actions"]` (text `⋮`) → `.dropdown-menu`.
**CLI quick facts**
- Invoke: `dotnet <CLI.dll> --url http://localhost:9000 --username multi-role --password password --format json <verb> <args>`. Exit `0` ok, `1` error; JSON on stdout.
- Verbs: `site list`/`create --name --identifier`/`delete --id`/`area create --site-id --name`/`area delete --id`; `template create --name [--description]`/`delete --id`/`attribute add --template-id --name --data-type`/`list`; `instance create --name --template-id --site-id [--area-id]`/`deploy --id`/`enable --id`/`disable --id`/`delete --id`; `bundle export --output --passphrase --source-environment`/`import`.
**Naming + teardown convention**
- All provisioned entities named `zztest-<8hex>` (`Guid.NewGuid().ToString("N")[..8]`). Teardown is best-effort (swallow exceptions), in `finally`.
**Skip pattern (rec 7)**
- `Skip.IfNot(await ClusterAvailability.IsAvailableAsync(), ClusterAvailability.SkipReason);` at the top of every DB/cluster test.
---
## Task 0: Add CLI ProjectReference to the test project
**Classification:** trivial
**Estimated implement time:** ~2 min
**Parallelizable with:** none (foundation)
**Files:**
- Modify: `TESTS/ZB.MOM.WW.ScadaBridge.CentralUI.PlaywrightTests.csproj`
**Step 1:** Add a `ProjectReference` to the CLI so its build output (the `scadabridge` dll) is always present for the test process to invoke. Use `ReferenceOutputAssembly="false"` so the CLI's types are NOT linked into the test assembly (we only need its build artifact), but the reference still enforces build ordering, so building the test project also builds the CLI:
```xml
<ItemGroup>
<ProjectReference Include="..\..\src\ZB.MOM.WW.ScadaBridge.CLI\ZB.MOM.WW.ScadaBridge.CLI.csproj"
ReferenceOutputAssembly="false" />
</ItemGroup>
```
**Step 2:** Run `dotnet build tests/ZB.MOM.WW.ScadaBridge.CentralUI.PlaywrightTests` → expect success; confirm `src/ZB.MOM.WW.ScadaBridge.CLI/bin/Debug/net10.0/ZB.MOM.WW.ScadaBridge.CLI.dll` exists.
**Step 3:** Commit: `test(e2e): reference CLI project so tests can shell out to it`.
---
## Task 1: `CliRunner` core (subprocess + JSON + availability probe)
**Classification:** standard
**Estimated implement time:** ~5 min
**Parallelizable with:** Task 3, Task 13, Task 14
**Files:**
- Create: `TESTS/Cluster/CliRunner.cs`
- Create: `TESTS/Cluster/ClusterAvailability.cs`
- Test: `TESTS/Cluster/CliRunnerSmokeTests.cs`
**Step 1 — Write the failing smoke test:**
```csharp
using Xunit;
namespace ZB.MOM.WW.ScadaBridge.CentralUI.PlaywrightTests.Cluster;
[Collection("Playwright")]
public class CliRunnerSmokeTests
{
    /// <summary>
    /// Smoke-checks the CLI plumbing: <c>site list</c> must run and yield a JSON
    /// document whose root is either an array or an object. The test is skipped
    /// (not failed) when the availability probe reports the cluster as down.
    /// </summary>
    [SkippableFact]
    public async Task SiteList_ReturnsJsonArray()
    {
        var clusterIsUp = await ClusterAvailability.IsAvailableAsync();
        Skip.IfNot(clusterIsUp, ClusterAvailability.SkipReason);

        using var sites = await CliRunner.RunJsonAsync("site", "list");

        // Both root shapes are accepted by this check.
        var rootKind = sites.RootElement.ValueKind;
        var isArray = rootKind == System.Text.Json.JsonValueKind.Array;
        var isObject = rootKind == System.Text.Json.JsonValueKind.Object;
        Assert.True(isArray || isObject);
    }
}
```
**Step 2:** Run `dotnet test --filter CliRunnerSmokeTests` → FAIL (CliRunner not defined).
**Step 3 — Implement `CliRunner`:**
- Resolve the CLI dll: walk up from `AppContext.BaseDirectory` to the repo root, then `src/ZB.MOM.WW.ScadaBridge.CLI/bin/<config>/net10.0/ZB.MOM.WW.ScadaBridge.CLI.dll`. Allow `SCADABRIDGE_CLI_DLL` env override. Throw a clear message if not found.
- `RunAsync(params string[] args)`: `Process.Start` `dotnet <dll> --url <url> --username multi-role --password password --format json <args...>`; capture stdout+stderr; `await WaitForExitAsync()` with a 60s timeout (kill + throw on timeout); on non-zero exit throw `InvalidOperationException($"CLI {string.Join(' ', args)} exited {code}: {stderr}")`. Return stdout.
- `RunJsonAsync(...)`: `JsonDocument.Parse(await RunAsync(...))`.
- URL constant `http://localhost:9000`; allow `SCADABRIDGE_MANAGEMENT_URL` override (matches CLI's own env var).
**Step 4 — Implement `ClusterAvailability`:**
```csharp
/// <summary>
/// Shared availability probe for cluster-dependent E2E tests. The first call runs
/// the CLI <c>site list</c> verb; the outcome is stored in a static field so later
/// calls return it without invoking the CLI again.
/// </summary>
public static class ClusterAvailability
{
    /// <summary>Message passed to <c>Skip.IfNot</c> when the probe fails.</summary>
    public const string SkipReason = "Cluster/MSSQL unavailable — start the docker cluster (bash docker/deploy.sh) to run E2E.";

    // null = not probed yet; otherwise the remembered probe outcome.
    private static bool? _cached;

    /// <summary>
    /// Returns <c>true</c> when <c>site list</c> completed without throwing,
    /// <c>false</c> when it threw for any reason.
    /// </summary>
    public static async Task<bool> IsAvailableAsync()
    {
        if (_cached.HasValue)
        {
            return _cached.Value;
        }

        bool reachable;
        try
        {
            // Only success/failure matters; the parsed document is disposed right away.
            using (await CliRunner.RunJsonAsync("site", "list"))
            {
                reachable = true;
            }
        }
        catch
        {
            // Any failure is treated as "unavailable" so tests skip instead of erroring.
            reachable = false;
        }

        _cached = reachable;
        return reachable;
    }
}
```
**Step 5:** Run the smoke test → PASS (cluster up). Verify it SKIPS (not fails) if you stop the cluster (optional manual check; do not leave the cluster down).
**Step 6:** Commit: `test(e2e): add CliRunner + ClusterAvailability probe`.
---
## Task 2: `CliRunner` typed fixture helpers
**Classification:** standard
**Estimated implement time:** ~5 min
**Parallelizable with:** Task 3, Task 13, Task 14
**Blocked by:** Task 1
**Files:**
- Modify: `TESTS/Cluster/CliRunner.cs`
- Test: `TESTS/Cluster/CliRunnerHelpersTests.cs`
**Step 1 — Failing test (round-trips a real template via the CLI):**
```csharp
/// <summary>
/// Creates a uniquely named template through the CLI helpers, checks that listing
/// templates by that name prefix returns the new id, and always deletes the
/// template afterwards. Skipped when the cluster probe fails.
/// </summary>
[SkippableFact]
public async Task CreateThenDeleteTemplate_RoundTrips()
{
    var clusterIsUp = await ClusterAvailability.IsAvailableAsync();
    Skip.IfNot(clusterIsUp, ClusterAvailability.SkipReason);

    var templateName = CliRunner.UniqueName("tmpl");
    int templateId = await CliRunner.CreateTemplateAsync(templateName);

    try
    {
        var matchingIds = await CliRunner.ListTemplateIdsByNamePrefixAsync(templateName);
        Assert.Contains(templateId, matchingIds);
    }
    finally
    {
        // Teardown runs whether or not the assertion passed.
        await CliRunner.DeleteTemplateAsync(templateId);
    }
}
```
**Step 2:** Run → FAIL.
**Step 3 — Implement helpers** (each parses the JSON the CLI returns to extract the new integer `id`; inspect a live `template create` JSON shape first via `dotnet <dll> ... template create --name probe` then delete it, to confirm the id field name — likely `id` or `templateId`):
- `static string UniqueName(string kind) => $"zztest-{kind}-{Guid.NewGuid().ToString("N")[..8]}"` (only 8 hex chars of the GUID, per the naming convention, to keep it short).
- `Task<int> CreateTemplateAsync(string name, string? description=null)`
- `Task AddAttributeAsync(int templateId, string name, string dataType="Double")`
- `Task<int> CreateInstanceAsync(string name, int templateId, int siteId, int? areaId=null)`
- `Task<int> CreateAreaAsync(int siteId, string name)`
- `Task<int> ResolveSiteIdAsync(string identifier)` (run `site list`, find the entry whose `identifier`/`siteIdentifier` equals the `identifier` argument — e.g. "site-a" — and return its `id`)
- `Task DeployInstanceAsync/EnableInstanceAsync/DisableInstanceAsync/DeleteInstanceAsync(int id)`
- `Task DeleteTemplateAsync(int id)` / `DeleteAreaAsync(int id)` / `DeleteSiteAsync(int id)`
- `Task<IReadOnlyList<int>> ListTemplateIdsByNamePrefixAsync(string prefix)` (run `template list`, filter `name` StartsWith prefix)
- `Task BundleExportAsync(string outputPath, int templateId, string passphrase, string sourceEnvironment)` (use `bundle export --output <p> --passphrase <pp> --source-environment <env>` plus whatever selector flag scopes to one template — inspect `bundle export --help`; if it only supports `--all`/`--include-dependencies`, export `--all` into a throwaway-clean cluster slice is unsafe, so prefer a template selector flag; if none exists, note it and fall back to `--all` with the understanding the bundle may carry more — see Task 9 risk note).
- All helpers swallow nothing on create (must throw on failure) but `Delete*` helpers swallow exceptions (best-effort teardown).
**Step 4:** Run → PASS. **Step 5:** Commit: `test(e2e): add CliRunner typed fixture helpers`.
---
## Task 3: Standardize skip policy + skip-count logging (rec 7)
**Classification:** standard
**Estimated implement time:** ~5 min
**Parallelizable with:** Task 1, Task 2, Task 13, Task 14
**Files:**
- Create: `TESTS/Cluster/SkipLogCollectionFixture.cs` (or extend `PlaywrightFixture`)
- Modify: `TESTS/Audit/AuditLogPageTests.cs` (convert the 11 throw-on-unavailable guards to `SkippableFact` + `Skip.IfNot`)
**Step 1:** In `AuditLogPageTests.cs`, replace each `[Fact]` whose body throws `InvalidOperationException` when `AuditDataSeeder.IsAvailableAsync()` is false with `[SkippableFact]` + `Skip.IfNot(await AuditDataSeeder.IsAvailableAsync(), DbUnavailableSkipReason);` (mirror `AuditGridColumnTests`). Keep all assertions otherwise unchanged.
**Step 2:** Add skip logging: implement `IDisposable`/`IAsyncLifetime` on a collection fixture (or in `PlaywrightFixture.DisposeAsync`) that, at assembly teardown, writes one line to the console:
`Console.WriteLine($"[E2E] Skipped {SkipTracker.Count} cluster/DB-dependent tests — {ClusterAvailability.SkipReason}");` where `SkipTracker` is a static counter incremented by a tiny helper `SkipUnlessAvailable()` that the tests call. (Simplest: a `static int` on `ClusterAvailability` bumped inside `IsAvailableAsync` when it returns false; log it in fixture dispose.)
**Step 3:** Run `dotnet test --filter AuditLogPageTests` with the cluster UP → all pass (0 skipped, log line shows 0). **Step 4:** Commit: `test(e2e): standardize DB-dependent tests on SkippableFact + skip logging`.
---
## Task 4: `DeploymentFixture` (ephemeral template + area on real site-a)
**Classification:** standard
**Estimated implement time:** ~5 min
**Parallelizable with:** Task 8, Task 9 (different files)
**Blocked by:** Task 2
**Files:**
- Create: `TESTS/Deployment/DeploymentFixture.cs`
**Why site-a (not a throwaway site):** Deploy/Enable/Disable relay to the owning site over ClusterClient. An *unknown* site identifier has no registered ClusterClient, so the relay resolves only on a slow 10s Ask timeout and never produces a fast toast (see the explicit comment in `SiteCallsPageTests.RetryClickThrough...`). So the ephemeral instance must live on a **real, running** site — use `site-a`.
**Step 1:** Implement an `IAsyncLifetime` fixture (NOT a collection fixture — instantiate per test class so the shared template/area are created once for the deployment suite):
- `InitializeAsync`: if `!await ClusterAvailability.IsAvailableAsync()` return (tests will skip). Else resolve `SiteAId = await CliRunner.ResolveSiteIdAsync("site-a")`; create `TemplateId = await CliRunner.CreateTemplateAsync(UniqueName("deploytmpl"))`; `await CliRunner.AddAttributeAsync(TemplateId, "Value", "Double")` (so it validates); `AreaId = await CliRunner.CreateAreaAsync(SiteAId, UniqueName("area"))`.
- `Task<int> CreateInstanceAsync()` → `CliRunner.CreateInstanceAsync(UniqueName("inst"), TemplateId, SiteAId, AreaId)`.
- `DisposeAsync`: best-effort delete the area + template (instances are deleted by the tests; also list+delete any leftover `zztest-inst-*` on site-a as a safety net).
**Step 2:** No standalone test — it's exercised by Tasks 5–7. Build only (`dotnet build TESTS`). **Step 3:** Commit: `test(e2e): add DeploymentFixture (ephemeral instance on site-a)`.
---
## Task 5: `DeploymentActionTests.Deploy`
**Classification:** standard
**Estimated implement time:** ~5 min
**Parallelizable with:** none (the shared DeploymentFixture file is only read, but tests in the same class serialize)
**Blocked by:** Task 4
**Files:**
- Create: `TESTS/Deployment/DeploymentActionTests.cs`
**Step 1 — Test (TDD: write, watch it fail to compile, then it should pass against the live cluster):**
```csharp
/// <summary>
/// Topology action tests. Runs in the serial "Playwright" collection and shares one
/// <c>DeploymentFixture</c> per class; each test creates and deletes its own instance.
/// </summary>
[Collection("Playwright")]
public class DeploymentActionTests : IClassFixture<DeploymentFixture>, IAsyncLifetime
{
    private readonly PlaywrightFixture _pw; // via collection
    private readonly DeploymentFixture _cluster;

    public DeploymentActionTests(PlaywrightFixture pw, DeploymentFixture cluster)
    {
        _pw = pw;
        _cluster = cluster;
    }

    // No per-test setup/teardown; each test cleans up its own instance in `finally`.
    public Task InitializeAsync() => Task.CompletedTask;
    public Task DisposeAsync() => Task.CompletedTask;

    /// <summary>
    /// Right-clicks the instance's Topology tree row, chooses Deploy, and asserts that
    /// exactly one toast appears. The toast content is not inspected, so any relay
    /// outcome is tolerated.
    /// </summary>
    [SkippableFact]
    public async Task Deploy_Instance_ShowsOutcomeToast()
    {
        Skip.IfNot(await ClusterAvailability.IsAvailableAsync(), ClusterAvailability.SkipReason);

        // The fixture must return (id, name): the id drives CLI teardown, the unique
        // name is the only way to find the instance's row in the tree.
        var (instanceId, instanceName) = await _cluster.CreateInstanceAsync();
        try
        {
            var page = await _pw.NewAuthenticatedPageAsync();
            await page.GotoAsync($"{PlaywrightFixture.BaseUrl}/deployment/topology");
            await page.WaitForLoadStateAsync(LoadState.NetworkIdle);

            // Expand to the instance; locate its tree row by the unique name, right-click → Deploy.
            var row = page.Locator("li[role=treeitem] .tv-row", new() { HasText = instanceName });
            await ExpandToInstanceAsync(page, row); // helper: click ancestor toggles until visible
            await row.ClickAsync(new() { Button = MouseButton.Right });
            await page.Locator(".dropdown-menu.show button.dropdown-item", new() { HasText = "Deploy" }).ClickAsync();

            // Deploy has NO confirm dialog → outcome toast directly.
            var toast = page.Locator(".toast");
            await Assertions.Expect(toast).ToBeVisibleAsync(new() { Timeout = 15_000 });
            Assert.Equal(1, await toast.CountAsync());
        }
        finally
        {
            await CliRunner.DeleteInstanceAsync(instanceId);
        }
    }
}
```
- Note: capture the instance unique name from `CreateInstanceAsync` (extend the fixture to return `(int id, string name)` so the test can locate the row). Adjust the helper accordingly.
- `ExpandToInstanceAsync`: click the site-a node then the area node toggles (`.tv-row` chevrons) so the instance row renders; or use the page's `Expand` toolbar button (`button[aria-label="Expand all areas"]`) — simpler: click Expand, then locate the row.
**Step 2:** Run `dotnet test --filter Deploy_Instance` → PASS (toast appears; outcome may be Deployed or a fast error — tolerant). **Step 3:** Commit: `test(e2e): cover Topology Deploy action`.
---
## Task 6: `DeploymentActionTests.Enable` + `Disable`
**Classification:** standard
**Estimated implement time:** ~5 min
**Parallelizable with:** none (same file as Task 5 — sequential)
**Blocked by:** Task 5
**Files:** Modify `TESTS/Deployment/DeploymentActionTests.cs`
- `Enable_Instance_ShowsOutcomeToast`: deploy first (Enable only shows when state==Disabled; flow depends on cluster state). Simpler and outcome-tolerant: right-click → if `Enable` item present click it, else click `Disable`; assert toast. To keep it deterministic, target **Disable**: after a Deploy the instance is `Enabled`, so `Disable` is offered. `Disable` opens a confirm dialog (danger → `.modal-footer .btn-danger` text `Delete`); click it, then assert toast.
- `Disable_Instance_ShowsOutcomeToast`: deploy → right-click → `Disable` → confirm `.modal-footer .btn-danger` → assert single `.toast`.
- Each test creates its own instance via `_cluster.CreateInstanceAsync()` and deletes it in `finally`.
**Step:** Run filter → PASS. Commit: `test(e2e): cover Topology Enable/Disable actions`.
---
## Task 7: `DeploymentActionTests.Delete`
**Classification:** standard
**Estimated implement time:** ~4 min
**Parallelizable with:** none (same file)
**Blocked by:** Task 6
**Files:** Modify `TESTS/Deployment/DeploymentActionTests.cs`
- `Delete_Instance_RemovesFromTree`: create instance → Topology → locate row → right-click → `Delete` (`button.dropdown-item.text-danger`) → confirm `.modal-footer .btn-danger` (text `Delete`) → assert the row with that instance name is no longer visible (`await Assertions.Expect(row).ToHaveCountAsync(0)` within a timeout). No CLI delete needed in `finally` (the UI deleted it) but call `CliRunner.DeleteInstanceAsync` best-effort anyway in case the UI delete failed.
**Step:** Run → PASS. Commit: `test(e2e): cover Topology Delete action`.
---
## Task 8: NotificationOutbox retry/discard + ParkedMessages query (rec 2)
**Classification:** standard
**Estimated implement time:** ~5 min
**Parallelizable with:** Task 4, Task 9
**Blocked by:** Task 2
**Files:**
- Create: `TESTS/Notifications/NotificationDataSeeder.cs` (mirror `SiteCallDataSeeder`; INSERT a `Parked` row into the central `Notifications` table — read the Notifications EF entity at `src/ZB.MOM.WW.ScadaBridge.Commons/Entities/.../Notification*.cs` + its migration to get exact columns; tag a unique field, e.g. a `zztest-*` recipient/list/target, for teardown).
- Create: `TESTS/Notifications/NotificationActionTests.cs`
- Create: `TESTS/Monitoring/ParkedMessagesTests.cs`
**Notification retry/discard (seedable — central singleton, fast):**
- `Retry_ParkedNotification_ShowsOutcomeToast`: seed a `Parked` Notifications row with a unique marker → `/notifications/report` → filter to status Parked / search the marker → Query → find row → click row's `Retry` (`button.btn-outline-success`) → confirm dialog non-danger `.modal-footer .btn-primary` (text `Confirm`) → assert single `.toast`. Teardown: delete the seeded row.
- `Discard_ParkedNotification_ShowsOutcomeToast`: same but `Discard` (`button.btn-outline-danger`) → confirm danger `.modal-footer .btn-danger` (text `Delete`) → toast.
**ParkedMessages query (NOT seedable — site SQLite over Akka Ask; deterministic render test):**
- `ParkedMessages_QueryForSite_RendersWithoutHang`: `/monitoring/parked-messages` → select `#pm-filter-site` = `site-a` (option value = SiteIdentifier) → click `Query` (`button.btn-primary` text `Query`) → assert that within a timeout EITHER `table.parked-table` OR an empty-state element renders (i.e. the singleton-backed query resolved, not hung). This guards the Akka-Ask hang class without needing to seed site state. (Document in a comment why retry/discard isn't exercised here: parked S&F messages live in site SQLite and can't be seeded from central SQL.)
**Steps:** TDD each (write → fails to compile → passes live). Run `dotnet test --filter "NotificationActionTests|ParkedMessagesTests"` → PASS. Commit: `test(e2e): cover notification retry/discard + parked-messages query`.
---
## Task 9: Transport Import round-trip (rec 3)
**Classification:** standard
**Estimated implement time:** ~5 min
**Parallelizable with:** Task 4, Task 8
**Blocked by:** Task 2
**Files:**
- Create: `TESTS/Transport/TransportImportTests.cs`
**De-risk file upload FIRST (Step 0):** Playwright `SetInputFilesAsync(hostPath)` over a remote browser connection streams the host file to the container — verify with a 3-line spike before building the rest (upload the exported bundle to `#bundle-input`, assert `[data-testid="manifest-summary"]` appears). If streaming fails, fall back to writing the bundle to a path the container can read (a shared mount) and note it.
**Test `ImportSyntheticBundle_AppliesAndShowsAuditDrillIn`:**
1. `Skip.IfNot(...cluster...)`. `var env = CliRunner.UniqueName("env");` `var tmplName = CliRunner.UniqueName("imp");`
2. CLI: `int tmplId = await CliRunner.CreateTemplateAsync(tmplName); await CliRunner.AddAttributeAsync(tmplId,"Value","Double");`
3. CLI: `var bundlePath = Path.Combine(Path.GetTempPath(), $"{tmplName}.scadabundle"); await CliRunner.BundleExportAsync(bundlePath, tmplId, passphrase:"pw-"+env, sourceEnvironment:env);`
4. **Delete the source template via CLI** so the import sees it as a NEW item (not Modified): `await CliRunner.DeleteTemplateAsync(tmplId);`
5. UI: `/design/transport/import` → `SetInputFiles("#bundle-input", bundlePath)` → Next → `#import-passphrase` = `"pw-"+env` → `Unlock` → diff step (new template renders `Add`, no blockers) → Next → `#confirm-env` type `env` (must equal `Manifest.SourceEnvironment`) → `Apply Import` (`button.btn-danger`).
6. Assert `[data-testid="result-summary"]` text contains `Import complete.` and the `Audit trail →` link href starts `/audit/configuration?bundleImportId=`.
7. `finally`: `foreach (var id in await CliRunner.ListTemplateIdsByNamePrefixAsync(tmplName)) await CliRunner.DeleteTemplateAsync(id);` and `File.Delete(bundlePath)` best-effort.
**Step:** Run `dotnet test --filter TransportImportTests` → PASS; verify no residual `zztest-imp-*` templates (`template list`). Commit: `test(e2e): cover Transport Import apply round-trip`.
---
## Task 10: Site CRUD round-trip (rec 6)
**Classification:** small
**Estimated implement time:** ~4 min
**Parallelizable with:** Task 11, Task 12, Task 13, Task 14
**Blocked by:** Task 2 (safety-net teardown only)
**Files:** Modify `TESTS/SiteCrudTests.cs`
- `CreateEditDelete_Site_RoundTrips`: `/admin/sites/create` → fill (by `<label>`-anchored inputs / `h6 Node A/B`-scoped placeholders): Name=`zztest-site-<hex>`, Identifier=`zztest-<hex>`, Description, and one Node A Akka + gRPC address (any well-formed value — `akka.tcp://scadabridge@zz:5000/user/site-communication`, `http://zz:8083`) → `Save` → assert the new card (`.card` with `.card-title` text == name) appears on `/admin/sites` → Edit → change Description → Save → kebab `⋮` → `Delete` → confirm `.modal-footer .btn-danger` → assert the card is gone. `finally`: best-effort `CliRunner` delete by resolving the site id via `site list` on the identifier.
**Step:** Run → PASS. Commit: `test(e2e): cover Site create/edit/delete round-trip`.
---
## Task 11: Template CRUD round-trip (rec 6)
**Classification:** standard
**Estimated implement time:** ~5 min
**Parallelizable with:** Task 10, Task 12, Task 13, Task 14
**Blocked by:** Task 2 (safety-net teardown)
**Files:** Create `TESTS/Design/TemplateCrudTests.cs`
- `CreateAddAttributeDelete_Template_RoundTrips`: `/design/templates/create` → Name=`zztest-tmpl-<hex>` → `Create` (`button.btn-success`) → lands on `/design/templates/{id}` → Attributes tab → `Add Attribute` → modal: Name=`Val`, Data Type select → `Add` (submit text is `Add` for new) → assert the attribute row appears → header `Delete` (`button.btn-outline-danger`) → confirm `.modal-footer .btn-danger` → assert redirect to `/design/templates` and the node is gone. `finally`: `ListTemplateIdsByNamePrefixAsync("zztest-tmpl-")`→delete.
**Step:** Run → PASS. Commit: `test(e2e): cover Template create/add-attribute/delete round-trip`.
---
## Task 12: LDAP mapping CRUD round-trip (rec 6)
**Classification:** small
**Estimated implement time:** ~4 min
**Parallelizable with:** Task 10, Task 11, Task 13, Task 14
**Blocked by:** none (pure UI; safety-net via direct SQL optional)
**Files:** Create `TESTS/Admin/LdapMappingCrudTests.cs`
- `CreateEditDelete_LdapMapping_RoundTrips`: `/admin/ldap-mappings/create` → LDAP Group Name (label-anchored input)=`zztest-grp-<hex>` → Role select (`.form-select`) = `Designer` → `Save` → assert the row (`tr` whose group-name `td` == the group) appears on `/admin/ldap-mappings` → Edit → change Role to `Viewer` → Save → kebab `⋮` → `Delete`. **Important:** LdapMappings Delete has **NO confirm dialog** (the page doesn't inject `IDialogService`) — clicking `Delete` removes the row immediately. Assert the row disappears (do NOT wait for a modal). `finally`: best-effort delete (UI did it; optional SQL net keyed on the unique group name).
**Step:** Run → PASS. Commit: `test(e2e): cover LDAP mapping create/edit/delete round-trip`.
---
## Task 13: Navigation render-assertion hardening (rec 4)
**Classification:** small
**Estimated implement time:** ~5 min
**Parallelizable with:** Task 10, Task 11, Task 12, Task 14
**Blocked by:** none
**Files:** Modify `TESTS/NavigationTests.cs`
- Add a per-route expected-heading map (route → expected `h1/h4/h5` text), e.g. `/admin/sites`→"Site Management", `/notifications/report`→"Notification Report", `/design/templates`→"Templates", `/deployment/topology`→(its heading), `/monitoring/health`→(its heading), etc. (Read each target page's heading to fill the map exactly.)
- In the shared `ClickNavAndWait` helper (or each Theory), after asserting the URL, also `await Assertions.Expect(page.Locator("h1,h4,h5", new(){ HasText = expectedHeading })).ToBeVisibleAsync()`. This catches a route that 500s after navigation.
- Keep all existing Theory cases; just strengthen the assertion. No new test methods.
**Step:** Run `dotnet test --filter NavigationTests` → PASS (all routes render). Commit: `test(e2e): assert destination renders, not just URL, in nav tests`.
---
## Task 14: Health KPI load test (rec 5)
**Classification:** small
**Estimated implement time:** ~4 min
**Parallelizable with:** Task 10, Task 11, Task 12, Task 13
**Blocked by:** none
**Files:** Create `TESTS/Monitoring/HealthDashboardTests.cs`
- `KpiTiles_ResolveToValues_NotDegradePlaceholder`: `/monitoring/health` → wait for load → assert the three KPI groups resolved to values (not the `—` em-dash degrade):
- Notification-Outbox (inlined, no data-test): `.card` whose `.text-muted` label is `Queue Depth` → its sibling `h3` text matches `^\d+$` (and `!= "—"`). Repeat for `Stuck`, `Parked`.
- Audit tiles: `[data-test='audit-kpi-volume']` (+ `error-rate`, `backlog`) text is non-empty/non-`—`.
- Site-Call tiles: `[data-test='site-call-kpi-buffered']` (+ `stuck`, `parked`).
- Use a generous per-tile wait (the page polls/loads async). This is the direct regression guard for the singleton-hang class.
**Step:** Run → PASS. Commit: `test(e2e): assert Health KPI tiles resolve (singleton-hang guard)`.
---
## Task 15: Full-suite verification + no-residue check
**Classification:** small
**Estimated implement time:** ~4 min
**Parallelizable with:** none (final)
**Blocked by:** all
**Steps:**
1. `dotnet test tests/ZB.MOM.WW.ScadaBridge.CentralUI.PlaywrightTests` → expect **0 failed**; note passed/skipped counts and confirm the skip-summary log line prints.
2. `dotnet <CLI.dll> ... site list` / `template list` / `instance list --site-id <site-a>` → grep for `zztest-` → expect **none** (clean teardown). If any leak, fix the offending test's `finally`.
3. Commit any cleanup. Final message summarizing new coverage (count of added tests, the recs covered).
---
## Notes / open risks (surface, don't silently absorb)
- **CLI JSON id field names** (Task 2): confirm the actual field (`id` vs `templateId`/`instanceId`/`siteId`) from a live `create` response before finalizing the parsers.
- **`bundle export` single-template selection** (Task 2/9): verify the CLI supports scoping the export to one template; if it only does `--all`, adjust Task 9 (export a minimal cluster or accept a larger bundle and assert only the zztest template landed).
- **Remote file upload** (Task 9 Step 0): de-risk `SetInputFiles` over the WS-connected browser before building the full import test.
- **Deploy side-effects on site-a** (Tasks 5–7): ephemeral instances are deployed to the real site-a and deleted in `finally`; the no-residue check (Task 15) is the backstop.
- **Outcome tolerance:** deploy/enable/disable/retry/discard assert *a* toast, not a specific outcome — the live cluster decides Deployed/Applied vs SiteUnreachable/NotParked. This is intentional (matches the existing SiteCalls relay test).
@@ -0,0 +1,23 @@
{
"planPath": "docs/plans/2026-06-05-playwright-coverage-expansion.md",
"lastUpdated": "2026-06-05T00:00:00Z",
"nativeTaskIdBase": 57,
"tasks": [
{"id": 0, "nativeId": 57, "subject": "Task 0: Add CLI ProjectReference to test project", "status": "pending"},
{"id": 1, "nativeId": 58, "subject": "Task 1: CliRunner core + ClusterAvailability probe", "status": "pending", "blockedBy": [0]},
{"id": 2, "nativeId": 59, "subject": "Task 2: CliRunner typed fixture helpers", "status": "pending", "blockedBy": [1]},
{"id": 3, "nativeId": 60, "subject": "Task 3: Standardize skip policy + skip-count logging", "status": "pending", "blockedBy": [1]},
{"id": 4, "nativeId": 61, "subject": "Task 4: DeploymentFixture (ephemeral instance on site-a)", "status": "pending", "blockedBy": [2]},
{"id": 5, "nativeId": 62, "subject": "Task 5: DeploymentActionTests.Deploy", "status": "pending", "blockedBy": [4]},
{"id": 6, "nativeId": 63, "subject": "Task 6: DeploymentActionTests.Enable + Disable", "status": "pending", "blockedBy": [5]},
{"id": 7, "nativeId": 64, "subject": "Task 7: DeploymentActionTests.Delete", "status": "pending", "blockedBy": [6]},
{"id": 8, "nativeId": 65, "subject": "Task 8: Notification retry/discard + ParkedMessages query", "status": "pending", "blockedBy": [2]},
{"id": 9, "nativeId": 66, "subject": "Task 9: Transport Import round-trip", "status": "pending", "blockedBy": [2]},
{"id": 10, "nativeId": 67, "subject": "Task 10: Site CRUD round-trip", "status": "pending", "blockedBy": [2]},
{"id": 11, "nativeId": 68, "subject": "Task 11: Template CRUD round-trip", "status": "pending", "blockedBy": [2]},
{"id": 12, "nativeId": 69, "subject": "Task 12: LDAP mapping CRUD round-trip", "status": "pending", "blockedBy": [1]},
{"id": 13, "nativeId": 70, "subject": "Task 13: Navigation render-assertion hardening", "status": "pending"},
{"id": 14, "nativeId": 71, "subject": "Task 14: Health KPI load test", "status": "pending", "blockedBy": [1]},
{"id": 15, "nativeId": 72, "subject": "Task 15: Full-suite verification + no-residue check", "status": "pending", "blockedBy": [3, 7, 8, 9, 10, 11, 12, 13, 14]}
]
}
@@ -0,0 +1,213 @@
# Playwright Coverage Fill — Design
**Date:** 2026-06-06
**Status:** Approved (brainstorming complete) → ready for writing-plans
**Component:** #9 Central UI — `tests/ZB.MOM.WW.ScadaBridge.CentralUI.PlaywrightTests`
**Predecessor:** [2026-06-05 Playwright Coverage Expansion](2026-06-05-playwright-coverage-expansion-design.md) (the first wave; this is the close-out)
## Goal
Close the remaining functional and edge-case gaps found in the 2026-06-06 coverage
re-audit — every untested Tier 1–3 page plus a cross-cutting edge-case sweep on the
already-covered pages — delivered as **4 risk-tiered waves** inside the existing
xunit + `PlaywrightFixture` harness, against the live 8-node docker cluster.
## Background — the re-audit
After the first expansion (suite at 83 passing), a re-audit mapped all 38 routable pages
against the test catalog. Findings:
- ~14 of 38 pages have real functional coverage; the rest are nav-only (h4 heading) or
untested.
- **Biggest blind spot:** `InstanceConfigure` (`/deployment/instances/{id}/configure`) —
the most complex mutating page (5 override subsystems, browse dialog, test-bindings) —
has zero coverage.
- Edge cases are systematically uncovered across *all* pages: no duplicate-name validation,
no cancel flows, no empty states, no pagination, no filter-combination, no error/500
paths, no wrong-passphrase import.
- The suite is happy-path + outcome-tolerant by design (a deliberate, defensible choice
that matches the relay reality) — but a page that renders-but-misbehaves on a non-default
input can still stay green.
## Decisions (settled during brainstorming)
| # | Decision | Choice | Rationale |
|---|----------|--------|-----------|
| D1 | How much to take on | **Everything** — Tier 1–3 functional gaps + the edge sweep | User chose full close-out; phased so it stays tractable. |
| D2 | Structure | **Risk-tiered phased waves** (4) | Front-loads highest-risk surfaces; each wave is a shippable, green, zero-residue increment; mirrors the structure that worked for wave 1. |
| D3 | Test selectors on hook-poor pages | **Add `data-test` hooks as needed** (minimal, additive, non-functional) | Deep assertions on the complex forms (esp. `InstanceConfigure`) are otherwise brittle; matches the convention the audited Audit/SiteCalls/Transport pages already follow. |
| D4 | Real-time / streaming tests | **Drive real behavior, outcome-tolerant, generous timeouts, `SkippableFact`** | Highest value; managed flake risk on a handful of tests, with a documented downgrade-to-render-guard fallback. |
## Key enabler — the CLI mirrors the whole surface
The `scadabridge` CLI exposes create/list/delete (and read-back) for **every** entity the
new suites need, including the entire `InstanceConfigure` surface:
- `instance set-bindings | set-overrides | alarm-override set/delete/list |
native-alarm-source set/clear | set-area | diff | get`
- `security api-key create/list/delete/update/set-methods`
- `data-connection`, `db-connection`, `external-system` (+ `method`), `api-method`,
`shared-script`, `notification` (list/create/delete + `smtp`) create/list/delete
→ New functional suites need **no DB seeders**. Fixtures are CLI-provisioned and
persistence is verified by CLI read-back (`instance get`, `security api-key list`). DB
seeding remains only in the existing Audit/SiteCalls/Notification *report* edge tests that
require a `Parked` row (site SQLite isn't reachable, so those stay direct-SQL).
## Wave structure
| Wave | Theme | New suites | New fixtures |
|------|-------|-----------|--------------|
| **1** | Tier 1 — deepest mutating surfaces | `InstanceConfigureTests`, `ApiKeyCrudTests`, `TransportExportTests` (+ wrong-passphrase import negative) | `InstanceConfigureFixture`, `ApiSurfaceFixture` |
| **2** | Tier 2 — real-time / relay | `DeploymentsRealtimeTests`, `TopologyAreaTests`, `DebugViewTests`, `ParkedMessagesActionTests`, Discard click-throughs (SiteCalls + Notification) | reuse `DeploymentFixture` / `InstanceConfigureFixture` |
| **3** | Tier 3 — config CRUD breadth | `NotificationListCrudTests`, `SmtpConfigTests`, `NotificationKpisTests`, `DataConnectionCrudTests`, `ExternalSystemCrudTests`, `SharedScriptCrudTests`, `ApiMethodFormTests`, `EventLogsTests`, `AuditConfigurationTests` | small per-suite CLI helpers |
| **4** | Cross-cutting edge sweep | extend `SiteCrudTests`, `TemplateCrudTests`, `LdapMappingCrudTests`, `AuditLogPageTests`, `SiteCallsPageTests`, `NotificationActionTests` | none |
Each wave ends green with zero residue — a clean stop/ship point. Inside a wave, suites
are independent (disjoint fixtures/files) → parallel-dispatchable. Rough size: W1 ≈ 3 files
/~12 tests, W2 ≈ 5 files/~14, W3 ≈ 9 files/~20, W4 ≈ 6 extended files/~20. Total ≈
**23 files, ~65 new cases.**
## Shared infrastructure
**CLI helper extensions** (`CliRunner.Helpers.cs`, same throw-vs-swallow split):
- *Provision (throw):* `CreateDataConnectionAsync`, `CreateExternalSystemAsync`,
`CreateApiMethodAsync`, `CreateNotificationListAsync`, `CreateSharedScriptAsync`.
- *Verify (read-back, throw):* `GetInstanceAsync(id)`, `ListApiKeysAsync`.
- *Teardown (best-effort):* `DeleteApiKeyAsync`, `DeleteDataConnectionAsync`,
`DeleteExternalSystemAsync`, `DeleteNotificationListAsync`, `DeleteSharedScriptAsync`
all keyed on `zztest-*`.
**New fixtures** (`IAsyncLifetime` + `IClassFixture`, partial-init guard + `Available`
flag, mirroring `DeploymentFixture`):
- **`InstanceConfigureFixture`** — site-a: `zztest` template + attribute(s) + `zztest`
data-connection + an instance, **deployed**. Disposes instance → connection → template.
- **`ApiSurfaceFixture`** — `zztest` external-system + one api-method, so the API-key form
renders method checkboxes. Disposes api-method + external-system; created keys deleted
per-test via `security api-key delete`.
**`data-test` hooks** (minimal, additive, each its own tiny commit so the app diff is
auditable/revertable): `InstanceConfigure.razor` subsystem anchors; row anchors on
`EventLogs.razor` / `ConfigurationAuditLog.razor`; save/result anchors on a few Design
forms. Only where no stable existing selector (id/aria-label/text) exists.
**Reused as-is:** `ClusterAvailability` skip gate + `SkipSummaryReporter`;
`PlaywrightFixture`; toast = `.toast` web-first `ToHaveCountAsync(1)`; confirm =
`.modal-footer .btn-danger` (Delete) / `.btn-primary` (Confirm); `zztest-<kind>-<8hex>`
naming; `PlaywrightDbConnection` (only Wave-4 report-page Parked-row edge tests).
## Per-wave test shapes
### Wave 1 — Tier 1 mutating surfaces
**`InstanceConfigureTests`** (`InstanceConfigureFixture`):
- *Bindings round-trip* — bulk "Assign all to" the zztest connection → Save Bindings →
toast; verify persisted via `GetInstanceAsync` (not just a toast).
- *Attribute-override round-trip* — type text override → Save Overrides → toast →
`GetInstanceAsync` shows it.
- *Area reassignment* — select area → Set Area → toast → `GetInstanceAsync` confirms.
- *(⚠ feasibility)* Alarm-override Edit→Save→badge→Clear — needs a template attribute with
an alarm; resolve via `instance alarm-override set` precondition or downgrade to a
section-renders assertion. Other three tests don't depend on alarms.
- *Edge* — `/deployment/instances/999999/configure` → `alert-danger` not-found.
**`ApiKeyCrudTests`** (`ApiSurfaceFixture`; teardown `security api-key delete`):
- Create→token reveal (`data-test="created-token"`, Copy works); Enable/Disable toast +
badge; Delete confirm `.btn-danger` → row gone; Edit (name disabled, toggle method).
- Edges: empty name → validation; uncheck all methods → "select at least one".
**`TransportExportTests`** + negative import:
- Export — CLI-create zztest template → `/design/transport/export` → select → passphrase →
Export → assert the bundle download (filename/size).
- Wrong-passphrase import — feed the exported bundle to the import wizard with a bad
passphrase → Unlock error state (passphrase input stays, error shown), no `diff-summary`.
### Wave 2 — Tier 2 real-time / relay (drive behavior, tolerant, generous timeouts, `SkippableFact`)
- **`DeploymentsRealtimeTests`** — on `/deployment/deployments`, CLI `instance deploy` a
fixture instance → status row appears within ~20s (SignalR push). Second: Pause → deploy
→ row absent → Refresh → row present.
- **`TopologyAreaTests`** — create area (toolbar + site context-menu); inline rename
(Enter commits + toast; Escape reverts); move area; move instance; *(⚠)* Diff dialog
opens for a deployed instance (tolerant).
- **`DebugViewTests`** — select site-a + enabled instance → Connect → badge "Live" +
snapshot/table region renders (tolerant: rows *or* "Waiting for snapshot"). Disconnect →
tables gone, selects re-enabled.
- **`ParkedMessagesActionTests`** — render+controls guard on the page's own Retry/Discard
button presence/enablement (site SQLite not seedable → no click-through here).
- **Discard click-throughs** — add the symmetric Discard-clicked test to
`SiteCallsPageTests` and `NotificationActionTests` (Retry is already click-through-tested).
### Wave 3 — Tier 3 config CRUD (breadth; CLI teardown by zztest name)
- **`NotificationListCrudTests`** — create → add recipient → delete recipient (no confirm)
→ delete list (confirm + "Deleted." toast).
- **`SmtpConfigTests`** (RequireAdmin) — add (host+from required) → saved toast →
credentials "(stored)"; edit cancel/save.
- **`NotificationKpisTests`** — load → 5 tiles resolve (or "—") → Refresh spinner.
- **`DataConnectionCrudTests` / `ExternalSystemCrudTests` / `SharedScriptCrudTests`** —
create→(edit)→delete round-trips.
- **`ApiMethodFormTests`** — create method (name+timeout, minimal Monaco script, default
schema) → Save → appears under external systems (Monaco interaction minimal/tolerant).
- **`EventLogsTests`** — Search disabled until site selected; select site → Search; filter
by Severity; row expand/collapse; Load more appends.
- **`AuditConfigurationTests`** — load `/audit/configuration`; search narrows; Prev disabled
on page 1; large-state modal open/close; copy-entity-id toast; `?bundleImportId=` chip
drill-in.
### Wave 4 — cross-cutting edge sweep (⚠ verify each page's real validation first)
- **Sites** — duplicate-identifier (assert whatever the app surfaces), cancel-from-edit,
invalid Akka/gRPC URL *if validated*.
- **Templates** — duplicate-name, cancel, edit existing attribute, delete-when-instances-
exist (blocking).
- **LDAP** — duplicate group name, missing-field validation.
- **Audit Log** — filter combination (channel+site+time), empty-results-after-Apply, drawer
close (X/Escape), non-API row has no cURL button, pagination.
- **Site Calls** — filter by status, empty state, keyset pagination (`site-calls-prev/next`
hooks exist, unused today).
- **Notification Report** — filter combos, "Stuck only", detail modal open/close, pagination.
## Error handling & verification strategy
**Validation-behavior protocol (the ⚠ items).** Before asserting a specific failure mode,
the implementer reads the page code-behind to learn what it actually does — inline
`_formError` vs error toast vs DB-constraint bubble vs silent success — and asserts that
reality. Where the app doesn't validate (relies on a DB constraint with no friendly
message), the test asserts the real surfaced behavior and a code comment notes the gap.
Same protocol resolves the alarm-override and Diff-dialog feasibility ⚠s.
**Flake management (real-time/streaming).** Web-first assertions only (`Expect(...)` with
explicit timeouts), never `WaitForTimeout` + read. Generous ceilings (~20s). Outcome-
tolerant where the cluster's response isn't deterministic. All `SkippableFact` +
`ClusterAvailability` gated. Documented fallback: if a real-time test is irreducibly flaky,
downgrade *that test only* to a render+controls guard.
**Teardown & residue.** Best-effort cleanup in `DisposeAsync`/`finally`, keyed on
`zztest-*`. Each wave ends with the no-residue check (`site/template/instance/
data-connection/api-key/notification list` via CLI + DB marker scan) → zero. Mutating tests
that target real site-a leave it as found.
**Per-wave gate.** Wave is "done" only when: new tests pass against the live cluster, the
full suite stays at **0 failed** (skips logged), zero residue, and `dotnet build` is clean
(`TreatWarningsAsErrors=true`). `data-test` additions verified not to alter rendered
behavior.
## Scope guard (YAGNI)
No new page-object framework, no CI wiring, no parallelization/runner changes, no
visual-regression/screenshot testing, no perf testing. New `data-test` attributes are the
only app-code change and are purely additive. Everything slots into the existing structure.
## Success criteria
All 4 waves merged; ~23 files / ~65 new cases; every Tier 1–3 page from the audit has
functional coverage; the edge sweep adds duplicate/cancel/empty/filter-combo/pagination
assertions to the previously happy-path-only pages; suite green with logged skips; zero
residue.
## Native tasks
Brainstorming checklist tasks #73–#78 track this design through to writing-plans. The
implementation plan (produced next by the writing-plans skill) carries its own task set,
one wave at a time.
@@ -0,0 +1,700 @@
# Playwright Coverage Fill — Wave 1 Implementation Plan
> **For Claude:** REQUIRED SUB-SKILL: Use superpowers-extended-cc:executing-plans (or subagent-driven-development) to implement this plan task-by-task.
**Goal:** Add deep functional coverage for the three highest-risk untested mutating surfaces — `InstanceConfigure`, API-key create/edit/list, and Transport Export (+ a wrong-passphrase import negative) — with edge cases folded in.
**Architecture:** Extends the existing xunit + `PlaywrightFixture` harness. New ephemeral fixtures are CLI-provisioned on the live cluster and verified by CLI read-back (`instance get`, `security api-key list`); no DB seeding. All cluster-dependent tests are `[SkippableFact]` gated on `ClusterAvailability`. Cleanup is best-effort, keyed on `zztest-*`. Toast asserts are web-first `ToHaveCountAsync(1)`. A small number of additive, non-functional `data-test` attributes are added to `InstanceConfigure.razor`.
**Tech Stack:** .NET 10, xunit + `Xunit.SkippableFact`, Microsoft.Playwright (remote Chromium at `ws://localhost:3000`), the `scadabridge` CLI (`dotnet scadabridge.dll … --format json`).
**Reference design:** `docs/plans/2026-06-06-playwright-coverage-fill-design.md` (this is Wave 1 of 4).
**Conventions (carry into every task):**
- Test files use `[Collection("Playwright")]`; cluster tests use `Skip.IfNot(await ClusterAvailability.IsAvailableAsync(), ClusterAvailability.SkipReason)`.
- App URL from the browser is `fixture.BaseUrl` (`http://scadabridge-traefik`); the CLI runs from the host against `localhost:9000` (handled inside `CliRunner`).
- Authenticated page: `await fixture.NewAuthenticatedPageAsync("multi-role", "password")`.
- Fixture names: `CliRunner.UniqueName("<kind>")` → `zztest-<kind>-<8hex>`.
- Toast: `await Assertions.Expect(page.Locator(".toast")).ToHaveCountAsync(1, new() { Timeout = 15_000 });`
- Danger confirm: `page.Locator(".modal-footer .btn-danger")`; non-danger: `.modal-footer .btn-primary`.
- Build is `TreatWarningsAsErrors=true`, `Nullable=enable` — no warnings, no unused usings.
**Validation-behavior protocol:** before asserting any *specific* failure/validation message, the implementer Reads the page code-behind and asserts what the app actually surfaces. Where reality differs from this plan's assumption, follow reality and note it in a code comment.
---
## Task 0: CLI helper extensions (data-connection, api-method, api-key teardown, instance read-back)
**Classification:** standard
**Estimated implement time:** ~5 min
**Parallelizable with:** none (foundation for the rest)
**Files:**
- Modify: `tests/ZB.MOM.WW.ScadaBridge.CentralUI.PlaywrightTests/Cluster/CliRunner.Helpers.cs`
- Test: `tests/ZB.MOM.WW.ScadaBridge.CentralUI.PlaywrightTests/Cluster/CliRunnerHelpersTests.cs`
**Context:** `CliRunner` is a `static partial class`. Add new helpers mirroring the existing throw-vs-swallow split: provision/read helpers throw (`RequireId`/`RunJsonAsync`); `Delete*` helpers swallow. Verified CLI signatures:
- `data-connection create --site-id <int> --name <string> --protocol <string> [--primary-config <string>]` → JSON object with `id`. `data-connection delete --id <int>`.
- `api-method create --name <string> --script <string> [--timeout <int>]` → JSON object with `id`. `api-method delete --id <int>`. `api-method list` → array of `{id,name}`.
- `security api-key list` → array of `{keyId,name,enabled}`. `security api-key delete --key-id <string>` (key id is a **string**, so it cannot use the int-based `BestEffortAsync`).
- `instance get --id <int>` → object with `connectionBindings[] {attributeName,dataConnectionId}`, `attributeOverrides[] {attributeName,overrideValue}`, `areaId`.
**Step 1: Add the helpers** to `CliRunner.Helpers.cs` (inside the partial class):
```csharp
/// <summary>
/// Provisions a data connection on the given site by invoking <c>data-connection create</c>.
/// </summary>
/// <param name="siteId">Site that will own the connection; formatted with the invariant culture.</param>
/// <param name="name">Value passed as <c>--name</c>.</param>
/// <param name="protocol">Value passed as <c>--protocol</c>.</param>
/// <param name="primaryConfig">
/// Optional value for <c>--primary-config</c>; the flag is omitted when this is null or empty.
/// </param>
/// <returns>The <c>id</c> extracted from the CLI's JSON response.</returns>
public static async Task<int> CreateDataConnectionAsync(int siteId, string name, string protocol = "OpcUa", string? primaryConfig = null)
{
    List<string> cliArgs =
    [
        "data-connection", "create",
        "--site-id", siteId.ToString(System.Globalization.CultureInfo.InvariantCulture),
        "--name", name,
        "--protocol", protocol,
    ];

    // Only emit the optional flag when a non-empty config was supplied.
    if (primaryConfig is { Length: > 0 })
    {
        cliArgs.Add("--primary-config");
        cliArgs.Add(primaryConfig);
    }

    using var response = await RunJsonAsync([.. cliArgs]);
    return RequireId(response, "data-connection create");
}
/// <summary>
/// Teardown helper: removes a data connection through <c>data-connection delete</c>, best-effort.
/// </summary>
public static Task DeleteDataConnectionAsync(int id)
{
    return BestEffortAsync("data-connection", "delete", id);
}
/// <summary>
/// Provisions an inbound API method by invoking <c>api-method create</c>, so that the method is
/// available as a checkbox in the API-key form.
/// </summary>
/// <param name="name">Value passed as <c>--name</c>.</param>
/// <param name="script">Value passed as <c>--script</c>.</param>
/// <returns>The <c>id</c> extracted from the CLI's JSON response.</returns>
public static async Task<int> CreateApiMethodAsync(string name, string script = "return null;")
{
    using var response = await RunJsonAsync(
        "api-method", "create",
        "--name", name,
        "--script", script);

    return RequireId(response, "api-method create");
}
/// <summary>
/// Teardown helper: removes an API method through <c>api-method delete</c>, best-effort.
/// </summary>
public static Task DeleteApiMethodAsync(int id)
{
    return BestEffortAsync("api-method", "delete", id);
}
/// <summary>
/// Looks up the opaque string <c>keyId</c> of the API key whose display name equals
/// <paramref name="name"/> (ordinal comparison), using the output of <c>security api-key list</c>.
/// </summary>
/// <param name="name">Display name to search for.</param>
/// <returns>
/// The first matching entry's <c>keyId</c>, or <see langword="null"/> when the response is not a
/// JSON array or no entry has both a matching string <c>name</c> and a string <c>keyId</c>.
/// </returns>
public static async Task<string?> ResolveApiKeyIdByNameAsync(string name)
{
    using var listing = await RunJsonAsync("security", "api-key", "list");
    var root = listing.RootElement;
    if (root.ValueKind != JsonValueKind.Array)
    {
        return null;
    }

    foreach (var entry in root.EnumerateArray())
    {
        var nameMatches = entry.TryGetProperty("name", out var nameProp)
            && nameProp.ValueKind == JsonValueKind.String
            && string.Equals(nameProp.GetString(), name, StringComparison.Ordinal);
        if (!nameMatches)
        {
            continue;
        }

        // A name match without a usable string keyId is skipped; later entries are still examined.
        if (entry.TryGetProperty("keyId", out var idProp) && idProp.ValueKind == JsonValueKind.String)
        {
            return idProp.GetString();
        }
    }

    return null;
}
/// <summary>
/// Teardown helper: removes an API key through <c>security api-key delete --key-id</c>, ignoring
/// any failure. Because the key id is an opaque string, the int-based
/// <see cref="BestEffortAsync"/> cannot be reused here.
/// </summary>
/// <param name="keyId">Opaque string id of the key to delete.</param>
public static async Task DeleteApiKeyAsync(string keyId)
{
    try
    {
        await RunAsync(
            "security", "api-key", "delete",
            "--key-id", keyId);
    }
    catch
    {
        // Deliberately swallowed: a cleanup failure must not hide the failure of the test itself.
    }
}
/// <summary>
/// Fetches an instance's configuration by invoking <c>instance get</c>. The resulting document is
/// used for persistence read-back of <c>connectionBindings</c>, <c>attributeOverrides</c>, and
/// <c>areaId</c>.
/// </summary>
/// <param name="id">Instance id; formatted with the invariant culture for the <c>--id</c> flag.</param>
/// <returns>The parsed CLI response. The caller is responsible for disposing it.</returns>
public static Task<JsonDocument> GetInstanceAsync(int id)
{
    var idArgument = id.ToString(System.Globalization.CultureInfo.InvariantCulture);
    return RunJsonAsync("instance", "get", "--id", idArgument);
}
```
**Step 2: Add round-trip helper tests** to `CliRunnerHelpersTests.cs` (follow the existing `[SkippableFact]` + `Skip.IfNot` pattern). Resolve `site-a` first.
```csharp
[SkippableFact]
public async Task CreateThenDeleteDataConnection_RoundTrips()
{
    // Skipped (not failed) when the live cluster is unreachable.
    Skip.IfNot(await ClusterAvailability.IsAvailableAsync(), ClusterAvailability.SkipReason);

    var siteAId = await CliRunner.ResolveSiteIdAsync("site-a");
    var connectionName = CliRunner.UniqueName("conn");
    var connectionId = await CliRunner.CreateDataConnectionAsync(siteAId, connectionName);
    try
    {
        Assert.True(connectionId > 0);
    }
    finally
    {
        // Always remove the ephemeral connection, even when the assertion fails.
        await CliRunner.DeleteDataConnectionAsync(connectionId);
    }
}
[SkippableFact]
public async Task CreateThenDeleteApiMethod_RoundTrips()
{
    // Skipped (not failed) when the live cluster is unreachable.
    Skip.IfNot(await ClusterAvailability.IsAvailableAsync(), ClusterAvailability.SkipReason);

    var methodName = CliRunner.UniqueName("method");
    var methodId = await CliRunner.CreateApiMethodAsync(methodName);
    try
    {
        Assert.True(methodId > 0);
    }
    finally
    {
        // Always remove the ephemeral API method, even when the assertion fails.
        await CliRunner.DeleteApiMethodAsync(methodId);
    }
}
```
**Step 3: Build + run** — `dotnet test --filter "FullyQualifiedName~CliRunnerHelpersTests"`. Expected: new tests pass (cluster up) or skip (cluster down); 0 failed.
**Step 4: Commit** — `git add -A && git commit -m "test(e2e): add CliRunner helpers for data-connection, api-method, api-key teardown, instance read-back"`
**Acceptance:** helpers compile warning-free; round-trip tests green; no residual `zztest-*` connection/method left behind.
---
## Task 1: InstanceConfigureFixture (ephemeral instance + data-connection on site-a)
**Classification:** standard
**Estimated implement time:** ~5 min
**Parallelizable with:** Task 2, Task 6
**Files:**
- Create: `tests/ZB.MOM.WW.ScadaBridge.CentralUI.PlaywrightTests/Deployment/InstanceConfigureFixture.cs`
**Context:** Mirror `DeploymentFixture` exactly (partial-init guard, `Available` flag, best-effort dispose). Provisions on **site-a**: a `zztest` template + one `Double` attribute named `Value`, a `zztest` **data-connection** (so the bindings UI has a connection to bind to), a `zztest` area (for the area-reassignment test), and one instance created with **no area** (so the reassignment test makes a real change). Deploy is intentionally NOT performed — bindings/overrides/area are pre-deploy config operations, so a non-deployed instance is the correct, simpler fixture.
**Validation-behavior check (do first):** Read `InstanceConfigure.razor.cs` to confirm what populates `_bindingDataSourceAttrs` and `_overrideAttrs`. A plain `Double` attribute is expected to appear in both. If a plain attribute does NOT qualify as a binding data-source, adjust the fixture's attribute (e.g. add the attribute kind the page requires) and note it in a comment.
**Step 1: Write the fixture:**
```csharp
using System.Text.Json;
using ZB.MOM.WW.ScadaBridge.CentralUI.PlaywrightTests.Cluster;
namespace ZB.MOM.WW.ScadaBridge.CentralUI.PlaywrightTests.Deployment;
/// <summary>
/// <see cref="IAsyncLifetime"/> fixture for the InstanceConfigure E2E tests. Provisions, on the real
/// running <c>site-a</c>: a zztest template with a single bindable <c>Double</c> attribute, a zztest
/// data-connection (so the bindings UI has a connection to assign), a zztest area (for the
/// area-reassignment test), and one instance created with no area. The instance is NOT deployed —
/// bindings/overrides/area assignment are pre-deploy configuration operations.
/// </summary>
/// <remarks>
/// Cleanup is partial-init safe: only resources whose id was actually assigned are deleted, so a
/// failure part-way through <see cref="InitializeAsync"/> does not issue delete calls for
/// resources that were never created.
/// </remarks>
public sealed class InstanceConfigureFixture : IAsyncLifetime
{
    private const string SiteAIdentifier = "site-a";

    /// <summary>Id of <c>site-a</c>, resolved from its identifier during initialization.</summary>
    public int SiteAId { get; private set; }

    /// <summary>Id of the fixture template; 0 until the template has been created.</summary>
    public int TemplateId { get; private set; }

    /// <summary>Id of the fixture data-connection; 0 until the connection has been created.</summary>
    public int ConnectionId { get; private set; }

    /// <summary>Id of the fixture area; 0 until the area has been created.</summary>
    public int AreaId { get; private set; }

    /// <summary>Id of the fixture instance; 0 until the instance has been created.</summary>
    public int InstanceId { get; private set; }

    /// <summary>The single bindable/overridable attribute name on the fixture template.</summary>
    public string AttributeName => "Value";

    /// <summary>The fixture data-connection name (for locating it in the bindings UI dropdown).</summary>
    public string ConnectionName { get; private set; } = string.Empty;

    /// <summary>
    /// <see langword="true"/> when the cluster was reachable and provisioning completed;
    /// <see langword="false"/> when the cluster is unavailable or provisioning failed.
    /// </summary>
    public bool Available { get; private set; }

    /// <summary>
    /// Provisions the template, attribute, data-connection, area and instance on <c>site-a</c>.
    /// Does nothing when the cluster is unavailable. On any provisioning failure, whatever was
    /// already created is cleaned up and the original exception is rethrown.
    /// </summary>
    public async Task InitializeAsync()
    {
        Available = await ClusterAvailability.IsAvailableAsync();
        if (!Available)
        {
            return;
        }

        try
        {
            SiteAId = await CliRunner.ResolveSiteIdAsync(SiteAIdentifier);
            TemplateId = await CliRunner.CreateTemplateAsync(CliRunner.UniqueName("cfgtmpl"));
            await CliRunner.AddAttributeAsync(TemplateId, AttributeName, "Double");
            ConnectionName = CliRunner.UniqueName("conn");
            ConnectionId = await CliRunner.CreateDataConnectionAsync(SiteAId, ConnectionName);
            AreaId = await CliRunner.CreateAreaAsync(SiteAId, CliRunner.UniqueName("cfgarea"));
            InstanceId = await CliRunner.CreateInstanceAsync(CliRunner.UniqueName("cfginst"), TemplateId, SiteAId);
        }
        catch
        {
            // Remove whatever was created before the failure, then mark the fixture unusable so
            // DisposeAsync does not repeat the cleanup.
            await SafeCleanupAsync();
            Available = false;
            throw;
        }
    }

    /// <summary>Deletes the provisioned resources; a no-op when the fixture never became available.</summary>
    public async Task DisposeAsync()
    {
        if (!Available)
        {
            return;
        }

        await SafeCleanupAsync();
    }

    /// <summary>
    /// Deletes the fixture's resources in the order instance, data-connection, area, template.
    /// A resource whose id is still 0 was never created and is skipped.
    /// </summary>
    private async Task SafeCleanupAsync()
    {
        // NOTE(review): assumes CLI-assigned ids are always positive, so 0 means "not created" — confirm.
        if (InstanceId > 0)
        {
            await CliRunner.DeleteInstanceAsync(InstanceId);
        }

        if (ConnectionId > 0)
        {
            await CliRunner.DeleteDataConnectionAsync(ConnectionId);
        }

        if (AreaId > 0)
        {
            await CliRunner.DeleteAreaAsync(AreaId);
        }

        if (TemplateId > 0)
        {
            await CliRunner.DeleteTemplateAsync(TemplateId);
        }
    }
}
```
**Step 2: Build** — `dotnet build`. Expected: clean.
**Step 3: Commit** — `git add -A && git commit -m "test(e2e): add InstanceConfigureFixture (template+attr+connection+area+instance on site-a)"`
**Acceptance:** compiles; fields populated; dispose deletes everything (verified by Task 10 residue check).
---
## Task 2: Add data-test hooks to InstanceConfigure.razor
**Classification:** small
**Estimated implement time:** ~3 min
**Parallelizable with:** Task 1, Task 5
**Files:**
- Modify: `src/ZB.MOM.WW.ScadaBridge.CentralUI/Components/Pages/Deployment/InstanceConfigure.razor`
**Context:** The page's `<select>`s and the error alert have only generic Bootstrap classes. Add three additive, non-functional `data-test` attributes. Buttons ("Save Bindings", "Save Overrides", "Set Area") are reliably reachable by role+text and need no hooks.
**Step 1: Add the attributes** (exact locations from the selector audit — re-Read the file to confirm line numbers before editing):
- The bulk "Assign all to…" `<select>` in the bindings card header (~line 87): add `data-test="binding-bulk-select"`.
- The area `<select>` in the Area Assignment card (~line 439): add `data-test="area-select"`.
- The error/not-found `<div class="alert alert-danger">@_errorMessage</div>` (~line 48): add `data-test="instance-error-alert"`.
Example edit (bulk select):
```razor
<select class="form-select form-select-sm" data-test="binding-bulk-select" @bind="_bulkConnectionId">
```
**Step 2: Build** — `dotnet build src/ZB.MOM.WW.ScadaBridge.CentralUI`. Expected: clean (attributes are inert).
**Step 3: Commit** — `git add -A && git commit -m "feat(centralui): add data-test hooks to InstanceConfigure selects + error alert (test instrumentation)"`
**Acceptance:** the three `data-test` attributes render; no behavioral/markup change beyond the attributes.
---
## Task 3: InstanceConfigureTests — bindings round-trip
**Classification:** standard
**Estimated implement time:** ~5 min
**Parallelizable with:** Task 6, Task 9 (different files)
**Files:**
- Create: `tests/ZB.MOM.WW.ScadaBridge.CentralUI.PlaywrightTests/Deployment/InstanceConfigureTests.cs`
**Depends on:** Task 0, Task 1, Task 2.
**Test:** Bulk-assign all attributes to the fixture connection → Save Bindings → assert one toast → verify persisted via `instance get`.
**Step 1: Write the test class + first test:**
```csharp
using System.Text.Json;
using Microsoft.Playwright;
using ZB.MOM.WW.ScadaBridge.CentralUI.PlaywrightTests.Cluster;
namespace ZB.MOM.WW.ScadaBridge.CentralUI.PlaywrightTests.Deployment;
[Collection("Playwright")]
public sealed class InstanceConfigureTests : IClassFixture<InstanceConfigureFixture>
{
    private readonly PlaywrightFixture _fixture;
    private readonly InstanceConfigureFixture _cfg;

    public InstanceConfigureTests(PlaywrightFixture fixture, InstanceConfigureFixture cfg)
        => (_fixture, _cfg) = (fixture, cfg);

    /// <summary>
    /// Bulk-assigns the fixture connection in the bindings card, saves, and confirms the binding
    /// for the fixture attribute was persisted by reading the instance back through the CLI.
    /// </summary>
    [SkippableFact]
    public async Task BindAllAttributes_SavesAndPersists()
    {
        Skip.IfNot(_cfg.Available, ClusterAvailability.SkipReason);

        var page = await _fixture.NewAuthenticatedPageAsync("multi-role", "password");
        var configureUrl = $"{_fixture.BaseUrl}/deployment/instances/{_cfg.InstanceId}/configure";
        await page.GotoAsync(configureUrl);

        // Pick the fixture connection in the bulk selector, apply it, then save.
        var bulkSelect = page.Locator("[data-test='binding-bulk-select']");
        await bulkSelect.SelectOptionAsync(new SelectOptionValue { Label = _cfg.ConnectionName });

        var applyButton = page.GetByRole(AriaRole.Button, new() { Name = "Apply" });
        await applyButton.ClickAsync();

        var saveButton = page.GetByRole(AriaRole.Button, new() { Name = "Save Bindings" });
        await saveButton.ClickAsync();

        var toasts = page.Locator(".toast");
        await Assertions.Expect(toasts).ToHaveCountAsync(1, new() { Timeout = 15_000 });

        // The toast alone is not proof of persistence; read the instance back via the CLI.
        using var instanceDoc = await CliRunner.GetInstanceDocumentAsync(_cfg.InstanceId);
        var bindingArray = instanceDoc.RootElement.GetProperty("connectionBindings");

        var isBound = false;
        foreach (var binding in bindingArray.EnumerateArray())
        {
            if (binding.GetProperty("attributeName").GetString() == _cfg.AttributeName
                && binding.GetProperty("dataConnectionId").GetInt32() == _cfg.ConnectionId)
            {
                isBound = true;
                break;
            }
        }

        Assert.True(isBound, "Expected the Value attribute to be bound to the fixture connection after Save Bindings.");
    }
}
```
**Note (do first):** confirm the bulk `<select>` option label is the connection *name* (the audit indicates options are connection names). If the option text differs, select by the rendered text. Confirm `SelectOptionValue { Label = … }` matches; if the option value is the connection id, select by `Value = _cfg.ConnectionId.ToString()` instead.
**Step 2: Run** — `dotnet test --filter "FullyQualifiedName~InstanceConfigureTests.BindAllAttributes"`. Expected: pass (cluster up) / skip (down).
**Step 3: Commit** — `git add -A && git commit -m "test(e2e): InstanceConfigure bindings round-trip (bulk assign → save → verify via instance get)"`
**Acceptance:** test drives the real bindings save and verifies persistence by read-back; leaves no residue (fixture owns cleanup).
---
## Task 4: InstanceConfigureTests — attribute-override + area reassignment + not-found edge
**Classification:** standard
**Estimated implement time:** ~5 min
**Parallelizable with:** none (same file as Task 3 → serial after it)
**Files:**
- Modify: `tests/ZB.MOM.WW.ScadaBridge.CentralUI.PlaywrightTests/Deployment/InstanceConfigureTests.cs`
**Depends on:** Task 3.
**Tests (add three methods):**
1. **Attribute-override round-trip** — type an override value for `Value` in the Attribute Overrides section → "Save Overrides" → one toast → `instance get` shows `attributeOverrides` containing `{attributeName:"Value", overrideValue:<typed>}`. The per-attribute override input is the text `<input class="form-control form-control-sm">` in the overrides card row; locate it by scoping to the overrides card and the row whose label cell text is `Value` (re-Read the section to confirm the row structure; if ambiguous, add `data-test="override-input-Value"` to the input as a 4th hook in Task 2's spirit and reference it).
2. **Area reassignment** — `data-test='area-select'` → select the fixture area by its name → "Set Area" → one toast → `instance get` shows `areaId == _cfg.AreaId`.
3. **Not-found edge** — `GotoAsync(.../deployment/instances/999999999/configure)` → assert `page.Locator("[data-test='instance-error-alert']")` visible and contains text `not found` (confirm exact wording `Instance #999999999 not found.` against `InstanceConfigure.razor.cs` line ~547 per the protocol).
**Steps 1–3:** write each test (same shape as Task 3: skip-gate, authenticated page, act, toast assert, CLI read-back), run the filtered tests, commit:
`git add -A && git commit -m "test(e2e): InstanceConfigure attribute-override + area reassignment + not-found edge"`
**Note — alarm overrides deferred:** the Alarm Overrides subsystem renders rows only when the template defines an unlocked alarm, and template alarms are not CLI-provisionable. Alarm-override UI coverage is therefore **deferred to a later wave** (requires a template-with-alarm fixture path). Add a `// TODO(wave-N): alarm-override UI coverage — needs template-with-alarm fixture (not CLI-provisionable today)` comment at the bottom of the file so the gap is tracked in-code.
**Acceptance:** three tests pass/skip; overrides + area verified by read-back; not-found asserts the real surfaced message.
---
## Task 5: ApiSurfaceFixture (inbound api-method for the API-key form)
**Classification:** small
**Estimated implement time:** ~3 min
**Parallelizable with:** Task 1, Task 2
**Files:**
- Create: `tests/ZB.MOM.WW.ScadaBridge.CentralUI.PlaywrightTests/Admin/ApiSurfaceFixture.cs`
**Depends on:** Task 0.
**Context:** The API-key form renders one checkbox per inbound API method (`id="method-access-{ApiMethod.Id}"`). Provision one `zztest` api-method so a checkbox exists; expose its `Id` so tests can target `#method-access-{Id}` precisely.
```csharp
using ZB.MOM.WW.ScadaBridge.CentralUI.PlaywrightTests.Cluster;
namespace ZB.MOM.WW.ScadaBridge.CentralUI.PlaywrightTests.Admin;
/// <summary>
/// Provisions a single inbound API method so the API-key form renders at least one method checkbox
/// (<c>id="method-access-{MethodId}"</c>). Created API keys are deleted per-test; this fixture owns
/// only the method.
/// </summary>
public sealed class ApiSurfaceFixture : IAsyncLifetime
{
    /// <summary>Id of the provisioned API method; stays 0 until creation succeeds.</summary>
    public int MethodId { get; private set; }

    /// <summary>Unique name of the provisioned API method.</summary>
    public string MethodName { get; private set; } = string.Empty;

    /// <summary>
    /// True when the cluster was reachable and the method was provisioned.
    /// Tests skip themselves when this is false.
    /// </summary>
    public bool Available { get; private set; }

    /// <summary>
    /// Provisions one inbound API method. Returns without provisioning when the cluster is
    /// unavailable; rethrows the original failure if provisioning fails.
    /// </summary>
    public async Task InitializeAsync()
    {
        Available = await ClusterAvailability.IsAvailableAsync();
        if (!Available)
        {
            return;
        }

        try
        {
            MethodName = CliRunner.UniqueName("method");
            MethodId = await CliRunner.CreateApiMethodAsync(MethodName);
        }
        catch
        {
            Available = false;

            // Partial-init guard: when creation itself failed, MethodId is still 0 and there is
            // nothing to delete. Deleting id 0 could throw and mask the real failure.
            if (MethodId > 0)
            {
                try
                {
                    await CliRunner.DeleteApiMethodAsync(MethodId);
                }
                catch
                {
                    // Best-effort rollback: the original exception below is the one that matters.
                }
            }

            throw;
        }
    }

    /// <summary>Deletes the provisioned method; a no-op when the fixture never became available.</summary>
    public async Task DisposeAsync()
    {
        if (Available && MethodId > 0)
        {
            await CliRunner.DeleteApiMethodAsync(MethodId);
        }
    }
}
```
**Commit:** `git add -A && git commit -m "test(e2e): add ApiSurfaceFixture (inbound api-method for API-key form checkboxes)"`
**Acceptance:** compiles; `MethodId > 0`; disposed cleanly.
---
## Task 6: ApiKeyCrudTests — create→token reveal + validation edges
**Classification:** standard
**Estimated implement time:** ~5 min
**Parallelizable with:** Task 3, Task 9 (different files)
**Files:**
- Create: `tests/ZB.MOM.WW.ScadaBridge.CentralUI.PlaywrightTests/Admin/ApiKeyCrudTests.cs`
**Depends on:** Task 0, Task 5.
**Selectors (verified):** Name input = the single `input[type=text].form-control.form-control-sm`; method checkbox = `#method-access-{_api.MethodId}`; Save = button text "Save"; created-token panel = `[data-test='created-token']`; inline validation = `div.text-danger.small.mt-2` (messages: `Name is required.`, `Select at least one API method for this key.`).
**Tests:**
1. **Create→token reveal** (mutates; teardown via CLI):
```csharp
/// <summary>
/// Creates an API key through the UI and checks that the one-time token panel and its Copy
/// button appear. The key is removed by name afterwards, whether or not the assertions passed.
/// </summary>
[SkippableFact]
public async Task CreateApiKey_RevealsOneTimeToken()
{
    Skip.IfNot(_api.Available, ClusterAvailability.SkipReason);

    var page = await _fixture.NewAuthenticatedPageAsync("multi-role", "password");
    var keyName = CliRunner.UniqueName("apikey");

    try
    {
        await page.GotoAsync($"{_fixture.BaseUrl}/admin/api-keys/create");

        // Fill in the form: key name plus the fixture's method checkbox.
        var nameInput = page.Locator("input[type='text'].form-control-sm").First;
        await nameInput.FillAsync(keyName);

        var methodCheckbox = page.Locator($"#method-access-{_api.MethodId}");
        await methodCheckbox.CheckAsync();

        var saveButton = page.GetByRole(AriaRole.Button, new() { Name = "Save" });
        await saveButton.ClickAsync();

        // A successful save reveals the token panel together with a Copy button.
        var tokenPanel = page.Locator("[data-test='created-token']");
        await Assertions.Expect(tokenPanel).ToBeVisibleAsync(new() { Timeout = 10_000 });

        var copyButton = page.GetByRole(AriaRole.Button, new() { Name = "Copy" });
        await Assertions.Expect(copyButton).ToBeVisibleAsync();
    }
    finally
    {
        // The UI does not hand back the key id, so resolve it by name before deleting.
        var keyId = await CliRunner.ResolveApiKeyIdByNameAsync(keyName);
        if (keyId is not null)
        {
            await CliRunner.DeleteApiKeyAsync(keyId);
        }
    }
}
```
2. **Empty name → validation** — leave name blank, check the method, Save → assert `div.text-danger.small` visible containing `Name is required.`; no token panel. (No teardown needed — nothing created.)
3. **No methods → validation** — fill name, leave all methods unchecked, Save → assert validation contains `Select at least one API method for this key.`; no token panel.
**Step: run** `dotnet test --filter "FullyQualifiedName~ApiKeyCrudTests"`, then **commit**: `git add -A && git commit -m "test(e2e): API-key create→token reveal + name/method validation edges"`
**Acceptance:** create reveals the token; both validation paths assert the real messages; the created key is deleted by name in `finally` (verified by Task 10 residue check).
---
## Task 7: ApiKeyCrudTests — enable/disable + delete-with-confirm
**Classification:** standard
**Estimated implement time:** ~5 min
**Parallelizable with:** none (same file as Task 6 → serial after it)
**Files:**
- Modify: `tests/ZB.MOM.WW.ScadaBridge.CentralUI.PlaywrightTests/Admin/ApiKeyCrudTests.cs`
**Depends on:** Task 6.
**Context:** Pre-create the key via `CliRunner.CreateApiKeyAsync(name, methods)` (added in Task 0's review fix — it runs `security api-key create` via `RunAsync` because that command prints prose, not JSON, and resolves the new `keyId` by name) so the list has a row to act on, then drive the list-page actions. Pass `methods = _api.MethodName`. Teardown via `CliRunner.DeleteApiKeyAsync(keyId)` in `finally`.
**Tests:**
1. **Enable/Disable** — on `/admin/api-keys`, open the row kebab `button[aria-label="More actions for {name}"]` → click `Disable` → assert one toast and the `Disabled` badge appears on the row; re-open kebab → `Enable` → toast, badge gone.
2. **Delete-with-confirm** — kebab → `Delete` (`.dropdown-item.text-danger`) → confirm modal title `Delete API Key`, click `.modal-footer .btn-danger` (text `Delete`) → assert the row for that name is gone (`ToHaveCountAsync(0)`).
Locate a key's row by name via `page.Locator("tr:has(td:text-is(\"<name>\"))")`. Teardown: best-effort `DeleteApiKeyAsync` in `finally` (the delete test removes it; the enable/disable test must clean up its own key).
**Step: run + commit** — `git add -A && git commit -m "test(e2e): API-key enable/disable toast + delete-with-confirm removes row"`
**Acceptance:** enable/disable + delete drive real mutations with toast/row assertions; no residual keys.
---
## Task 8: TransportExportTests — export wizard happy path
**Classification:** standard
**Estimated implement time:** ~5 min
**Parallelizable with:** Task 3, Task 6, Task 9
**Files:**
- Create: `tests/ZB.MOM.WW.ScadaBridge.CentralUI.PlaywrightTests/Transport/TransportExportTests.cs`
**Depends on:** Task 0 (uses `CreateTemplateAsync`/`AddAttributeAsync`/`DeleteTemplateAsync`, already present).
**Context:** Route `/design/transport/export` (RequireDesign — multi-role qualifies). Wizard: Step 1 Select (pick the template), Step 2 Review (Next), Step 3 Encrypt (`#passphrase` + `#passphrase-confirm`, Export enabled when matching & ≥8 chars), Step 4 Download (`[data-testid='download-summary']` "Bundle ready. Your browser is downloading the file."). The download itself is a JS-interop blob, **not** a DOM `<a download>` — so assert the `download-summary` DOM (proof the export succeeded server-side) rather than capturing the file, to avoid a hang on `WaitForDownload`.
**Test:**
```csharp
/// <summary>
/// Drives the Transport Export wizard (Select → Review → Encrypt → Download) for a freshly
/// created template and asserts the download-summary panel appears. The template is deleted
/// afterwards regardless of outcome.
/// </summary>
[SkippableFact]
public async Task ExportTemplate_ReachesDownloadSummary()
{
    Skip.IfNot(await ClusterAvailability.IsAvailableAsync(), ClusterAvailability.SkipReason);

    var tmplName = CliRunner.UniqueName("exptmpl");
    var tmplId = await CliRunner.CreateTemplateAsync(tmplName);
    try
    {
        // Inside the try so that a failure here still deletes the template just created.
        await CliRunner.AddAttributeAsync(tmplId, "Value", "Double");

        var page = await _fixture.NewAuthenticatedPageAsync("multi-role", "password");
        await page.GotoAsync($"{_fixture.BaseUrl}/design/transport/export");
        await page.WaitForLoadStateAsync(LoadState.NetworkIdle);

        // Step 1 — narrow to the zztest template and select it.
        await page.Locator("#export-filter").FillAsync(tmplName);
        await page.Locator($"[data-testid='group-templates'] label:has-text('{tmplName}')").ClickAsync();
        await page.GetByRole(AriaRole.Button, new() { Name = "Next" }).ClickAsync();

        // Step 2 — Review.
        await page.GetByRole(AriaRole.Button, new() { Name = "Next" }).ClickAsync();

        // Step 3 — Encrypt.
        await page.Locator("#passphrase").FillAsync("zztest-passphrase-123");
        await page.Locator("#passphrase-confirm").FillAsync("zztest-passphrase-123");
        await page.GetByRole(AriaRole.Button, new() { Name = "Export" }).ClickAsync();

        // Step 4 — success. Assert on the summary panel rather than capturing a download.
        var downloadSummary = page.Locator("[data-testid='download-summary']");
        await Assertions.Expect(downloadSummary).ToBeVisibleAsync(new() { Timeout = 20_000 });
        await Assertions.Expect(downloadSummary).ToContainTextAsync("Bundle ready");
    }
    finally
    {
        await CliRunner.DeleteTemplateAsync(tmplId);
    }
}
```
**Notes (do first):** confirm the template checkbox interaction — clicking the `label` toggles the tree checkbox; if the label click doesn't check it, target the adjacent `input[type=checkbox]`. Confirm the "Export" button label text and that it enables after both passphrases match. If `download-summary` doesn't appear because the JS download needs a real browser download path, wrap the Export click in `page.RunAndWaitForDownloadAsync(...)` and assert the download's `SuggestedFilename` ends with `.scadabundle` instead.
**Step: run + commit** — `git add -A && git commit -m "test(e2e): Transport Export wizard reaches download summary for a zztest template"`
**Acceptance:** export drives the full wizard to the success state; template cleaned up.
---
## Task 9: Wrong-passphrase import negative test
**Classification:** standard
**Estimated implement time:** ~4 min
**Parallelizable with:** Task 8 (different file)
**Files:**
- Modify: `tests/ZB.MOM.WW.ScadaBridge.CentralUI.PlaywrightTests/Transport/TransportImportTests.cs`
**Depends on:** Task 0 (existing helpers only).
**Context:** Reuse the existing import scaffolding. Export a real encrypted bundle via `CliRunner.BundleExportAsync(path, tmplId, correctPass, env)`, upload it, then submit a **wrong** passphrase at Step 2. Verified failure behavior (`TransportImport.razor.cs` `SubmitPassphraseAsync`): `_errorMessage = "Wrong passphrase. Please try again."`, the `#import-passphrase` input stays visible, and `[data-testid='diff-summary']` does not appear. Error element: `[data-testid='error-message']`. Secondary: `[data-testid='unlock-attempts']` → "Failed unlock attempts: 1 of …".
**Test:**
```csharp
/// <summary>
/// Exports an encrypted bundle via the CLI, uploads it in the import wizard and submits a wrong
/// passphrase. Asserts the error message is shown, the passphrase input stays visible, and no
/// diff summary appears. Templates matching the test's name prefix and the temporary bundle file
/// are removed afterwards.
/// </summary>
[SkippableFact]
public async Task ImportWithWrongPassphrase_ShowsErrorAndStaysOnPassphraseStep()
{
    Skip.IfNot(await ClusterAvailability.IsAvailableAsync(), ClusterAvailability.SkipReason);

    var tmplName = CliRunner.UniqueName("wrongpass");
    var tmplId = await CliRunner.CreateTemplateAsync(tmplName);
    var bundlePath = Path.Combine(Path.GetTempPath(), tmplName + ".scadabundle");
    try
    {
        // Inside the try so that a failure here still reaches the prefix-based cleanup below.
        await CliRunner.AddAttributeAsync(tmplId, "Value", "Double");
        await CliRunner.BundleExportAsync(bundlePath, tmplId, "correct-passphrase-1", "src-env");

        var page = await _fixture.NewAuthenticatedPageAsync("multi-role", "password");
        await page.GotoAsync($"{_fixture.BaseUrl}/design/transport/import");
        await page.WaitForLoadStateAsync(LoadState.NetworkIdle);

        // Step 1 — upload the encrypted bundle.
        await page.Locator("#bundle-input").SetInputFilesAsync(bundlePath);
        await Assertions.Expect(page.Locator("[data-testid='encrypted-bundle-notice']"))
            .ToBeVisibleAsync(new() { Timeout = 15_000 });

        // Step 2 — submit a passphrase that does not match the one used for export.
        await page.Locator("#import-passphrase").FillAsync("WRONG-passphrase-xyz");
        await page.Locator("button.btn-primary:has-text('Unlock')").ClickAsync();

        // The wizard must report the error, stay on the passphrase step, and show no diff.
        await Assertions.Expect(page.Locator("[data-testid='error-message']"))
            .ToContainTextAsync("Wrong passphrase. Please try again.", new() { Timeout = 10_000 });
        await Assertions.Expect(page.Locator("#import-passphrase")).ToBeVisibleAsync();
        await Assertions.Expect(page.Locator("[data-testid='diff-summary']")).ToBeHiddenAsync();
    }
    finally
    {
        // Delete by name prefix so the source template is caught by name as well as by id.
        foreach (var id in await CliRunner.ListTemplateIdsByNamePrefixAsync(tmplName))
        {
            await CliRunner.DeleteTemplateAsync(id);
        }

        // Best-effort temp-file removal; a leftover bundle in the temp dir is harmless.
        try { File.Delete(bundlePath); } catch { }
    }
}
```
Note: the source template is NOT deleted before import here (we never reach the diff/apply step), so teardown deletes by name prefix to catch it.
**Step: run + commit** — `git add -A && git commit -m "test(e2e): Transport import wrong-passphrase shows error and stays on passphrase step"`
**Acceptance:** asserts the real error message, that the wizard stays on Step 2, and that no diff appears; bundle + template cleaned up.
---
## Task 10: Wave 1 verification — full suite green + zero residue + clean build
**Classification:** standard
**Estimated implement time:** ~4 min
**Parallelizable with:** none (final gate)
**Depends on:** Tasks 0–9.
**Steps:**
1. `dotnet build` the test project — expect 0 warnings/0 errors (`TreatWarningsAsErrors=true`).
2. Run the **full** suite: `dotnet test`. Expect **0 failed**; new Wave-1 tests pass against the live cluster; any cluster-down skips are logged by `SkipSummaryReporter`. Record the pass/skip/fail tally.
3. **Residue check** (cluster up) — confirm zero `zztest-*` leftovers:
- `dotnet scadabridge.dll … --format json template list` → no `zztest-` names.
- `… instance list --site-id <site-a>` → no `zztest-inst`/`zztest-cfginst` names.
- `… data-connection list --site-id <site-a>` → no `zztest-conn` names.
- `… security api-key list` → no `zztest-apikey` names.
- `… api-method list` → no `zztest-method` names.
- `… site area list`/topology → no `zztest-cfgarea` names.
Any leftover → fix the owning test's teardown before closing the wave.
4. Confirm the InstanceConfigure `data-test` additions did not change rendered behavior (heading/sections unchanged) — spot-check by loading the page.
**Commit (if any residue/teardown fixes were needed):** `git add -A && git commit -m "test(e2e): Wave 1 verification fixes (teardown/residue)"`
**Acceptance:** full suite 0 failed with skips logged; zero `zztest-*` residue across all entity types; build clean. Wave 1 is shippable.
---
## Execution notes
- **Parallel dispatch:** after Task 0, Tasks 1 / 2 / 5 are independent (disjoint files) and can run concurrently. Tasks 3, 6, 8, 9 are independent test files and can run concurrently once their deps land. Tasks 4 and 7 are serial after 3 and 6 respectively (same file). Task 10 is the final gate.
- **Cluster required:** all functional tests are `SkippableFact` — run against the live 8-node docker cluster for real coverage. If the cluster is down they skip-and-log (suite still green), but Wave 1 isn't "verified" until a green run against the live cluster with the residue check passing.
- **Waves 2–4** are planned separately after this wave ships, per the design doc.
@@ -0,0 +1,19 @@
{
"planPath": "docs/plans/2026-06-06-playwright-coverage-fill-wave1.md",
"lastUpdated": "2026-06-06T00:00:00Z",
"nativeTaskIdBase": 79,
"status": "completed",
"tasks": [
{"id": 0, "nativeId": 79, "subject": "Task 0: CLI helper extensions", "status": "completed"},
{"id": 1, "nativeId": 80, "subject": "Task 1: InstanceConfigureFixture", "status": "completed", "blockedBy": [0]},
{"id": 2, "nativeId": 81, "subject": "Task 2: data-test hooks on InstanceConfigure.razor", "status": "completed"},
{"id": 3, "nativeId": 82, "subject": "Task 3: InstanceConfigureTests bindings round-trip", "status": "completed", "blockedBy": [0, 1, 2]},
{"id": 4, "nativeId": 83, "subject": "Task 4: InstanceConfigureTests override + area + not-found", "status": "completed", "blockedBy": [3]},
{"id": 5, "nativeId": 84, "subject": "Task 5: ApiSurfaceFixture", "status": "completed", "blockedBy": [0]},
{"id": 6, "nativeId": 85, "subject": "Task 6: ApiKeyCrudTests create + validation", "status": "completed", "blockedBy": [0, 5]},
{"id": 7, "nativeId": 86, "subject": "Task 7: ApiKeyCrudTests enable/disable + delete", "status": "completed", "blockedBy": [6]},
{"id": 8, "nativeId": 87, "subject": "Task 8: TransportExportTests happy path", "status": "completed", "blockedBy": [0]},
{"id": 9, "nativeId": 88, "subject": "Task 9: Wrong-passphrase import negative", "status": "completed", "blockedBy": [0]},
{"id": 10, "nativeId": 89, "subject": "Task 10: Wave 1 verification + residue check", "status": "completed", "blockedBy": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]}
]
}
+5 -5
View File
@@ -8,7 +8,7 @@ This document describes the local Docker-based test infrastructure for ScadaBrid
|---------|-------|---------|--------|-------------|
| OPC UA Server | `mcr.microsoft.com/iotedge/opc-plc:latest` | 50000 (OPC UA), 8080 (web) | `infra/opcua/nodes.json` | `infra/` |
| OPC UA Server 2 | `mcr.microsoft.com/iotedge/opc-plc:latest` | 50010 (OPC UA), 8081 (web) | `infra/opcua/nodes.json` | `infra/` |
| LDAP Server | `glauth/glauth:latest` | 3893 | `infra/glauth/config.toml` | `infra/` |
| LDAP Server | `glauth/glauth:latest` | 3893 | `scadaproj/infra/glauth/config.toml` | **Shared** — `zb-shared-glauth` on `10.100.0.35`; NOT started by `infra/` stack (retired 2026-06-04). See `scadaproj/infra/glauth/`. |
| MS SQL 2022 | `mcr.microsoft.com/mssql/server:2022-latest` | 1433 | `infra/mssql/setup.sql` | `infra/` |
| SMTP (Mailpit) | `axllent/mailpit:latest` | 1025 (SMTP), 8025 (web) | Environment vars | `infra/` |
| REST API (Flask) | Custom build (`infra/restapi/Dockerfile`) | 5200 | `infra/restapi/app.py` | `infra/` |
@@ -65,9 +65,9 @@ For use in `appsettings.Development.json`:
"ScadaBridgeMachineData": "Server=localhost,1433;Database=ScadaBridgeMachineData;User Id=scadabridge_app;Password=ScadaBridge_Dev1#;TrustServerCertificate=true"
},
"Ldap": {
"Server": "localhost",
"Server": "10.100.0.35",
"Port": 3893,
"BaseDN": "dc=scadabridge,dc=local",
"BaseDN": "dc=zb,dc=local",
"UseSsl": false
},
"OpcUa": {
@@ -98,7 +98,7 @@ For use in `appsettings.Development.json`:
```bash
cd infra
docker compose down # stop containers, preserve SQL data volume
docker compose stop opcua # stop a single service (also: opcua2, ldap, mssql, smtp, restapi)
docker compose stop opcua # stop a single service (also: opcua2, mssql, smtp, restapi) # note: ldap is no longer a local service
```
**Full teardown** (removes volumes, optionally images and venv):
@@ -117,7 +117,7 @@ After a full teardown, the next `docker compose up -d` starts fresh — re-run t
infra/
docker-compose.yml # All seven services
teardown.sh # Teardown script (volumes, images, venv)
glauth/config.toml # LDAP users and groups
glauth/config.toml # LDAP users and groups — HISTORICAL (retired 2026-06-04); live config is scadaproj/infra/glauth/config.toml
mssql/setup.sql # Database and user creation
mssql/machinedata_seed.sql # Machine Data tables, stored procedures, sample data
opcua/nodes.json # Custom OPC UA tag definitions
+29 -19
View File
@@ -1,5 +1,11 @@
# Test Infrastructure: LDAP Server
> **SUPERSEDED 2026-06-04** — ScadaBridge no longer runs its own glauth. Dev/test LDAP is now the
> shared GLAuth on **10.100.0.35:3893** (dc=zb,dc=local); source of truth and runbook:
> **`scadaproj/infra/glauth/`** (`~/Desktop/scadaproj/infra/glauth/config.toml`). The
> `scadabridge-ldap` container has been retired (commented out in `infra/docker-compose.yml`).
> The content below describes the retired local setup, kept for reference/rollback.
## Overview
The test LDAP server uses [GLAuth](https://glauth.github.io/), a lightweight LDAP server backed by a TOML config file. It provides test users and groups that map to ScadaBridge's role-based authorization model.
@@ -8,11 +14,12 @@ The test LDAP server uses [GLAuth](https://glauth.github.io/), a lightweight LDA
- **Image**: `glauth/glauth:latest`
- **LDAP port**: 3893 (plain LDAP, no TLS — dev only)
- **Host (shared)**: `10.100.0.35` — the shared `zb-shared-glauth` container on the Linux docker host (replaces `localhost` below)
## Base DN
```
dc=scadabridge,dc=local
dc=zb,dc=local
```
## Test Users
@@ -41,46 +48,49 @@ All users have the password `password`.
Users bind with their full DN, which includes the primary group as an OU:
```
cn=<username>,ou=<PrimaryGroupName>,ou=users,dc=scadabridge,dc=local
cn=<username>,ou=<PrimaryGroupName>,ou=users,dc=zb,dc=local
```
For example: `cn=admin,ou=SCADA-Admins,ou=users,dc=scadabridge,dc=local`
For example: `cn=admin,ou=SCADA-Admins,ou=users,dc=zb,dc=local`
The full DNs for all test users:
| Username | Full DN |
|----------|---------|
| `admin` | `cn=admin,ou=SCADA-Admins,ou=users,dc=scadabridge,dc=local` |
| `designer` | `cn=designer,ou=SCADA-Designers,ou=users,dc=scadabridge,dc=local` |
| `deployer` | `cn=deployer,ou=SCADA-Deploy-All,ou=users,dc=scadabridge,dc=local` |
| `site-deployer` | `cn=site-deployer,ou=SCADA-Deploy-SiteA,ou=users,dc=scadabridge,dc=local` |
| `multi-role` | `cn=multi-role,ou=SCADA-Admins,ou=users,dc=scadabridge,dc=local` |
| `admin` | `cn=admin,ou=SCADA-Admins,ou=users,dc=zb,dc=local` |
| `designer` | `cn=designer,ou=SCADA-Designers,ou=users,dc=zb,dc=local` |
| `deployer` | `cn=deployer,ou=SCADA-Deploy-All,ou=users,dc=zb,dc=local` |
| `site-deployer` | `cn=site-deployer,ou=SCADA-Deploy-SiteA,ou=users,dc=zb,dc=local` |
| `multi-role` | `cn=multi-role,ou=SCADA-Admins,ou=users,dc=zb,dc=local` |
## Verification
1. Check the container is running:
1. Check the shared container is running (on the docker host):
```bash
docker ps --filter name=scadabridge-ldap
# The container now runs on 10.100.0.35 as zb-shared-glauth, not locally.
# To verify from the docker host:
# docker ps --filter name=zb-shared-glauth
# Formerly: docker ps --filter name=scadabridge-ldap (retired)
```
2. Test a user bind with `ldapsearch`:
2. Test a user bind with `ldapsearch` against the shared host:
```bash
ldapsearch -H ldap://localhost:3893 \
-D "cn=admin,ou=SCADA-Admins,ou=users,dc=scadabridge,dc=local" \
ldapsearch -H ldap://10.100.0.35:3893 \
-D "cn=admin,ou=SCADA-Admins,ou=users,dc=zb,dc=local" \
-w password \
-b "dc=scadabridge,dc=local" \
-b "dc=zb,dc=local" \
"(objectClass=*)"
```
3. Search for group membership:
```bash
ldapsearch -H ldap://localhost:3893 \
-D "cn=admin,ou=SCADA-Admins,ou=users,dc=scadabridge,dc=local" \
ldapsearch -H ldap://10.100.0.35:3893 \
-D "cn=admin,ou=SCADA-Admins,ou=users,dc=zb,dc=local" \
-w password \
-b "dc=scadabridge,dc=local" \
-b "dc=zb,dc=local" \
"(cn=multi-role)"
```
@@ -112,7 +122,7 @@ python infra/tools/ldap_tool.py groups
python infra/tools/ldap_tool.py search --filter "(cn=multi-role)"
```
Use `--host` and `--port` to override defaults (localhost:3893). Run with `--help` for full usage.
Use `--host 10.100.0.35 --port 3893` to point at the shared server. Run with `--help` for full usage.
## Relevance to ScadaBridge Components
@@ -122,6 +132,6 @@ Use `--host` and `--port` to override defaults (localhost:3893). Run with `--hel
## Notes
- GLAuth uses plain LDAP on port 3893. ScadaBridge's Security & Auth component requires LDAPS/StartTLS in production. For dev testing, configure the LDAP client to allow plaintext connections.
- To add users or groups, edit `infra/glauth/config.toml` locally and restart the container: `docker compose restart ldap`. Note that the file is named `config.toml` on the host but is mounted into the container as `/app/config/config.cfg` (the path GLAuth expects).
- To add users or groups, edit **`scadaproj/infra/glauth/config.toml`** (the shared source of truth at `~/Desktop/scadaproj/infra/glauth/config.toml`) and restart the `zb-shared-glauth` container on the docker host. **Do not edit the retired `ScadaBridge/infra/glauth/config.toml`** — that file is historical only. The config is mounted into the container as `/app/config/config.cfg` (the path GLAuth expects).
- The `admin` user is configured with `[[users.capabilities]]` (`action = "search"`, `object = "*"`) in the GLAuth config. This grants the admin account permission to perform LDAP search operations, which is required for user/group lookups.
- Anonymous bind is not allowed. All LDAP operations (including searches) require an authenticated bind. Use the `admin` account for search operations.
+2 -2
View File
@@ -14,7 +14,7 @@ This starts the following services:
|---------|------|---------|
| OPC UA (Azure IoT OPC PLC) | 50000 (OPC UA), 8080 (web) | Simulated OPC UA server with ScadaBridge-style tags |
| OPC UA 2 (Azure IoT OPC PLC) | 50010 (OPC UA), 8081 (web) | Second OPC UA server instance (same tags, independent state) |
| LDAP (GLAuth) | 3893 | Lightweight LDAP with test users/groups matching ScadaBridge roles |
| ~~LDAP (GLAuth)~~ | ~~3893~~ | **RETIRED (2026-06-04)** — no longer started by this stack. Dev/test LDAP is the shared GLAuth on `10.100.0.35:3893` (dc=zb,dc=local). Central nodes bind `Ldap:Server=10.100.0.35`. Source of truth + config: `scadaproj/infra/glauth/`. |
| MS SQL 2022 | 1433 | Configuration and machine data databases |
| SMTP (Mailpit) | 1025 (SMTP), 8025 (web) | Email capture for notification testing |
| REST API (Flask) | 5200 | External REST API for Gateway and Inbound API testing |
@@ -56,7 +56,7 @@ docker compose down
**Stop a single service** (leave the others running):
```bash
docker compose stop opcua # or: opcua2, ldap, mssql, smtp, restapi
docker compose stop opcua # or: opcua2, mssql, smtp, restapi (ldap is no longer a local service)
docker compose start opcua # bring it back without recreating
```
+15 -10
View File
@@ -41,16 +41,21 @@ services:
- scadabridge-net
restart: unless-stopped
ldap:
image: glauth/glauth:latest
container_name: scadabridge-ldap
ports:
- "3893:3893"
volumes:
- ./glauth/config.toml:/app/config/config.cfg:ro
networks:
- scadabridge-net
restart: unless-stopped
# RETIRED 2026-06-04: superseded by the shared dev GLAuth on 10.100.0.35:3893
# (scadaproj/infra/glauth/). The central nodes now bind there (see
# docker/ + docker-env2 central-node appsettings: Ldap:Server=10.100.0.35).
# Kept here, commented, for rollback — uncomment + `docker compose up -d ldap`
# and revert the central-node Server back to "scadabridge-ldap".
# ldap:
# image: glauth/glauth:latest
# container_name: scadabridge-ldap
# ports:
# - "3893:3893"
# volumes:
# - ./glauth/config.toml:/app/config/config.cfg:ro
# networks:
# - scadabridge-net
# restart: unless-stopped
mssql:
image: mcr.microsoft.com/mssql/server:2022-latest
+8 -2
View File
@@ -7,7 +7,7 @@
[backend]
datastore = "config"
baseDN = "dc=scadabridge,dc=local"
baseDN = "dc=zb,dc=local"
# ── Groups ──────────────────────────────────────────────────────────
@@ -27,6 +27,10 @@
name = "SCADA-Deploy-SiteA"
gidnumber = 5504
[[groups]]
name = "SCADA-Viewers"
gidnumber = 5505
# ── Users ───────────────────────────────────────────────────────────
# All test passwords: "password"
# SHA256 of "password": 5e884898da28047151d0e56f8dc6292773603d0d6aabbdd62a11ef721d1542d8
@@ -77,5 +81,7 @@
mail = "multi-role@scadabridge.local"
uidnumber = 5005
primarygroup = 5501
othergroups = [5502, 5503]
# Member of every role group so it resolves to all four ScadaBridge roles:
# 5501 Admins→Administrator, 5502 Designers→Designer, 5503/5504 Deploy→Deployer, 5505 Viewers→Viewer
othergroups = [5502, 5503, 5504, 5505]
passsha256 = "5e884898da28047151d0e56f8dc6292773603d0d6aabbdd62a11ef721d1542d8"
+3 -3
View File
@@ -9,10 +9,10 @@ from ldap3 import Server, Connection, NONE, SUBTREE, SIMPLE
DEFAULT_HOST = "localhost"
DEFAULT_PORT = 3893
DEFAULT_BASE_DN = "dc=scadabridge,dc=local"
DEFAULT_BASE_DN = "dc=zb,dc=local"
# GLAuth places users under ou=<PrimaryGroupName>,ou=users,dc=...
# The admin user (primarygroup SCADA-Admins) needs search capabilities in config.
DEFAULT_BIND_DN = "cn=admin,ou=SCADA-Admins,ou=users,dc=scadabridge,dc=local"
DEFAULT_BIND_DN = "cn=admin,ou=SCADA-Admins,ou=users,dc=zb,dc=local"
DEFAULT_BIND_PASSWORD = "password"
@@ -48,7 +48,7 @@ def cmd_check(args):
def cmd_bind(args):
"""Test user authentication via bind.
GLAuth DN format: cn=<user>,ou=<PrimaryGroup>,ou=users,dc=scadabridge,dc=local
GLAuth DN format: cn=<user>,ou=<PrimaryGroup>,ou=users,dc=zb,dc=local
Since we don't know the user's primary group upfront, we search for the user first
to discover the full DN, then rebind with that DN.
"""
+7
View File
@@ -18,6 +18,13 @@
<package pattern="ZB.MOM.WW.MxGateway.*" />
<package pattern="ZB.MOM.WW.Health" />
<package pattern="ZB.MOM.WW.Health.*" />
<package pattern="ZB.MOM.WW.Telemetry" />
<package pattern="ZB.MOM.WW.Telemetry.*" />
<package pattern="ZB.MOM.WW.Configuration" />
<package pattern="ZB.MOM.WW.Auth" />
<package pattern="ZB.MOM.WW.Auth.*" />
<package pattern="ZB.MOM.WW.Audit" />
<package pattern="ZB.MOM.WW.Theme" />
</packageSource>
</packageSourceMapping>
<!--
@@ -1,10 +1,11 @@
using Akka.Actor;
using Microsoft.Extensions.DependencyInjection;
using Microsoft.Extensions.Logging;
using ZB.MOM.WW.ScadaBridge.AuditLog.Payload;
using ZB.MOM.WW.ScadaBridge.Commons.Entities.Audit;
using ZB.MOM.WW.Audit;
using ZB.MOM.WW.ScadaBridge.AuditLog.Redaction;
using ZB.MOM.WW.ScadaBridge.Commons.Interfaces.Repositories;
using ZB.MOM.WW.ScadaBridge.Commons.Messages.Audit;
using ZB.MOM.WW.ScadaBridge.Commons.Types.Audit;
using ZB.MOM.WW.ScadaBridge.ConfigurationDatabase;
namespace ZB.MOM.WW.ScadaBridge.AuditLog.Central;
@@ -13,7 +14,7 @@ namespace ZB.MOM.WW.ScadaBridge.AuditLog.Central;
/// Central-side singleton (per Bundle E wiring) that ingests batches of
/// <see cref="AuditEvent"/> rows pushed from sites via the
/// <c>IngestAuditEvents</c> gRPC RPC. Each row is stamped with the central-side
/// <see cref="AuditEvent.IngestedAtUtc"/> and inserted idempotently via
/// IngestedAtUtc (in DetailsJson) and inserted idempotently via
/// <see cref="IAuditLogRepository.InsertIfNotExistsAsync"/> — duplicates are
/// silently swallowed (first-write-wins per Bundle A's hardening).
/// </summary>
@@ -116,10 +117,10 @@ public class AuditLogIngestActor : ReceiveActor
// Resolve the repository for the whole batch — one DbContext per
// message, mirroring NotificationOutboxActor. The injected-repository
// mode (Bundle D tests) skips the scope entirely.
// Bundle C (M5-T6): the IAuditPayloadFilter is also resolved from the
// Bundle C (M5-T6): the IAuditRedactor is also resolved from the
// per-message scope when one is available so the row is truncated +
// redacted before InsertIfNotExistsAsync. The single-repository test
// ctor has no service provider — it falls through with no filter,
// ctor has no service provider — it falls through with no redactor,
// which preserves the small-payload assumptions baked into the
// existing D2 fixtures.
// AuditLog-003: use CreateAsyncScope + await using so scoped EF Core
@@ -127,19 +128,19 @@ public class AuditLogIngestActor : ReceiveActor
// without blocking on sync Dispose() of pending connection cleanup.
if (_injectedRepository is not null)
{
await IngestWithRepositoryAsync(_injectedRepository, filter: null, failureCounter: null, cmd, nowUtc, accepted)
await IngestWithRepositoryAsync(_injectedRepository, redactor: null, failureCounter: null, cmd, nowUtc, accepted)
.ConfigureAwait(false);
}
else
{
await using var scope = _serviceProvider!.CreateAsyncScope();
var repository = scope.ServiceProvider.GetRequiredService<IAuditLogRepository>();
var filter = scope.ServiceProvider.GetService<IAuditPayloadFilter>();
var redactor = scope.ServiceProvider.GetService<IAuditRedactor>();
// M6 Bundle E (T8): central health counter is best-effort —
// unregistered (test composition roots) means the per-row catch
// simply logs without surfacing on the health dashboard.
var failureCounter = scope.ServiceProvider.GetService<ICentralAuditWriteFailureCounter>();
await IngestWithRepositoryAsync(repository, filter, failureCounter, cmd, nowUtc, accepted)
await IngestWithRepositoryAsync(repository, redactor, failureCounter, cmd, nowUtc, accepted)
.ConfigureAwait(false);
}
@@ -148,7 +149,7 @@ public class AuditLogIngestActor : ReceiveActor
private async Task IngestWithRepositoryAsync(
IAuditLogRepository repository,
IAuditPayloadFilter? filter,
IAuditRedactor? redactor,
ICentralAuditWriteFailureCounter? failureCounter,
IngestAuditEventsCommand cmd,
DateTime nowUtc,
@@ -162,15 +163,17 @@ public class AuditLogIngestActor : ReceiveActor
// repository hardening already swallows duplicate-key races,
// so the same id arriving twice (site retry, reconciliation)
// is a silent no-op.
// Filter BEFORE the IngestedAtUtc stamp so the redacted
// copy carries the central-side ingest timestamp. Filter
// Redact BEFORE the IngestedAtUtc stamp so the redacted
// copy carries the central-side ingest timestamp. The redactor
// is contract-bound to never throw. AuditLog-008: a null
// filter (test composition root, no IAuditPayloadFilter
// redactor (test composition root, no IAuditRedactor
// registered) now falls back to the SafeDefault rather than
// pass-through, so HTTP header redaction always runs.
var safeFilter = filter ?? Payload.SafeDefaultAuditPayloadFilter.Instance;
var filtered = safeFilter.Apply(evt);
var ingested = filtered with { IngestedAtUtc = nowUtc };
// C3 transitional shim: IngestedAtUtc is a DetailsJson field on
// the canonical record, so stamp it via the projection helper.
var safeRedactor = redactor ?? SafeDefaultAuditRedactor.Instance;
var filtered = safeRedactor.Apply(evt);
var ingested = AuditRowProjection.WithIngestedAtUtc(filtered, nowUtc);
await repository.InsertIfNotExistsAsync(ingested).ConfigureAwait(false);
accepted.Add(evt.EventId);
}
@@ -216,12 +219,12 @@ public class AuditLogIngestActor : ReceiveActor
var auditRepo = scope.ServiceProvider.GetRequiredService<IAuditLogRepository>();
var siteCallRepo = scope.ServiceProvider.GetRequiredService<ISiteCallAuditRepository>();
var dbContext = scope.ServiceProvider.GetRequiredService<ScadaBridgeDbContext>();
// Bundle C (M5-T6): resolve the filter for the whole batch from
// the scope; null = pass-through for test composition roots that
// skip the filter registration. The filter is contract-bound to
// Bundle C (M5-T6): resolve the redactor for the whole batch from
// the scope; null = SafeDefault for test composition roots that
// skip the redactor registration. The redactor is contract-bound to
// never throw, so we can apply it inside the per-entry try
// without risking an unbounded blast radius.
var filter = scope.ServiceProvider.GetService<IAuditPayloadFilter>();
var redactor = scope.ServiceProvider.GetService<IAuditRedactor>();
// M6 Bundle E (T8): same best-effort central health counter as
// the OnIngestAsync path — null on test composition roots that
// skip the registration.
@@ -240,14 +243,16 @@ public class AuditLogIngestActor : ReceiveActor
// matching timestamps (debugging convenience, not a
// correctness invariant).
var ingestedAt = DateTime.UtcNow;
// Filter the audit half BEFORE the dual-write — only the
// AuditLog row's payload columns are filterable; SiteCalls
// Redact the audit half BEFORE the dual-write — only the
// AuditLog row's payload columns are redactable; SiteCalls
// carries operational state only (status, retry count) and
// is left untouched. AuditLog-008: null filter falls back
// is left untouched. AuditLog-008: null redactor falls back
// to SafeDefault so header redaction always runs.
var safeFilter = filter ?? Payload.SafeDefaultAuditPayloadFilter.Instance;
var filteredAudit = safeFilter.Apply(entry.Audit);
var auditStamped = filteredAudit with { IngestedAtUtc = ingestedAt };
// C3 transitional shim: IngestedAtUtc is a DetailsJson field
// on the canonical record, so stamp it via the projection helper.
var safeRedactor = redactor ?? SafeDefaultAuditRedactor.Instance;
var filteredAudit = safeRedactor.Apply(entry.Audit);
var auditStamped = AuditRowProjection.WithIngestedAtUtc(filteredAudit, ingestedAt);
var siteCallStamped = entry.SiteCall with { IngestedAtUtc = ingestedAt };
await auditRepo.InsertIfNotExistsAsync(auditStamped)
@@ -76,7 +76,12 @@ public sealed class AuditLogPartitionMaintenanceService : IHostedService, IDispo
_logger = logger ?? throw new ArgumentNullException(nameof(logger));
}
/// <inheritdoc />
/// <summary>
/// Starts the background maintenance loop, firing an immediate first tick and then
/// repeating every <see cref="AuditLogPartitionMaintenanceOptions.IntervalSeconds"/>.
/// </summary>
/// <param name="ct">Cancellation token provided by the host.</param>
/// <returns>A completed task; the loop runs independently on a background thread.</returns>
public Task StartAsync(CancellationToken ct)
{
// Linked CTS lets StopAsync's cancellation AND the host's shutdown
@@ -136,14 +141,21 @@ public sealed class AuditLogPartitionMaintenanceService : IHostedService, IDispo
}
}
/// <inheritdoc />
/// <summary>
/// Signals the maintenance loop to stop by cancelling its linked token,
/// then returns the loop task so the host can await its completion.
/// </summary>
/// <param name="ct">Cancellation token provided by the host (unused — the internal CTS is cancelled directly).</param>
/// <returns>The background loop task, or a completed task if the loop was never started.</returns>
public Task StopAsync(CancellationToken ct)
{
    // Request cancellation on the internal token source (no-op when the
    // source was never created).
    _cts?.Cancel();

    // Hand the loop task back to the caller when there is one; otherwise
    // there is nothing to wait for.
    if (_loop is { } runningLoop)
    {
        return runningLoop;
    }

    return Task.CompletedTask;
}
/// <inheritdoc />
/// <summary>
/// Disposes the internal <see cref="CancellationTokenSource"/> used to stop the maintenance loop.
/// </summary>
public void Dispose()
{
_cts?.Dispose();
@@ -5,10 +5,10 @@ namespace ZB.MOM.WW.ScadaBridge.AuditLog.Central;
/// <summary>
/// Audit Log (#23) M6 Bundle E (T9) — bridges
/// <see cref="IAuditRedactionFailureCounter"/> (incremented by
/// <see cref="DefaultAuditPayloadFilter"/> every time a header / body / SQL
/// parameter redactor stage throws and the filter has to over-redact the
/// offending field) into <see cref="AuditCentralHealthSnapshot"/> so the
/// failure surfaces on the central health surface as
/// <see cref="ZB.MOM.WW.ScadaBridge.AuditLog.Redaction.ScadaBridgeAuditRedactor"/> every time
/// a header / body / SQL parameter redactor stage throws and the redactor has
/// to over-redact the offending field) into <see cref="AuditCentralHealthSnapshot"/>
/// so the failure surfaces on the central health surface as
/// <c>AuditCentralHealthSnapshot.AuditRedactionFailure</c>.
/// </summary>
/// <remarks>
@@ -1,9 +1,10 @@
using Microsoft.Extensions.DependencyInjection;
using Microsoft.Extensions.Logging;
using ZB.MOM.WW.ScadaBridge.AuditLog.Payload;
using ZB.MOM.WW.ScadaBridge.Commons.Entities.Audit;
using ZB.MOM.WW.Audit;
using ZB.MOM.WW.ScadaBridge.AuditLog.Redaction;
using ZB.MOM.WW.ScadaBridge.Commons.Interfaces.Repositories;
using ZB.MOM.WW.ScadaBridge.Commons.Interfaces.Services;
using ZB.MOM.WW.ScadaBridge.Commons.Types.Audit;
namespace ZB.MOM.WW.ScadaBridge.AuditLog.Central;
@@ -41,7 +42,7 @@ public sealed class CentralAuditWriter : ICentralAuditWriter
{
private readonly IServiceProvider _services;
private readonly ILogger<CentralAuditWriter> _logger;
private readonly IAuditPayloadFilter _filter;
private readonly IAuditRedactor _redactor;
private readonly ICentralAuditWriteFailureCounter _failureCounter;
private readonly INodeIdentityProvider? _nodeIdentity;
@@ -68,24 +69,25 @@ public sealed class CentralAuditWriter : ICentralAuditWriter
/// </summary>
/// <param name="services">Service provider used to open a per-call scope for the scoped repository.</param>
/// <param name="logger">Logger for swallowed write-failure diagnostics.</param>
/// <param name="filter">Optional payload filter for truncation and redaction; defaults to a pass-through.</param>
/// <param name="redactor">Optional canonical redactor for truncation and redaction; defaults to the always-safe default.</param>
/// <param name="failureCounter">Optional counter incremented on swallowed repository failures; defaults to a no-op.</param>
/// <param name="nodeIdentity">Optional node identity provider for stamping <c>SourceNode</c> on central-origin rows.</param>
public CentralAuditWriter(
IServiceProvider services,
ILogger<CentralAuditWriter> logger,
IAuditPayloadFilter? filter = null,
IAuditRedactor? redactor = null,
ICentralAuditWriteFailureCounter? failureCounter = null,
INodeIdentityProvider? nodeIdentity = null)
{
_services = services ?? throw new ArgumentNullException(nameof(services));
_logger = logger ?? throw new ArgumentNullException(nameof(logger));
// AuditLog-008: never default to null — over-redact instead.
// SafeDefaultAuditPayloadFilter applies HTTP header redaction with
// C3 (Task 2.5): wired via the canonical IAuditRedactor seam.
// SafeDefaultAuditRedactor applies HTTP header redaction with
// hard-coded sensitive defaults so a composition root that omits the
// real filter still scrubs Authorization / X-Api-Key / Cookie /
// real redactor still scrubs Authorization / X-Api-Key / Cookie /
// Set-Cookie before persistence.
_filter = filter ?? Payload.SafeDefaultAuditPayloadFilter.Instance;
_redactor = redactor ?? SafeDefaultAuditRedactor.Instance;
_failureCounter = failureCounter ?? new NoOpCentralAuditWriteFailureCounter();
_nodeIdentity = nodeIdentity;
}
@@ -103,12 +105,12 @@ public sealed class CentralAuditWriter : ICentralAuditWriter
try
{
// Filter BEFORE stamping IngestedAtUtc + handing to the repo. The
// filter contract is "never throws". AuditLog-008: _filter is now
// non-null (SafeDefaultAuditPayloadFilter fallback) so header
// Redact BEFORE stamping IngestedAtUtc + handing to the repo. The
// redactor contract is "never throws". AuditLog-008: _redactor is
// now non-null (SafeDefaultAuditRedactor fallback) so header
// redaction always runs even in composition roots that omit the
// real filter.
var filtered = _filter.Apply(evt);
// real redactor.
var filtered = _redactor.Apply(evt);
// SourceNode-stamping (Task 12): caller-provided value wins
// (supports any future direct-write callsite that already has its
@@ -124,7 +126,9 @@ public sealed class CentralAuditWriter : ICentralAuditWriter
await using var scope = _services.CreateAsyncScope();
var repo = scope.ServiceProvider.GetRequiredService<IAuditLogRepository>();
var stamped = filtered with { IngestedAtUtc = DateTime.UtcNow };
// C3 transitional shim: IngestedAtUtc is a DetailsJson field on the
// canonical record, so stamp it via the projection helper.
var stamped = AuditRowProjection.WithIngestedAtUtc(filtered, DateTime.UtcNow);
await repo.InsertIfNotExistsAsync(stamped, ct).ConfigureAwait(false);
}
catch (Exception ex)
@@ -143,17 +147,17 @@ public sealed class CentralAuditWriter : ICentralAuditWriter
// misbehaving custom counter does, swallowing here keeps the
// best-effort contract intact.
}
// Log the input event's identifying fields. These three (EventId,
// Kind, Status) are immutable across the filter+stamp chain — the
// `with` clones above touch only SourceNode and IngestedAtUtc — so
// referencing `evt` here is intentional and equivalent to the
// stamped record for diagnostics. If you add a field here that the
// stamp chain DOES mutate (e.g., SourceNode), reference the latest
// post-stamp record name instead, not `evt`.
// Log the input event's identifying fields. EventId + Action are
// immutable across the redact+stamp chain — the `with` clones above
// touch only SourceNode and DetailsJson — so referencing `evt` here
// is intentional and equivalent to the stamped record for
// diagnostics. Action = "{Channel}.{Kind}" carries the kind; the
// canonical Outcome carries the coarse status (fine-grained Status
// lives in DetailsJson).
_logger.LogWarning(
ex,
"CentralAuditWriter failed for EventId {EventId} (Kind={Kind}, Status={Status})",
evt.EventId, evt.Kind, evt.Status);
"CentralAuditWriter failed for EventId {EventId} (Action={Action}, Outcome={Outcome})",
evt.EventId, evt.Action, evt.Outcome);
}
}
}
@@ -41,6 +41,7 @@ public interface IPullAuditEventsClient
/// <param name="sinceUtc">Only events with an <c>OccurredAtUtc</c> at or after this cursor time are returned.</param>
/// <param name="batchSize">Maximum number of events to return per call.</param>
/// <param name="ct">Cancellation token.</param>
/// <returns>A task that resolves to the next reconciliation batch with a <c>MoreAvailable</c> flag.</returns>
Task<PullAuditEventsResponse> PullAsync(
string siteId,
DateTime sinceUtc,
@@ -23,6 +23,7 @@ public interface ISiteEnumerator
/// — the actor calls this once per tick.
/// </summary>
/// <param name="ct">Cancellation token for the async enumeration.</param>
/// <returns>A task that resolves to the current set of site entries to poll on the next reconciliation tick.</returns>
Task<IReadOnlyList<SiteEntry>> EnumerateAsync(CancellationToken ct = default);
}
@@ -2,8 +2,8 @@ using Akka.Actor;
using Microsoft.Extensions.DependencyInjection;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
using ZB.MOM.WW.ScadaBridge.Commons.Entities.Audit;
using ZB.MOM.WW.ScadaBridge.Commons.Interfaces.Repositories;
using ZB.MOM.WW.ScadaBridge.Commons.Types.Audit;
namespace ZB.MOM.WW.ScadaBridge.AuditLog.Central;
@@ -258,7 +258,9 @@ public class SiteAuditReconciliationActor : ReceiveActor
// concurrent push, or a retry of this very pull) collapse to
// a no-op courtesy of M2 Bundle A's race-fix on
// InsertIfNotExistsAsync.
var ingested = evt with { IngestedAtUtc = nowUtc };
// C3: IngestedAtUtc is a DetailsJson field on the canonical record —
// stamp it via the projection helper.
var ingested = AuditRowProjection.WithIngestedAtUtc(evt, nowUtc);
await repository.InsertIfNotExistsAsync(ingested).ConfigureAwait(false);
_failedInsertAttempts.Remove(evt.EventId);
advanceForThisRow = true;
@@ -299,9 +301,11 @@ public class SiteAuditReconciliationActor : ReceiveActor
}
}
if (advanceForThisRow && evt.OccurredAtUtc > maxOccurred)
// C3: canonical OccurredAtUtc is a DateTimeOffset; the cursor is a UTC DateTime.
var occurredUtc = evt.OccurredAtUtc.UtcDateTime;
if (advanceForThisRow && occurredUtc > maxOccurred)
{
maxOccurred = evt.OccurredAtUtc;
maxOccurred = occurredUtc;
}
}
@@ -133,6 +133,7 @@ public sealed class SiteAuditTelemetryStalledTracker : IDisposable
/// Returns a defensive copy of the per-site latched stalled state.
/// Absent sites are interpreted as <c>Stalled=false</c> by consumers.
/// </summary>
/// <returns>A snapshot dictionary mapping each known site ID to its current stalled state.</returns>
public IReadOnlyDictionary<string, bool> Snapshot()
{
    // Copy into a fresh dictionary so callers never observe later mutations
    // of the tracker's own state.
    var copy = new Dictionary<string, bool>(_state);
    return copy;
}
@@ -1,4 +1,4 @@
using Microsoft.Extensions.Options;
using ZB.MOM.WW.Configuration;
namespace ZB.MOM.WW.ScadaBridge.AuditLog.Configuration;
@@ -13,7 +13,7 @@ namespace ZB.MOM.WW.ScadaBridge.AuditLog.Configuration;
/// drop in-flight investigations, too long would defeat the partition-switch
/// purge's purpose.
/// </summary>
public sealed class AuditLogOptionsValidator : IValidateOptions<AuditLogOptions>
public sealed class AuditLogOptionsValidator : OptionsValidatorBase<AuditLogOptions>
{
/// <summary>Inclusive lower bound for <see cref="AuditLogOptions.RetentionDays"/>.</summary>
public const int MinRetentionDays = 30;
@@ -28,43 +28,29 @@ public sealed class AuditLogOptionsValidator : IValidateOptions<AuditLogOptions>
public const int MaxInboundMaxBytes = 16_777_216;
/// <inheritdoc />
public ValidateOptionsResult Validate(string? name, AuditLogOptions options)
protected override void Validate(ValidationBuilder builder, AuditLogOptions options)
{
ArgumentNullException.ThrowIfNull(options);
builder.RequireThat(options.DefaultCapBytes > 0,
$"AuditLog:{nameof(AuditLogOptions.DefaultCapBytes)} ({options.DefaultCapBytes}) " +
"must be > 0; it drives payload-summary truncation in audit writers.");
var failures = new List<string>();
builder.RequireThat(options.ErrorCapBytes >= options.DefaultCapBytes,
$"AuditLog:{nameof(AuditLogOptions.ErrorCapBytes)} ({options.ErrorCapBytes}) " +
$"must be >= {nameof(AuditLogOptions.DefaultCapBytes)} ({options.DefaultCapBytes}); " +
"the error-row cap is intended to capture more detail than the happy-path summary.");
if (options.DefaultCapBytes <= 0)
{
failures.Add(
$"AuditLog:{nameof(AuditLogOptions.DefaultCapBytes)} ({options.DefaultCapBytes}) " +
"must be > 0; it drives payload-summary truncation in audit writers.");
}
// Valid when RetentionDays is within [Min, Max] inclusive. The De Morgan'd
// guard !(below Min OR above Max) is equivalent to (>= Min AND <= Max).
builder.RequireThat(
!(options.RetentionDays < MinRetentionDays || options.RetentionDays > MaxRetentionDays),
$"AuditLog:{nameof(AuditLogOptions.RetentionDays)} ({options.RetentionDays}) " +
$"must be in [{MinRetentionDays}, {MaxRetentionDays}] days.");
if (options.ErrorCapBytes < options.DefaultCapBytes)
{
failures.Add(
$"AuditLog:{nameof(AuditLogOptions.ErrorCapBytes)} ({options.ErrorCapBytes}) " +
$"must be >= {nameof(AuditLogOptions.DefaultCapBytes)} ({options.DefaultCapBytes}); " +
"the error-row cap is intended to capture more detail than the happy-path summary.");
}
if (options.RetentionDays < MinRetentionDays || options.RetentionDays > MaxRetentionDays)
{
failures.Add(
$"AuditLog:{nameof(AuditLogOptions.RetentionDays)} ({options.RetentionDays}) " +
$"must be in [{MinRetentionDays}, {MaxRetentionDays}] days.");
}
if (options.InboundMaxBytes < MinInboundMaxBytes || options.InboundMaxBytes > MaxInboundMaxBytes)
{
failures.Add(
$"AuditLog:{nameof(AuditLogOptions.InboundMaxBytes)} ({options.InboundMaxBytes}) " +
$"must be in [{MinInboundMaxBytes}, {MaxInboundMaxBytes}] bytes.");
}
return failures.Count == 0
? ValidateOptionsResult.Success
: ValidateOptionsResult.Fail(failures);
// Valid when InboundMaxBytes is within [Min, Max] inclusive. The De Morgan'd
// guard !(below Min OR above Max) is equivalent to (>= Min AND <= Max).
builder.RequireThat(
!(options.InboundMaxBytes < MinInboundMaxBytes || options.InboundMaxBytes > MaxInboundMaxBytes),
$"AuditLog:{nameof(AuditLogOptions.InboundMaxBytes)} ({options.InboundMaxBytes}) " +
$"must be in [{MinInboundMaxBytes}, {MaxInboundMaxBytes}] bytes.");
}
}
@@ -0,0 +1,342 @@
using System.Text;
using System.Text.Encodings.Web;
using System.Text.Json;
using System.Text.Json.Nodes;
using System.Text.RegularExpressions;
using Microsoft.Extensions.Logging;
namespace ZB.MOM.WW.ScadaBridge.AuditLog.Payload;
/// <summary>
/// Pure, stateless redaction + truncation primitives used by
/// <see cref="ZB.MOM.WW.ScadaBridge.AuditLog.Redaction.ScadaBridgeAuditRedactor"/>
/// (which operates on <c>ZB.MOM.WW.Audit.AuditEvent</c> + its <c>DetailsJson</c>).
/// Extracted in ScadaBridge audit re-architecture stage C2 (Task 2.5) so the
/// byte-exact redaction logic lives in ONE place.
/// </summary>
/// <remarks>
/// <para>
/// Each stage method is a pure function of its inputs (no instance state). The
/// only side effects are diagnostics-only: a warning log line and an
/// <paramref name="onFailure"/> callback invocation when a redactor faults, so
/// the caller can bump its redaction-failure health counter. The callbacks are
/// passed in (rather than the counter interface) to keep this helper free of
/// any DI / health-metric coupling.
/// </para>
/// <para>
/// The regex CACHE and per-call options resolution live in
/// <see cref="ZB.MOM.WW.ScadaBridge.AuditLog.Payload.AuditRegexCache"/> /
/// <see cref="ZB.MOM.WW.ScadaBridge.AuditLog.Redaction.ScadaBridgeAuditRedactor"/>
/// — they carry per-instance state (lazy compile, 100 ms compile budget,
/// sentinel entries). This helper only holds the stateless stages that
/// operate once the compiled regex set / redact list / cap has already been
/// resolved.
/// </para>
/// </remarks>
internal static class AuditRedactionPrimitives
{
/// <summary>Marker replacing redacted header values, body matches, and SQL parameter values.</summary>
public const string RedactedMarker = "<redacted>";
/// <summary>Over-redaction marker emitted when a redactor stage itself faults.</summary>
public const string RedactorErrorMarker = "<redacted: redactor error>";
/// <summary>
/// Marker used by the outer never-throws safety net when the entire redaction
/// pipeline fails catastrophically — all potentially-sensitive string fields are
/// set to this value so no raw payload leaks on an unexpected fault.
/// Deliberately equal to <see cref="RedactorErrorMarker"/>: both represent a
/// defensive scrub-everything fallback.
/// </summary>
public const string OverRedactedEventMarker = RedactorErrorMarker;
/// <summary>
/// JSON serializer options used to re-emit redacted summaries. The
/// UnsafeRelaxedJsonEscaping encoder is required so the redaction marker
/// (which contains <c>&lt;</c> / <c>&gt;</c>) survives unescaped — matching
/// the legacy filter's output byte-for-byte.
/// </summary>
public static readonly JsonSerializerOptions RedactedSummaryJsonOptions = new()
{
Encoder = JavaScriptEncoder.UnsafeRelaxedJsonEscaping,
};
/// <summary>
/// Parse <paramref name="json"/> as the documented
/// <c>{"headers": {...}, "body": ...}</c> shape and replace values whose
/// header NAME (case-insensitive) is in <paramref name="redactList"/> with
/// <see cref="RedactedMarker"/>. Re-serialises and returns the result.
/// No-op pass-through for inputs that are not JSON-object-shaped or do not
/// carry a top-level <c>headers</c> object. On any unexpected fault the
/// field is over-redacted with <see cref="RedactorErrorMarker"/> and
/// <paramref name="onFailure"/> is invoked.
/// </summary>
/// <param name="json">The raw JSON string to redact; null passes through as null.</param>
/// <param name="redactList">Header names (case-insensitive) whose values should be replaced.</param>
/// <param name="logger">Logger for warning diagnostics on redactor faults.</param>
/// <param name="onFailure">Callback invoked when the redactor stage faults; used to increment health counters.</param>
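/// <returns>The re-serialized JSON with listed header values replaced by <see cref="RedactedMarker"/> (re-serialized even when no header name matched); the input unchanged when it is null, does not start with <c>{</c>, is not parseable JSON, or has no top-level <c>headers</c> object; or <see cref="RedactorErrorMarker"/> on fault.</returns>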
public static string? RedactHeaders(
    string? json,
    IList<string> redactList,
    ILogger logger,
    Action onFailure)
{
    // Null in, null out: there is nothing to inspect.
    if (json is null)
    {
        return null;
    }

    // Skip the parser entirely unless the first non-whitespace character
    // opens a JSON object; anything else is returned untouched.
    var leading = json.AsSpan().TrimStart();
    if (leading.IsEmpty || leading[0] != '{')
    {
        return json;
    }

    try
    {
        JsonNode? parsed;
        try
        {
            parsed = JsonNode.Parse(json);
        }
        catch (JsonException)
        {
            // Unparseable input is passed through as-is; this is not
            // treated as a redactor fault.
            return json;
        }

        if (parsed is JsonObject document && document["headers"] is JsonObject headerBag)
        {
            // Case-insensitive set so each header name costs one lookup.
            var sensitiveNames = new HashSet<string>(redactList, StringComparer.OrdinalIgnoreCase);

            // Collect the names up front: the object cannot be mutated
            // while it is being enumerated.
            var headerNames = new List<string>(headerBag.Count);
            foreach (var header in headerBag)
            {
                headerNames.Add(header.Key);
            }

            foreach (var headerName in headerNames)
            {
                if (sensitiveNames.Contains(headerName))
                {
                    headerBag[headerName] = JsonValue.Create(RedactedMarker);
                }
            }

            // Re-emit with the relaxed encoder so the marker's angle
            // brackets are written literally.
            return document.ToJsonString(RedactedSummaryJsonOptions);
        }

        // Root is not an object, or it has no top-level "headers" object.
        return json;
    }
    catch (Exception fault)
    {
        logger.LogWarning(
            fault,
            "Header redactor faulted; over-redacting field with '{Marker}'",
            RedactorErrorMarker);
        try
        {
            onFailure();
        }
        catch
        {
            /* swallow per §7 */
        }

        return RedactorErrorMarker;
    }
}
/// <summary>
/// Runs <paramref name="value"/> through every body-redactor regex, in list
/// order, substituting <see cref="RedactedMarker"/> for each match. Each regex
/// sees the output of the previous one. If a replacement throws (typically a
/// <see cref="RegexMatchTimeoutException"/>), the whole field collapses to
/// <see cref="RedactorErrorMarker"/>, <paramref name="onFailure"/> is invoked,
/// and the remaining regexes are not applied.
/// </summary>
/// <param name="value">Text to redact; <c>null</c> is returned unchanged.</param>
/// <param name="regexes">Body-redaction regexes, applied first to last.</param>
/// <param name="logger">Receives a warning when a regex faults.</param>
/// <param name="onFailure">Invoked once when a regex faults; anything it throws is swallowed.</param>
/// <returns>The redacted text, <see cref="RedactorErrorMarker"/> after a fault, or <c>null</c> for null input.</returns>
public static string? RedactBody(
    string? value,
    IReadOnlyList<Regex> regexes,
    ILogger logger,
    Action onFailure)
{
    if (value is null)
    {
        return null;
    }

    var redacted = value;
    for (var index = 0; index < regexes.Count; index++)
    {
        var redactor = regexes[index];
        try
        {
            redacted = redactor.Replace(redacted, RedactedMarker);
        }
        catch (Exception fault)
        {
            // One faulting pattern poisons the whole field: hiding too much
            // is preferred over persisting a partially redacted value.
            logger.LogWarning(
                fault,
                "Body redactor '{Pattern}' faulted; over-redacting field with '{Marker}'",
                redactor.ToString(), RedactorErrorMarker);
            try { onFailure(); } catch { /* swallow per §7 */ }
            return RedactorErrorMarker;
        }
    }

    return redacted;
}
/// <summary>
/// Redacts SQL parameter values inside a
/// <c>{"sql":"...","parameters":{...}}</c> summary. Every entry of the
/// top-level <c>"parameters"</c> object whose NAME matches
/// <paramref name="paramNameRegex"/> has its value replaced with
/// <see cref="RedactedMarker"/>, and the document is re-serialised. The input
/// is returned untouched when it does not start with <c>{</c>, is not
/// parseable JSON, has no top-level <c>"parameters"</c> object, or no
/// parameter name matched. Any other fault over-redacts the field with
/// <see cref="RedactorErrorMarker"/> and invokes <paramref name="onFailure"/>.
/// </summary>
/// <param name="json">Raw summary text; <c>null</c> is returned unchanged.</param>
/// <param name="paramNameRegex">Regex tested against each parameter name.</param>
/// <param name="logger">Receives a warning when the stage faults.</param>
/// <param name="onFailure">Invoked when the stage faults; anything it throws is swallowed.</param>
/// <returns>The re-serialised JSON when something was redacted, the original string when nothing was, or <see cref="RedactorErrorMarker"/> on fault.</returns>
public static string? RedactSqlParameters(
    string? json,
    Regex paramNameRegex,
    ILogger logger,
    Action onFailure)
{
    if (json is null)
    {
        return null;
    }

    // Only text whose first non-blank character opens an object is parsed.
    var head = json.AsSpan().TrimStart();
    if (head.IsEmpty || head[0] != '{')
    {
        return json;
    }

    try
    {
        if (!TryParse(json, out var root)
            || root is not JsonObject document
            || document["parameters"] is not JsonObject parameters)
        {
            return json;
        }

        // Collect matching names first; the object cannot be mutated while
        // it is being enumerated. A regex fault here (e.g. a match timeout)
        // lands in the catch below, which over-redacts the field.
        var hits = new List<string>();
        foreach (var parameter in parameters)
        {
            if (paramNameRegex.IsMatch(parameter.Key))
            {
                hits.Add(parameter.Key);
            }
        }

        foreach (var name in hits)
        {
            parameters[name] = JsonValue.Create(RedactedMarker);
        }

        // Re-serialising normalises whitespace, so it is skipped when nothing
        // matched and the caller gets back the exact input string.
        return hits.Count > 0 ? document.ToJsonString(RedactedSummaryJsonOptions) : json;
    }
    catch (Exception fault)
    {
        logger.LogWarning(
            fault,
            "SQL parameter redactor faulted; over-redacting field with '{Marker}'",
            RedactorErrorMarker);
        try { onFailure(); } catch { /* swallow per §7 */ }
        return RedactorErrorMarker;
    }

    // Parses the text, reporting malformed JSON as false instead of throwing.
    static bool TryParse(string text, out JsonNode? node)
    {
        try
        {
            node = JsonNode.Parse(text);
            return true;
        }
        catch (JsonException)
        {
            node = null;
            return false;
        }
    }
}
/// <summary>
/// Caps <paramref name="value"/> at <paramref name="cap"/> UTF-8 bytes via
/// <see cref="TruncateUtf8"/> and records whether anything was cut off.
/// </summary>
/// <param name="value">Text to cap; <c>null</c> is returned unchanged.</param>
/// <param name="cap">Maximum number of UTF-8 bytes to keep.</param>
/// <param name="truncated">Raised to <c>true</c> when the value was shortened; never reset to <c>false</c>.</param>
/// <returns>The capped text, the original text when it already fits, or <c>null</c> for null input.</returns>
public static string? TruncateField(string? value, int cap, ref bool truncated)
{
    if (value is null)
    {
        return null;
    }

    var capped = TruncateUtf8(value, cap);

    // TruncateUtf8 either returns the input or a shorter prefix, so a
    // differing char count is enough to detect that content was dropped.
    truncated |= capped.Length != value.Length;
    return capped;
}
/// <summary>
/// UTF-8 byte-safe truncation. When the encoded value exceeds
/// <paramref name="capBytes"/>, the cut position is walked back from the cap
/// until the byte at that position is NOT a continuation byte
/// (<c>byte &amp; 0xC0 == 0x80</c>), and only the prefix before it is decoded —
/// so the returned string never splits a multi-byte sequence.
/// </summary>
/// <param name="value">The string to truncate; null or empty input is returned as-is.</param>
/// <param name="capBytes">
/// Maximum number of UTF-8 bytes in the returned string. Zero or negative
/// values keep nothing and yield an empty string.
/// </param>
/// <returns>The original string when it fits within the cap, otherwise the longest prefix of whole UTF-8 sequences that fits.</returns>
public static string TruncateUtf8(string value, int capBytes)
{
    if (string.IsNullOrEmpty(value))
    {
        return value;
    }

    // A negative cap used to reach Encoding.GetString with a negative count
    // and throw; treat every non-positive cap as "keep nothing" instead.
    if (capBytes <= 0)
    {
        return string.Empty;
    }

    // Measure before encoding: values that already fit are returned without
    // allocating a byte array for them.
    if (Encoding.UTF8.GetByteCount(value) <= capBytes)
    {
        return value;
    }

    var bytes = Encoding.UTF8.GetBytes(value);

    // bytes.Length > capBytes here, so bytes[capBytes] is a valid index. If it
    // is a continuation byte the cap falls inside a sequence; back up to the
    // lead byte and cut in front of it.
    var boundary = capBytes;
    while (boundary > 0 && (bytes[boundary] & 0xC0) == 0x80)
    {
        boundary--;
    }

    return Encoding.UTF8.GetString(bytes, 0, boundary);
}
}
@@ -0,0 +1,103 @@
using System.Collections.Concurrent;
using System.Text.RegularExpressions;
using Microsoft.Extensions.Logging;
namespace ZB.MOM.WW.ScadaBridge.AuditLog.Payload;
/// <summary>
/// Per-instance compiled-regex cache for audit body / SQL-parameter redactors
/// used by <see cref="ZB.MOM.WW.ScadaBridge.AuditLog.Redaction.ScadaBridgeAuditRedactor"/>.
/// Centralizes the compile rules: 50 ms per-match timeout, 100 ms compile
/// budget, and an invalid-pattern sentinel.
/// </summary>
/// <remarks>
/// <para>
/// Entries are keyed by pattern string, compiled on first use and never
/// evicted. A pattern that fails to compile, or compiles slower than 100 ms,
/// is cached as a sentinel so the failing compile is not repeated for every
/// event. Compile failures are logged when the entry is created; because
/// <see cref="ConcurrentDictionary{TKey,TValue}"/> may run the value factory
/// more than once under contention, a racing first use can log the same
/// failure more than once.
/// </para>
/// </remarks>
internal sealed class AuditRegexCache
{
    /// <summary>
    /// Per-match regex timeout. A match that runs longer than this throws
    /// <see cref="RegexMatchTimeoutException"/>, which callers are expected to
    /// handle.
    /// </summary>
    private static readonly TimeSpan RegexMatchTimeout = TimeSpan.FromMilliseconds(50);

    private readonly ConcurrentDictionary<string, CompiledRegex> _cache = new();
    private readonly ILogger _logger;

    /// <summary>Set to 1 once a null pattern has been reported, so the warning is emitted a single time.</summary>
    private int _nullPatternReported;

    /// <summary>Initializes the cache with the logger used to report compile failures.</summary>
    /// <param name="logger">Logger for recording invalid or slow-compile pattern warnings.</param>
    public AuditRegexCache(ILogger logger) => _logger = logger;

    /// <summary>
    /// Resolves a compiled regex from the cache, compiling it on first use.
    /// </summary>
    /// <param name="pattern">
    /// The regex pattern to look up or compile. A <c>null</c> pattern is treated
    /// as invalid rather than allowed to throw from the dictionary lookup.
    /// </param>
    /// <param name="regex">The compiled <see cref="Regex"/> when the method returns <c>true</c>; otherwise <c>null</c>.</param>
    /// <returns><c>true</c> if the pattern compiled within the budget; <c>false</c> if it is null, invalid, or too slow to compile.</returns>
    public bool TryGet(
        string pattern,
        [System.Diagnostics.CodeAnalysis.NotNullWhen(true)] out Regex? regex)
    {
        if (pattern is null)
        {
            // ConcurrentDictionary rejects null keys with ArgumentNullException.
            // Skip the entry instead, and warn only once to avoid flooding the log.
            if (System.Threading.Interlocked.Exchange(ref _nullPatternReported, 1) == 0)
            {
                _logger.LogWarning("Body redactor pattern list contains a null entry; skipping");
            }

            regex = null;
            return false;
        }

        regex = _cache.GetOrAdd(pattern, Compile).Regex;
        return regex is not null;
    }

    /// <summary>
    /// Compiles <paramref name="pattern"/> with the per-match timeout and
    /// returns the <see cref="CompiledRegex.Invalid"/> sentinel when
    /// construction throws or takes longer than 100 ms.
    /// </summary>
    private CompiledRegex Compile(string pattern)
    {
        try
        {
            var swStart = System.Diagnostics.Stopwatch.GetTimestamp();
            var rx = new Regex(pattern, RegexOptions.Compiled, RegexMatchTimeout);
            var elapsedMs = (System.Diagnostics.Stopwatch.GetTimestamp() - swStart)
                * 1000d / System.Diagnostics.Stopwatch.Frequency;
            if (elapsedMs > 100)
            {
                _logger.LogWarning(
                    "Body redactor pattern compiled in {Elapsed}ms (> 100ms cap); rejecting '{Pattern}'",
                    elapsedMs, pattern);
                return CompiledRegex.Invalid;
            }

            return new CompiledRegex(rx);
        }
        catch (Exception ex)
        {
            _logger.LogWarning(
                ex,
                "Body redactor pattern '{Pattern}' failed to compile; skipping",
                pattern);
            return CompiledRegex.Invalid;
        }
    }

    /// <summary>
    /// Cache entry for a pattern: the working <see cref="Regex"/> on success,
    /// or the <see cref="Invalid"/> sentinel for a pattern that failed to
    /// compile or exceeded the compile budget.
    /// </summary>
    private readonly struct CompiledRegex
    {
        public static readonly CompiledRegex Invalid = new(null);

        /// <summary>The compiled regex, or <c>null</c> when this entry represents an invalid pattern.</summary>
        public Regex? Regex { get; }

        /// <summary>Initializes the entry with the compiled regex (or <c>null</c> for the invalid sentinel).</summary>
        /// <param name="regex">The compiled <see cref="Regex"/>, or <c>null</c> for a failed compile.</param>
        public CompiledRegex(Regex? regex) => Regex = regex;
    }
}
@@ -1,587 +0,0 @@
using System.Collections.Concurrent;
using System.Text;
using System.Text.Encodings.Web;
using System.Text.Json;
using System.Text.Json.Nodes;
using System.Text.RegularExpressions;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
using ZB.MOM.WW.ScadaBridge.AuditLog.Configuration;
using ZB.MOM.WW.ScadaBridge.Commons.Entities.Audit;
using ZB.MOM.WW.ScadaBridge.Commons.Types.Enums;
namespace ZB.MOM.WW.ScadaBridge.AuditLog.Payload;
/// <summary>
/// Default <see cref="IAuditPayloadFilter"/>. Bundle A established the
/// truncation backbone; Bundle B chains HTTP header redaction (M5-T3) BEFORE
/// truncation so redactors operate on the full payload and the cap then trims
/// the redacted result.
/// </summary>
/// <remarks>
/// <para>
/// Uses <see cref="IOptionsMonitor{TOptions}"/> (not <see cref="IOptions{TOptions}"/>)
/// so the M5-T8 hot-reload path sees fresh values without re-resolving the
/// singleton. <see cref="Apply"/> reads <see cref="IOptionsMonitor{T}.CurrentValue"/>
/// on every call, and the regex cache is keyed by pattern string — patterns
/// added via a live config change compile on first use of the next event;
/// patterns removed simply stop being looked up. No <c>OnChange</c> subscription
/// or explicit cache invalidation is required (the
/// <c>AuditLogOptionsBindingTests</c> fixture in <c>ZB.MOM.WW.ScadaBridge.AuditLog.Tests</c>
/// pins this behaviour).
/// </para>
/// <para>
/// "Error row" = <see cref="AuditEvent.Status"/> NOT IN (<c>Delivered</c>,
/// <c>Submitted</c>, <c>Forwarded</c>) — every other status, including the
/// non-terminal <c>Attempted</c>, the parked/discarded terminals, and the
/// short-circuit <c>Skipped</c>, receives the larger error cap so a verbose
/// error body survives.
/// </para>
/// <para>
/// Apply MUST NOT throw — on internal failure the filter over-redacts by
/// returning the input with <see cref="AuditEvent.PayloadTruncated"/> set and
/// increments the <c>AuditRedactionFailure</c> health metric via the injected
/// <see cref="IAuditRedactionFailureCounter"/>. Each redactor stage runs in
/// its own try/catch — a failure in (say) the header redactor still lets the
/// SQL parameter redactor and the truncator run on the remaining fields.
/// </para>
/// <para>
/// Stage order (each runs on every applicable field):
/// header redaction → body regex redaction → truncation. The SQL-parameter
/// stage piggybacks on the body-redactor path; both run BEFORE truncation so
/// the cap trims the redacted result, never bytes the redactor intended to
/// hide.
/// </para>
/// </remarks>
public sealed class DefaultAuditPayloadFilter : IAuditPayloadFilter
{
private const string RedactedMarker = "<redacted>";
private const string RedactorErrorMarker = "<redacted: redactor error>";
/// <summary>
/// Per-match regex timeout. Catastrophic-backtracking patterns trip a
/// <see cref="RegexMatchTimeoutException"/> when a single match takes
/// longer than this; the offending field is then over-redacted with
/// <see cref="RedactorErrorMarker"/> and the failure counter is bumped.
/// 50 ms is generous for normal patterns yet short enough that the
/// audit hot-path isn't held up by a misconfigured regex.
/// </summary>
private static readonly TimeSpan RegexMatchTimeout = TimeSpan.FromMilliseconds(50);
/// <summary>
/// JSON serializer options used to re-emit redacted summaries. The
/// UnsafeRelaxedJsonEscaping encoder is required so the redaction marker
/// (which contains <c>&lt;</c> / <c>&gt;</c>) survives unescaped — the
/// header-redaction tests grep for the literal marker, and the downstream
/// UI / log readers would rather see <c>&lt;redacted&gt;</c> than
/// <c>\u003Credacted\u003E</c>. The summaries are persisted to the audit
/// table and rendered in trusted-internal contexts only, so the relaxed
/// HTML-escaping rules do not introduce an XSS surface.
/// </summary>
private static readonly JsonSerializerOptions RedactedSummaryJsonOptions = new()
{
Encoder = JavaScriptEncoder.UnsafeRelaxedJsonEscaping,
};
private readonly IOptionsMonitor<AuditLogOptions> _options;
private readonly ILogger<DefaultAuditPayloadFilter> _logger;
private readonly IAuditRedactionFailureCounter _failureCounter;
/// <summary>
/// Compiled-regex cache keyed by pattern string. Lazy population: each
/// pattern is compiled on first use and cached forever (the entry's
/// <see cref="CompiledRegex"/> carries either the working <see cref="Regex"/>
/// or a sentinel marking the pattern as invalid so we don't retry the
/// failing compile on every call). ConcurrentDictionary is the right
/// thread-safety primitive here because the filter is a DI singleton
/// shared across the audit hot-path.
/// </summary>
private readonly ConcurrentDictionary<string, CompiledRegex> _regexCache = new();
/// <summary>
/// Constructor resolved by DI. The redaction-failure counter is optional:
/// when the container supplies none, a no-op counter is substituted (a NoOp
/// default is registered in <see cref="ServiceCollectionExtensions.AddAuditLog"/>).
/// </summary>
/// <param name="options">Live-reloadable audit log options; must not be null.</param>
/// <param name="logger">Logger for redaction diagnostics; must not be null.</param>
/// <param name="failureCounter">Counter incremented when a redaction operation fails; <c>null</c> selects a no-op counter.</param>
/// <exception cref="ArgumentNullException"><paramref name="options"/> or <paramref name="logger"/> is null.</exception>
public DefaultAuditPayloadFilter(
    IOptionsMonitor<AuditLogOptions> options,
    ILogger<DefaultAuditPayloadFilter> logger,
    IAuditRedactionFailureCounter? failureCounter = null)
{
    ArgumentNullException.ThrowIfNull(options);
    ArgumentNullException.ThrowIfNull(logger);

    _options = options;
    _logger = logger;
    _failureCounter = failureCounter ?? new NoOpAuditRedactionFailureCounter();
}
/// <inheritdoc />
/// <remarks>
/// Stages run in this order: header redaction, body-regex redaction,
/// SQL-parameter redaction (DbOutbound rows only), then truncation. If the
/// orchestration itself faults, the method fails closed: every non-null
/// payload field is replaced with <see cref="RedactorErrorMarker"/> and
/// <see cref="AuditEvent.PayloadTruncated"/> is set, so a fault can never
/// cause an unredacted payload to be returned.
/// </remarks>
public AuditEvent Apply(AuditEvent rawEvent)
{
    try
    {
        var opts = _options.CurrentValue;

        // Inbound API gets a dedicated, larger ceiling — request/response bodies are
        // captured verbatim up to InboundMaxBytes so support can replay exactly what
        // the caller sent and what we returned. Other channels use the error cap for
        // error-status rows and the default cap otherwise.
        // See docs/plans/2026-05-23-inbound-api-full-response-audit-design.md.
        var cap = rawEvent.Channel == AuditChannel.ApiInbound
            ? opts.InboundMaxBytes
            : (IsErrorStatus(rawEvent.Status) ? opts.ErrorCapBytes : opts.DefaultCapBytes);

        // --- Header-redaction stage (runs BEFORE truncation) ----------
        var request = RedactHeaders(rawEvent.RequestSummary, opts.HeaderRedactList);
        var response = RedactHeaders(rawEvent.ResponseSummary, opts.HeaderRedactList);
        var errorDetail = rawEvent.ErrorDetail;
        var extra = rawEvent.Extra;

        // --- Body-regex stage (also runs BEFORE truncation) -----------
        // The regex set is resolved per event so per-target overrides keyed by
        // AuditEvent.Target are picked up; with nothing configured the set is
        // empty and the stage is skipped.
        var bodyRegexes = ResolveBodyRegexes(opts, rawEvent.Target);
        if (bodyRegexes.Count > 0)
        {
            request = RedactBody(request, bodyRegexes);
            response = RedactBody(response, bodyRegexes);
            errorDetail = RedactBody(errorDetail, bodyRegexes);
            extra = RedactBody(extra, bodyRegexes);
        }

        // --- SQL parameter redaction stage (DbOutbound only) ----------
        // Redacts parameter VALUES in the {"sql":"...","parameters":{...}}
        // RequestSummary shape whose NAME matches the per-connection regex.
        // Opt-in: without a RedactSqlParamsMatching override nothing runs.
        // The channel guard keeps the regex away from rows of other channels.
        if (rawEvent.Channel == AuditChannel.DbOutbound
            && TryGetSqlParamRedactor(opts, rawEvent.Target, out var sqlParamRegex))
        {
            request = RedactSqlParameters(request, sqlParamRegex!);
        }

        // --- Truncation stage -----------------------------------------
        var truncated = false;
        request = TruncateField(request, cap, ref truncated);
        response = TruncateField(response, cap, ref truncated);
        errorDetail = TruncateField(errorDetail, cap, ref truncated);
        extra = TruncateField(extra, cap, ref truncated);

        return rawEvent with
        {
            RequestSummary = request,
            ResponseSummary = response,
            ErrorDetail = errorDetail,
            Extra = extra,
            PayloadTruncated = rawEvent.PayloadTruncated || truncated,
        };
    }
    catch (Exception ex)
    {
        // Audit is best-effort, but it must fail CLOSED. Returning the raw
        // event here would throw away any redaction already performed and
        // persist the original payload, so every populated payload field is
        // replaced with the error marker instead. Null fields stay null.
        _logger.LogWarning(
            ex,
            "Payload filter failed; over-redacting payload fields with '{Marker}' and setting PayloadTruncated=true",
            RedactorErrorMarker);
        try { _failureCounter.Increment(); } catch { /* swallow per §7 */ }
        return rawEvent with
        {
            RequestSummary = OverRedact(rawEvent.RequestSummary),
            ResponseSummary = OverRedact(rawEvent.ResponseSummary),
            ErrorDetail = OverRedact(rawEvent.ErrorDetail),
            Extra = OverRedact(rawEvent.Extra),
            PayloadTruncated = true,
        };
    }

    // Replaces a populated field with the error marker; leaves null as null.
    static string? OverRedact(string? field) => field is null ? null : RedactorErrorMarker;
}
/// <summary>
/// Redacts sensitive HTTP headers inside a summary shaped like
/// <c>{"headers": {...}, "body": ...}</c>. Each entry of the top-level
/// <c>"headers"</c> object whose NAME appears in
/// <paramref name="redactList"/> (case-insensitive) has its value replaced
/// with <see cref="RedactedMarker"/>; the document is then re-serialised.
/// </summary>
/// <remarks>
/// The input is returned untouched when it does not start with <c>{</c>, is
/// not parseable JSON, or has no top-level <c>"headers"</c> object. Any other
/// fault over-redacts the whole field with <see cref="RedactorErrorMarker"/>
/// and increments the failure counter.
/// </remarks>
private string? RedactHeaders(string? json, IList<string> redactList)
{
    if (json is null)
    {
        return null;
    }

    // Cheap shape check so plain-text fields never reach the JSON parser.
    var head = json.AsSpan().TrimStart();
    if (head.IsEmpty || head[0] != '{')
    {
        return json;
    }

    try
    {
        if (!TryParse(json, out var root)
            || root is not JsonObject document
            || document["headers"] is not JsonObject headers)
        {
            return json;
        }

        // Case-insensitive set gives one O(1) lookup per header name.
        var sensitive = new HashSet<string>(redactList, StringComparer.OrdinalIgnoreCase);

        // Collect first, mutate afterwards: the object cannot be changed
        // while it is being enumerated.
        var hits = new List<string>();
        foreach (var header in headers)
        {
            if (sensitive.Contains(header.Key))
            {
                hits.Add(header.Key);
            }
        }

        foreach (var name in hits)
        {
            headers[name] = JsonValue.Create(RedactedMarker);
        }

        return document.ToJsonString(RedactedSummaryJsonOptions);
    }
    catch (Exception fault)
    {
        _logger.LogWarning(
            fault,
            "Header redactor faulted; over-redacting field with '{Marker}'",
            RedactorErrorMarker);
        try { _failureCounter.Increment(); } catch { /* swallow per §7 */ }
        return RedactorErrorMarker;
    }

    // Parses the text, reporting malformed JSON as false instead of throwing.
    static bool TryParse(string text, out JsonNode? node)
    {
        try
        {
            node = JsonNode.Parse(text);
            return true;
        }
        catch (JsonException)
        {
            node = null;
            return false;
        }
    }
}
/// <summary>
/// Builds the compiled-regex set for one event: the global body redactors
/// first, then any additional redactors configured for
/// <paramref name="target"/>. Patterns that do not resolve to a compiled
/// regex are left out, so one bad pattern never blocks the others.
/// </summary>
private IReadOnlyList<Regex> ResolveBodyRegexes(AuditLogOptions opts, string? target)
{
    var hasGlobal = opts.GlobalBodyRedactors is { Count: > 0 };

    IEnumerable<string>? targetPatterns = null;
    if (target != null
        && opts.PerTargetOverrides.TryGetValue(target, out var over)
        && over.AdditionalBodyRedactors is { Count: > 0 } additions)
    {
        targetPatterns = additions;
    }

    if (!hasGlobal && targetPatterns is null)
    {
        return Array.Empty<Regex>();
    }

    var resolved = new List<Regex>();
    if (hasGlobal)
    {
        AppendCompiled(opts.GlobalBodyRedactors);
    }

    if (targetPatterns is not null)
    {
        AppendCompiled(targetPatterns);
    }

    return resolved;

    // Adds the compiled form of every pattern that resolves successfully.
    void AppendCompiled(IEnumerable<string> patterns)
    {
        foreach (var pattern in patterns)
        {
            if (TryGetCompiledRegex(pattern, out var compiled))
            {
                resolved.Add(compiled!);
            }
        }
    }
}
/// <summary>
/// Looks up <paramref name="pattern"/> in the regex cache, compiling it via
/// <see cref="CompileRegex"/> on first use. Reports <c>false</c> when the
/// cached entry carries no regex, which is how patterns that failed to
/// compile or exceeded the compile budget are represented.
/// </summary>
private bool TryGetCompiledRegex(string pattern, out Regex? regex)
{
    regex = _regexCache.GetOrAdd(pattern, CompileRegex).Regex;
    return regex is not null;
}
/// <summary>
/// Compiles <paramref name="pattern"/> with the per-match timeout. Returns
/// the <see cref="CompiledRegex.Invalid"/> sentinel, after logging a warning,
/// when construction throws or takes longer than 100 ms.
/// </summary>
private CompiledRegex CompileRegex(string pattern)
{
    try
    {
        var startedAt = System.Diagnostics.Stopwatch.GetTimestamp();
        var compiled = new Regex(pattern, RegexOptions.Compiled, RegexMatchTimeout);
        var finishedAt = System.Diagnostics.Stopwatch.GetTimestamp();

        // Convert stopwatch ticks to milliseconds.
        var elapsedMs = (finishedAt - startedAt)
            * 1000d / System.Diagnostics.Stopwatch.Frequency;

        if (elapsedMs <= 100)
        {
            return new CompiledRegex(compiled);
        }

        _logger.LogWarning(
            "Body redactor pattern compiled in {Elapsed}ms (> 100ms cap); rejecting '{Pattern}'",
            elapsedMs, pattern);
        return CompiledRegex.Invalid;
    }
    catch (Exception fault)
    {
        _logger.LogWarning(
            fault,
            "Body redactor pattern '{Pattern}' failed to compile; skipping",
            pattern);
        return CompiledRegex.Invalid;
    }
}
/// <summary>
/// Runs <paramref name="value"/> through every body-redactor regex, in list
/// order, substituting <see cref="RedactedMarker"/> for each match. If a
/// replacement throws (typically a <see cref="RegexMatchTimeoutException"/>),
/// the whole field collapses to <see cref="RedactorErrorMarker"/>, the
/// failure counter is incremented, and the remaining regexes are skipped.
/// Null input is returned as null.
/// </summary>
private string? RedactBody(string? value, IReadOnlyList<Regex> regexes)
{
    if (value is null)
    {
        return null;
    }

    var redacted = value;
    for (var index = 0; index < regexes.Count; index++)
    {
        var redactor = regexes[index];
        try
        {
            redacted = redactor.Replace(redacted, RedactedMarker);
        }
        catch (Exception fault)
        {
            // One faulting pattern poisons the whole field: hiding too much
            // is preferred over persisting a partially redacted value.
            _logger.LogWarning(
                fault,
                "Body redactor '{Pattern}' faulted; over-redacting field with '{Marker}'",
                redactor.ToString(), RedactorErrorMarker);
            try { _failureCounter.Increment(); } catch { /* swallow per §7 */ }
            return RedactorErrorMarker;
        }
    }

    return redacted;
}
/// <summary>
/// Finds the SQL-parameter redaction regex configured for a DbOutbound
/// target. Everything from the first <c>.</c> onward is dropped from
/// <paramref name="target"/> (the optional <c>.&lt;sql-snippet&gt;</c>
/// suffix) and the remainder is used as the key into the per-target
/// overrides. The configured pattern is prefixed with <c>(?i)</c> so it
/// always matches case-insensitively; the prefixed string is also the cache
/// key.
/// </summary>
/// <returns><c>true</c> when an override with a non-empty pattern exists and that pattern compiled.</returns>
private bool TryGetSqlParamRedactor(AuditLogOptions opts, string? target, out Regex? regex)
{
    regex = null;
    if (string.IsNullOrEmpty(target))
    {
        return false;
    }

    var separator = target.IndexOf('.');
    var connectionName = separator >= 0 ? target.Substring(0, separator) : target;

    if (!opts.PerTargetOverrides.TryGetValue(connectionName, out var overrides)
        || string.IsNullOrEmpty(overrides.RedactSqlParamsMatching))
    {
        return false;
    }

    // The inline option forces IgnoreCase even when the operator did not ask
    // for it, and keeps this entry distinct from the same pattern used as a
    // body redactor.
    return TryGetCompiledRegex($"(?i){overrides.RedactSqlParamsMatching}", out regex);
}
/// <summary>
/// Redacts SQL parameter values inside a
/// <c>{"sql":"...","parameters":{...}}</c> summary. Every entry of the
/// top-level <c>"parameters"</c> object whose NAME matches
/// <paramref name="paramNameRegex"/> has its value replaced with
/// <see cref="RedactedMarker"/>, and the document is re-serialised.
/// </summary>
/// <remarks>
/// The input is returned untouched when it does not start with <c>{</c>, is
/// not parseable JSON, has no top-level <c>"parameters"</c> object, or no
/// parameter name matched. Any other fault over-redacts the field with
/// <see cref="RedactorErrorMarker"/> and increments the failure counter.
/// </remarks>
private string? RedactSqlParameters(string? json, Regex paramNameRegex)
{
    if (json is null)
    {
        return null;
    }

    // Only text whose first non-blank character opens an object is parsed.
    var head = json.AsSpan().TrimStart();
    if (head.IsEmpty || head[0] != '{')
    {
        return json;
    }

    try
    {
        if (!TryParse(json, out var root)
            || root is not JsonObject document
            || document["parameters"] is not JsonObject parameters)
        {
            return json;
        }

        // Collect matching names first; the object cannot be mutated while
        // it is being enumerated. A regex fault here (e.g. a match timeout)
        // lands in the catch below, which over-redacts the field.
        var hits = new List<string>();
        foreach (var parameter in parameters)
        {
            if (paramNameRegex.IsMatch(parameter.Key))
            {
                hits.Add(parameter.Key);
            }
        }

        foreach (var name in hits)
        {
            parameters[name] = JsonValue.Create(RedactedMarker);
        }

        // Re-serialising normalises whitespace, so it is skipped when nothing
        // matched and the caller gets back the exact input string.
        return hits.Count > 0 ? document.ToJsonString(RedactedSummaryJsonOptions) : json;
    }
    catch (Exception fault)
    {
        _logger.LogWarning(
            fault,
            "SQL parameter redactor faulted; over-redacting field with '{Marker}'",
            RedactorErrorMarker);
        try { _failureCounter.Increment(); } catch { /* swallow per §7 */ }
        return RedactorErrorMarker;
    }

    // Parses the text, reporting malformed JSON as false instead of throwing.
    static bool TryParse(string text, out JsonNode? node)
    {
        try
        {
            node = JsonNode.Parse(text);
            return true;
        }
        catch (JsonException)
        {
            node = null;
            return false;
        }
    }
}
/// <summary>
/// Caps <paramref name="value"/> at <paramref name="cap"/> UTF-8 bytes via
/// <see cref="TruncateUtf8"/>. <paramref name="truncated"/> is raised to
/// <c>true</c> when the value was shortened and is never reset to
/// <c>false</c>. Null input is returned as null.
/// </summary>
private static string? TruncateField(string? value, int cap, ref bool truncated)
{
    if (value is null)
    {
        return null;
    }

    var capped = TruncateUtf8(value, cap);

    // TruncateUtf8 either returns the input or a shorter prefix, so a
    // differing char count is enough to detect that content was dropped.
    truncated |= capped.Length != value.Length;
    return capped;
}
/// <summary>
/// UTF-8 byte-safe truncation. When the encoded value exceeds
/// <paramref name="capBytes"/>, the cut position is walked back from the cap
/// until the byte at that position is NOT a continuation byte
/// (<c>byte &amp; 0xC0 == 0x80</c>), and only the prefix before it is decoded —
/// so the returned string never splits a multi-byte sequence. A zero or
/// negative cap keeps nothing and yields an empty string.
/// </summary>
private static string TruncateUtf8(string value, int capBytes)
{
    if (string.IsNullOrEmpty(value))
    {
        return value;
    }

    // A negative cap used to reach Encoding.GetString with a negative count
    // and throw; treat every non-positive cap as "keep nothing" instead.
    if (capBytes <= 0)
    {
        return string.Empty;
    }

    // Measure before encoding: values that already fit are returned without
    // allocating a byte array for them.
    if (Encoding.UTF8.GetByteCount(value) <= capBytes)
    {
        return value;
    }

    var bytes = Encoding.UTF8.GetBytes(value);

    // bytes.Length > capBytes here, so bytes[capBytes] is a valid index. If it
    // is a continuation byte the cap falls inside a sequence; back up to the
    // lead byte and cut in front of it.
    var boundary = capBytes;
    while (boundary > 0 && (bytes[boundary] & 0xC0) == 0x80)
    {
        boundary--;
    }

    return Encoding.UTF8.GetString(bytes, 0, boundary);
}
/// <summary>
/// Reports whether <paramref name="status"/> counts as an error row: every
/// status other than <c>Delivered</c>, <c>Submitted</c> and <c>Forwarded</c>.
/// </summary>
private static bool IsErrorStatus(AuditStatus status) =>
    status is not (AuditStatus.Delivered or AuditStatus.Submitted or AuditStatus.Forwarded);
/// <summary>
/// Regex-cache entry. Holds the working <see cref="Regex"/> for a pattern
/// that compiled, or nothing for the <see cref="Invalid"/> sentinel, which
/// stands for a pattern that failed to compile or exceeded the 100 ms compile
/// budget. Caching the sentinel avoids repeating the failing compile for
/// every event.
/// </summary>
private readonly struct CompiledRegex
{
    /// <summary>Initializes a new <see cref="CompiledRegex"/> wrapping the given compiled regex instance.</summary>
    /// <param name="regex">The pre-compiled regex, or <c>null</c> to represent an invalid pattern.</param>
    public CompiledRegex(Regex? regex)
    {
        Regex = regex;
    }

    /// <summary>Gets the compiled <see cref="System.Text.RegularExpressions.Regex"/>, or <c>null</c> when the pattern was invalid.</summary>
    public Regex? Regex { get; }

    /// <summary>Shared entry representing a pattern with no usable regex.</summary>
    public static readonly CompiledRegex Invalid = new(null);
}
}
@@ -1,31 +0,0 @@
using ZB.MOM.WW.ScadaBridge.Commons.Entities.Audit;
namespace ZB.MOM.WW.ScadaBridge.AuditLog.Payload;
/// <summary>
/// Filters an <see cref="AuditEvent"/> between construction and persistence —
/// truncates oversized payload fields, applies header/body/SQL-parameter
/// redaction, sets <see cref="AuditEvent.PayloadTruncated"/>.
/// </summary>
/// <remarks>
/// <para>
/// Pure function: returns a filtered COPY of the input via <c>with</c>
/// expressions; never throws (over-redacts on internal failure and increments
/// the <c>AuditRedactionFailure</c> health metric).
/// </para>
/// <para>
/// Wired in M5 between event construction and the writer chain
/// (<c>FallbackAuditWriter.WriteAsync</c>, <c>CentralAuditWriter.WriteAsync</c>,
/// and the <c>AuditLogIngestActor</c> handlers).
/// </para>
/// </remarks>
public interface IAuditPayloadFilter
{
/// <summary>
/// Apply the configured truncation + redaction policy to <paramref name="rawEvent"/>
/// and return a filtered copy. MUST NOT throw — on internal failure, over-redact
/// and surface the failure via the audit-redaction-failure health metric.
/// </summary>
/// <param name="rawEvent">The unfiltered audit event to process.</param>
/// <returns>A filtered copy of <paramref name="rawEvent"/>; implementations are expected to leave the input instance unmodified.</returns>
AuditEvent Apply(AuditEvent rawEvent);
}
@@ -1,9 +1,9 @@
namespace ZB.MOM.WW.ScadaBridge.AuditLog.Payload;
/// <summary>
/// Counter sink invoked by <see cref="DefaultAuditPayloadFilter"/> every time
/// a redactor (header / body regex / SQL parameter) throws and the filter has
/// to over-redact the offending field with the
/// Counter sink invoked by <see cref="ZB.MOM.WW.ScadaBridge.AuditLog.Redaction.ScadaBridgeAuditRedactor"/>
/// every time a redactor (header / body regex / SQL parameter) throws and the
/// redactor has to over-redact the offending field with the
/// <c>&lt;redacted: redactor error&gt;</c> marker. Bundle C bridges this into
/// the Site Health Monitoring report payload as <c>AuditRedactionFailure</c>.
/// </summary>
@@ -1,79 +0,0 @@
using System.Text.RegularExpressions;
using ZB.MOM.WW.ScadaBridge.Commons.Entities.Audit;
namespace ZB.MOM.WW.ScadaBridge.AuditLog.Payload;
/// <summary>
/// AuditLog-008: minimal always-safe fallback filter used by the writer chain
/// when no <see cref="IAuditPayloadFilter"/> is injected (test composition
/// roots, future composition roots that bypass <c>AddAuditLog</c>). Redacts
/// the always-sensitive HTTP headers (Authorization, X-Api-Key, Cookie,
/// Set-Cookie) in <see cref="AuditEvent.RequestSummary"/> and
/// <see cref="AuditEvent.ResponseSummary"/>, both when the summary is a JSON
/// object with a top-level <c>"headers"</c> object and when it carries
/// line-oriented <c>Name: value</c> headers. Does NOT perform body-regex
/// redaction, SQL-parameter redaction, or truncation — those stages need
/// <see cref="DefaultAuditPayloadFilter"/> with live options. The contract is:
/// over-redact safely, never throw for a non-null event, never miss a header
/// that's on the default sensitive list.
/// </summary>
public sealed class SafeDefaultAuditPayloadFilter : IAuditPayloadFilter
{
    /// <summary>Singleton instance — the filter holds no mutable state.</summary>
    public static SafeDefaultAuditPayloadFilter Instance { get; } = new SafeDefaultAuditPayloadFilter();

    /// <summary>Replacement text written in place of a sensitive header value.</summary>
    private const string HeaderRedactedMarker = "[REDACTED]";

    private static readonly string[] DefaultHeaderRedactList =
    {
        "Authorization",
        "X-Api-Key",
        "Cookie",
        "Set-Cookie",
    };

    /// <summary>
    /// Matches a line-oriented header: a name, a colon, and the rest of the
    /// line as the value. A JSON key does not match, because its closing
    /// quote sits between the name and the colon; JSON summaries are handled
    /// by <see cref="RedactJsonHeaders"/> instead.
    /// </summary>
    private static readonly Regex HeaderRegex = new(
        @"(?<name>[A-Za-z][A-Za-z0-9\-_]*)\s*:\s*(?<value>[^\r\n]*)",
        RegexOptions.Compiled | RegexOptions.IgnoreCase);

    private SafeDefaultAuditPayloadFilter() { }

    /// <inheritdoc />
    public AuditEvent Apply(AuditEvent rawEvent)
    {
        ArgumentNullException.ThrowIfNull(rawEvent);
        try
        {
            return rawEvent with
            {
                RequestSummary = RedactHeaders(rawEvent.RequestSummary),
                ResponseSummary = RedactHeaders(rawEvent.ResponseSummary),
            };
        }
        catch
        {
            // Over-redact: drop both summaries entirely so a malformed parse
            // path never leaks the original. The contract is "never throw."
            return rawEvent with
            {
                RequestSummary = "[redacted by SafeDefaultAuditPayloadFilter]",
                ResponseSummary = "[redacted by SafeDefaultAuditPayloadFilter]",
            };
        }
    }

    /// <summary>
    /// Redacts sensitive headers in one summary. The structured JSON pass runs
    /// first; the line-oriented regex pass then runs on its output, so
    /// summaries that are not JSON are handled exactly as they were before the
    /// JSON pass existed.
    /// </summary>
    private static string? RedactHeaders(string? summary)
    {
        if (string.IsNullOrEmpty(summary))
        {
            return summary;
        }

        var structured = RedactJsonHeaders(summary);
        return HeaderRegex.Replace(structured, m =>
        {
            var name = m.Groups["name"].Value;
            return IsSensitiveHeader(name) ? $"{name}: [REDACTED]" : m.Value;
        });
    }

    /// <summary>
    /// When <paramref name="summary"/> is a JSON object with a top-level
    /// <c>"headers"</c> object, replaces the value of every sensitive header
    /// in it and returns the re-serialised document. Returns the input
    /// unchanged when it is not such a document or no sensitive header is
    /// present. Faults other than malformed JSON propagate to the caller,
    /// which over-redacts both summaries.
    /// </summary>
    private static string RedactJsonHeaders(string summary)
    {
        // Cheap shape check so plain-text summaries never reach the parser.
        var head = summary.AsSpan().TrimStart();
        if (head.IsEmpty || head[0] != '{')
        {
            return summary;
        }

        System.Text.Json.Nodes.JsonNode? root;
        try
        {
            root = System.Text.Json.Nodes.JsonNode.Parse(summary);
        }
        catch (System.Text.Json.JsonException)
        {
            // Looks like JSON but is not; leave it to the regex pass.
            return summary;
        }

        if (root is not System.Text.Json.Nodes.JsonObject document
            || document["headers"] is not System.Text.Json.Nodes.JsonObject headers)
        {
            return summary;
        }

        // Collect first, mutate afterwards: the object cannot be changed
        // while it is being enumerated.
        var hits = new System.Collections.Generic.List<string>();
        foreach (var header in headers)
        {
            if (IsSensitiveHeader(header.Key))
            {
                hits.Add(header.Key);
            }
        }

        if (hits.Count == 0)
        {
            return summary;
        }

        foreach (var name in hits)
        {
            headers[name] = System.Text.Json.Nodes.JsonValue.Create(HeaderRedactedMarker);
        }

        return document.ToJsonString();
    }

    /// <summary>Case-insensitive membership test against the default sensitive-header list.</summary>
    private static bool IsSensitiveHeader(string name)
    {
        foreach (var sensitive in DefaultHeaderRedactList)
        {
            if (string.Equals(name, sensitive, StringComparison.OrdinalIgnoreCase))
            {
                return true;
            }
        }

        return false;
    }
}
@@ -0,0 +1,107 @@
using System.Text.RegularExpressions;
using ZB.MOM.WW.Audit;
using ZB.MOM.WW.ScadaBridge.AuditLog.Payload;
using ZB.MOM.WW.ScadaBridge.Commons.Types.Audit;
using static ZB.MOM.WW.ScadaBridge.AuditLog.Payload.AuditRedactionPrimitives;
namespace ZB.MOM.WW.ScadaBridge.AuditLog.Redaction;
/// <summary>
/// Minimal, always-safe <see cref="IAuditRedactor"/> used by composition roots
/// that bypass the full <see cref="ScadaBridgeAuditRedactor"/>. It performs
/// line-oriented HTTP header redaction for the always-sensitive defaults
/// (Authorization, X-Api-Key, Cookie, Set-Cookie) on the <c>RequestSummary</c> /
/// <c>ResponseSummary</c> fields carried inside
/// <c>ZB.MOM.WW.Audit.AuditEvent.DetailsJson</c>. It does NOT perform body-regex
/// redaction, SQL-parameter redaction, or truncation — those need
/// <see cref="ScadaBridgeAuditRedactor"/> with live options. Contract:
/// over-redact safely, never throw, never miss a header on the default
/// sensitive list.
/// </summary>
public sealed class SafeDefaultAuditRedactor : IAuditRedactor
{
    /// <summary>Singleton instance — the redactor is stateless and side-effect-free.</summary>
    public static SafeDefaultAuditRedactor Instance { get; } = new SafeDefaultAuditRedactor();

    /// <summary>Header names whose values are always masked (compared case-insensitively).</summary>
    private static readonly string[] DefaultHeaderRedactList =
    {
        "Authorization",
        "X-Api-Key",
        "Cookie",
        "Set-Cookie",
    };

    /// <summary>Matches a <c>name: value</c> pair; the value runs to the end of the line.</summary>
    private static readonly Regex HeaderRegex = new(
        @"(?<name>[A-Za-z][A-Za-z0-9\-_]*)\s*:\s*(?<value>[^\r\n]*)",
        RegexOptions.Compiled | RegexOptions.IgnoreCase);

    private SafeDefaultAuditRedactor() { }

    /// <summary>
    /// Masks the default sensitive headers (<c>Authorization</c>, <c>X-Api-Key</c>,
    /// <c>Cookie</c>, <c>Set-Cookie</c>) inside the <c>RequestSummary</c> and
    /// <c>ResponseSummary</c> of <paramref name="rawEvent"/>.<c>DetailsJson</c>.
    /// Any failure while decoding, scrubbing or re-encoding the details results in
    /// an over-redacted details payload instead of an exception.
    /// </summary>
    /// <param name="rawEvent">The audit event whose details JSON is to be redacted.</param>
    /// <returns>
    /// The same instance when there is no details JSON; otherwise a copy whose
    /// details carry the redacted summaries, or the over-redacted sentinel on failure.
    /// </returns>
    public AuditEvent Apply(AuditEvent rawEvent)
    {
        ArgumentNullException.ThrowIfNull(rawEvent);

        // Nothing to scrub when the event carries no details payload.
        if (string.IsNullOrEmpty(rawEvent.DetailsJson))
        {
            return rawEvent;
        }

        try
        {
            var details = AuditDetailsCodec.Deserialize(rawEvent.DetailsJson);
            var cleanRequest = RedactHeaders(details.RequestSummary);
            var cleanResponse = RedactHeaders(details.ResponseSummary);
            var cleaned = details with
            {
                RequestSummary = cleanRequest,
                ResponseSummary = cleanResponse,
            };

            return rawEvent with { DetailsJson = AuditDetailsCodec.Serialize(cleaned) };
        }
        catch
        {
            // Over-redact: blank out EVERY sensitive free-text field so a failure on
            // any internal path never leaks the original. Uses the shared
            // OverRedactedEventMarker so all redactor safety-nets emit the same
            // sentinel string.
            var fallback = new AuditDetails
            {
                RequestSummary = OverRedactedEventMarker,
                ResponseSummary = OverRedactedEventMarker,
                ErrorDetail = OverRedactedEventMarker,
                ErrorMessage = OverRedactedEventMarker,
                Extra = OverRedactedEventMarker,
                PayloadTruncated = true,
            };

            return rawEvent with { DetailsJson = AuditDetailsCodec.Serialize(fallback) };
        }
    }

    /// <summary>
    /// Masks the value of every default-sensitive header found in
    /// <paramref name="summary"/>; null or empty input is returned as-is.
    /// </summary>
    private static string? RedactHeaders(string? summary)
    {
        if (string.IsNullOrEmpty(summary))
        {
            return summary;
        }

        return HeaderRegex.Replace(summary, MaskIfSensitive);
    }

    /// <summary>
    /// Match evaluator: rewrites a sensitive header to <c>name: marker</c> using the
    /// shared RedactedMarker (so line-format and JSON-format header redaction emit
    /// the same sentinel) and leaves every other match untouched.
    /// </summary>
    private static string MaskIfSensitive(Match match)
    {
        var headerName = match.Groups["name"].Value;
        var isSensitive = Array.Exists(
            DefaultHeaderRedactList,
            candidate => string.Equals(headerName, candidate, StringComparison.OrdinalIgnoreCase));

        return isSensitive ? $"{headerName}: {RedactedMarker}" : match.Value;
    }
}
@@ -0,0 +1,354 @@
using System.Text;
using System.Text.RegularExpressions;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
using ZB.MOM.WW.Audit;
using ZB.MOM.WW.ScadaBridge.AuditLog.Configuration;
using ZB.MOM.WW.ScadaBridge.AuditLog.Payload;
using ZB.MOM.WW.ScadaBridge.Commons.Types.Audit;
using ZB.MOM.WW.ScadaBridge.Commons.Types.Enums;
namespace ZB.MOM.WW.ScadaBridge.AuditLog.Redaction;
/// <summary>
/// Canonical <see cref="IAuditRedactor"/> implementation for ScadaBridge —
/// operates on <c>ZB.MOM.WW.Audit.AuditEvent</c> and its <see cref="AuditEvent.DetailsJson"/>
/// payload bag. The ScadaBridge request/response/error/extra summaries travel
/// inside <c>DetailsJson</c> as a <see cref="AuditDetails"/> record (serialized
/// by <see cref="AuditDetailsCodec"/>); this redactor deserializes them, applies
/// the header → body-regex → SQL-parameter → byte-safe truncation pipeline,
/// re-serializes, and returns a filtered COPY.
/// </summary>
/// <remarks>
/// <para>
/// Cap selection is faithful to the original pipeline, translated onto canonical
/// fields:
/// <list type="bullet">
/// <item>The <c>ApiInbound</c> branch keys on <see cref="AuditEvent.Category"/>
/// (= <c>AuditChannel.ToString()</c> per <see cref="AuditFieldBuilders.BuildCategory"/>)
/// → <see cref="AuditLogOptions.InboundMaxBytes"/>.</item>
/// <item>The "error row" branch reproduces the legacy
/// <c>IsErrorStatus(Status)</c> rule — Status NOT IN (<c>Delivered</c>,
/// <c>Submitted</c>, <c>Forwarded</c>) → <see cref="AuditLogOptions.ErrorCapBytes"/>.
/// The fine-grained status is read from <see cref="AuditDetails.Status"/>
/// when present (it must be — <see cref="AuditOutcome"/> alone cannot
/// reproduce <c>IsErrorStatus</c>, since <c>Attempted</c>/<c>Skipped</c>
/// project to <see cref="AuditOutcome.Success"/> yet take the error cap).
/// When <see cref="AuditDetails.Status"/> is absent, unparseable, or not a
/// defined <see cref="AuditStatus"/> member, the canonical
/// <see cref="AuditEvent.Outcome"/> is the fallback: any outcome other than
/// <see cref="AuditOutcome.Success"/> → error cap.</item>
/// </list>
/// </para>
/// <para>
/// MUST NOT throw for a non-null event — the pipeline is wrapped in try/catch and
/// over-redacts (drops ALL sensitive free-text fields to a safe marker) on any
/// internal failure, mirroring <see cref="SafeDefaultAuditRedactor"/>. A null
/// event is a caller bug and is rejected with <see cref="ArgumentNullException"/>.
/// </para>
/// </remarks>
public sealed class ScadaBridgeAuditRedactor : IAuditRedactor
{
    private const string OverRedactedMarker = AuditRedactionPrimitives.OverRedactedEventMarker;

    private readonly IOptionsMonitor<AuditLogOptions> _options;
    private readonly ILogger<ScadaBridgeAuditRedactor> _logger;
    private readonly IAuditRedactionFailureCounter _failureCounter;
    private readonly AuditRegexCache _regexCache;

    /// <summary>
    /// Primary constructor used by DI — pulls the optional redaction-failure
    /// counter from the container; a NoOp default is used when none is supplied.
    /// </summary>
    /// <param name="options">Live-reloadable audit log options.</param>
    /// <param name="logger">Logger for redaction diagnostics.</param>
    /// <param name="failureCounter">Optional counter incremented when a redaction operation fails; defaults to a no-op.</param>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="options"/> or <paramref name="logger"/> is null.
    /// </exception>
    public ScadaBridgeAuditRedactor(
        IOptionsMonitor<AuditLogOptions> options,
        ILogger<ScadaBridgeAuditRedactor> logger,
        IAuditRedactionFailureCounter? failureCounter = null)
    {
        _options = options ?? throw new ArgumentNullException(nameof(options));
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
        _failureCounter = failureCounter ?? new NoOpAuditRedactionFailureCounter();
        _regexCache = new AuditRegexCache(_logger);
    }

    /// <summary>
    /// Applies the full redaction pipeline to <paramref name="rawEvent"/> and returns a
    /// filtered copy; returns the same instance unchanged on the fast path. Never
    /// throws for a non-null event — any internal failure yields an over-redacted copy.
    /// </summary>
    /// <param name="rawEvent">The raw audit event to redact.</param>
    /// <returns>A redacted copy of <paramref name="rawEvent"/>, or the original instance when no changes are needed.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="rawEvent"/> is null.</exception>
    public AuditEvent Apply(AuditEvent rawEvent)
    {
        // Guard OUTSIDE the try: a null event cannot be over-redacted (there is
        // nothing to copy), so letting it reach the catch would log a spurious
        // redactor failure, bump the failure counter, and then escape as a
        // NullReferenceException from OverRedact anyway. Fail fast with the
        // conventional exception instead, matching SafeDefaultAuditRedactor.
        ArgumentNullException.ThrowIfNull(rawEvent);

        try
        {
            var opts = _options.CurrentValue;

            // --- Fast path -------------------------------------------------
            // Mirror the legacy filter's non-JSON pre-check: when there is no
            // DetailsJson payload to scrub AND the Target is within the cap,
            // there is nothing to redact or truncate. Return the input
            // unchanged so the common case stays cheap (no Deserialize, no
            // re-Serialize, same instance back).
            var detailsEmpty = string.IsNullOrEmpty(rawEvent.DetailsJson);
            var targetWithinCap = rawEvent.Target is null
                || Encoding.UTF8.GetByteCount(rawEvent.Target) <= opts.DefaultCapBytes;
            if (detailsEmpty && targetWithinCap)
            {
                return rawEvent;
            }

            // --- Slow path -------------------------------------------------
            var d = AuditDetailsCodec.Deserialize(rawEvent.DetailsJson);

            // Cap selection. Channel = canonical Category (the ApiInbound
            // branch); error-cap selection reproduces the legacy
            // IsErrorStatus(Status) — read from d.Status when present, else
            // fall back to the canonical Outcome.
            var cap = SelectCap(opts, rawEvent.Category, d.Status, rawEvent.Outcome);

            // --- Header-redaction stage (runs BEFORE truncation) ----------
            var request = RedactHeaders(d.RequestSummary, opts.HeaderRedactList);
            var response = RedactHeaders(d.ResponseSummary, opts.HeaderRedactList);
            var errorDetail = d.ErrorDetail;
            var extra = d.Extra;

            // --- Body-regex stage (also runs BEFORE truncation) -----------
            // Per-target additions key on the canonical Target.
            var bodyRegexes = ResolveBodyRegexes(opts, rawEvent.Target);
            if (bodyRegexes.Count > 0)
            {
                request = RedactBody(request, bodyRegexes);
                response = RedactBody(response, bodyRegexes);
                errorDetail = RedactBody(errorDetail, bodyRegexes);
                extra = RedactBody(extra, bodyRegexes);
            }

            // --- SQL parameter redaction stage (DbOutbound only) ----------
            // Channel-guarded on the canonical Category; connection key is the
            // Target prefix before the first '.'.
            if (string.Equals(rawEvent.Category, nameof(AuditChannel.DbOutbound), StringComparison.Ordinal)
                && TryGetSqlParamRedactor(opts, rawEvent.Target, out var sqlParamRegex))
            {
                request = RedactSqlParameters(request, sqlParamRegex!);
            }

            // --- Truncation stage -----------------------------------------
            var truncated = false;
            request = TruncateField(request, cap, ref truncated);
            response = TruncateField(response, cap, ref truncated);
            errorDetail = TruncateField(errorDetail, cap, ref truncated);
            extra = TruncateField(extra, cap, ref truncated);

            var rewritten = d with
            {
                RequestSummary = request,
                ResponseSummary = response,
                ErrorDetail = errorDetail,
                Extra = extra,
                PayloadTruncated = d.PayloadTruncated || truncated,
            };

            // Target length cap (canonical top-level field). Cap at the default
            // byte ceiling so an absurd Target cannot blow the storage column.
            var cappedTarget = TruncateTarget(rawEvent.Target, opts.DefaultCapBytes);

            return rawEvent with
            {
                DetailsJson = AuditDetailsCodec.Serialize(rewritten),
                Target = cappedTarget,
            };
        }
        catch (Exception ex)
        {
            // Audit is best-effort: over-redact rather than fail the caller.
            // Drop the summaries entirely (mirroring SafeDefault's catch path)
            // and flag PayloadTruncated so downstream readers know the row was
            // scrubbed defensively.
            _logger.LogWarning(
                ex,
                "Canonical audit redactor failed; over-redacting DetailsJson and flagging PayloadTruncated");
            IncrementFailureCounter();
            return OverRedact(rawEvent);
        }
    }

    /// <summary>
    /// Pick the truncation cap. <paramref name="category"/> = canonical Category
    /// (= channel name): <c>ApiInbound</c> → <see cref="AuditLogOptions.InboundMaxBytes"/>.
    /// Otherwise the legacy <c>IsErrorStatus</c> rule decides between the error
    /// and default caps, preferring the fine-grained <paramref name="detailsStatus"/>
    /// (from <c>DetailsJson</c>) and falling back to the canonical
    /// <paramref name="outcome"/> when status is absent/unparseable.
    /// </summary>
    private static int SelectCap(
        AuditLogOptions opts,
        string? category,
        string? detailsStatus,
        AuditOutcome outcome)
    {
        if (string.Equals(category, nameof(AuditChannel.ApiInbound), StringComparison.Ordinal))
        {
            return opts.InboundMaxBytes;
        }

        return IsErrorRow(detailsStatus, outcome) ? opts.ErrorCapBytes : opts.DefaultCapBytes;
    }

    /// <summary>
    /// Reproduce the legacy <c>IsErrorStatus(Status)</c> error-cap predicate on
    /// the canonical record: Status NOT IN (<c>Delivered</c>, <c>Submitted</c>,
    /// <c>Forwarded</c>) → error row. When the fine-grained status is present in
    /// <c>DetailsJson</c> and names a defined <see cref="AuditStatus"/> member it is
    /// authoritative; otherwise the canonical <see cref="AuditOutcome"/> is the
    /// fallback (anything other than <see cref="AuditOutcome.Success"/> → error row).
    /// </summary>
    private static bool IsErrorRow(string? detailsStatus, AuditOutcome outcome)
    {
        // Enum.TryParse also succeeds for numeric strings ("42") and comma lists,
        // producing values that are not real statuses. Require a defined member so
        // such input counts as "unparseable" and takes the Outcome fallback instead
        // of being forced onto the error cap.
        if (!string.IsNullOrEmpty(detailsStatus)
            && Enum.TryParse<AuditStatus>(detailsStatus, ignoreCase: false, out var status)
            && Enum.IsDefined(status))
        {
            return status switch
            {
                AuditStatus.Delivered or AuditStatus.Submitted or AuditStatus.Forwarded => false,
                _ => true,
            };
        }

        // No usable status — fall back to the canonical outcome.
        return outcome != AuditOutcome.Success;
    }

    // Thin forwarders to the shared redaction primitives; they bind this
    // instance's logger and failure-counter callback.
    private string? RedactHeaders(string? json, IList<string> redactList)
        => AuditRedactionPrimitives.RedactHeaders(json, redactList, _logger, IncrementFailureCounter);

    private string? RedactBody(string? value, IReadOnlyList<Regex> regexes)
        => AuditRedactionPrimitives.RedactBody(value, regexes, _logger, IncrementFailureCounter);

    private string? RedactSqlParameters(string? json, Regex paramNameRegex)
        => AuditRedactionPrimitives.RedactSqlParameters(json, paramNameRegex, _logger, IncrementFailureCounter);

    private static string? TruncateField(string? value, int cap, ref bool truncated)
        => AuditRedactionPrimitives.TruncateField(value, cap, ref truncated);

    private static string? TruncateTarget(string? target, int cap)
        => target is null ? null : AuditRedactionPrimitives.TruncateUtf8(target, cap);

    /// <summary>
    /// Combine the global and per-target body-redactor lists, returning the
    /// compiled-regex set to apply. Patterns that failed compilation are
    /// silently skipped.
    /// </summary>
    private IReadOnlyList<Regex> ResolveBodyRegexes(AuditLogOptions opts, string? target)
    {
        var hasGlobal = opts.GlobalBodyRedactors is { Count: > 0 };
        var perTargetAdditions = (target != null
                && opts.PerTargetOverrides.TryGetValue(target, out var over)
                && over.AdditionalBodyRedactors is { Count: > 0 })
            ? over.AdditionalBodyRedactors
            : null;

        if (!hasGlobal && perTargetAdditions == null)
        {
            return Array.Empty<Regex>();
        }

        var result = new List<Regex>();
        if (hasGlobal)
        {
            foreach (var pattern in opts.GlobalBodyRedactors)
            {
                if (_regexCache.TryGet(pattern, out var rx))
                {
                    result.Add(rx!);
                }
            }
        }

        if (perTargetAdditions != null)
        {
            foreach (var pattern in perTargetAdditions)
            {
                if (_regexCache.TryGet(pattern, out var rx))
                {
                    result.Add(rx!);
                }
            }
        }

        return result;
    }

    /// <summary>
    /// Resolve the per-connection SQL parameter redaction regex for the given
    /// target. Connection key = everything before the first <c>.</c> in
    /// <paramref name="target"/>. Patterns are forced case-insensitive.
    /// </summary>
    private bool TryGetSqlParamRedactor(AuditLogOptions opts, string? target, out Regex? regex)
    {
        regex = null;
        if (string.IsNullOrEmpty(target))
        {
            return false;
        }

        var dot = target.IndexOf('.');
        var connectionKey = dot < 0 ? target : target[..dot];
        if (!opts.PerTargetOverrides.TryGetValue(connectionKey, out var over)
            || string.IsNullOrEmpty(over.RedactSqlParamsMatching))
        {
            return false;
        }

        // The inline (?i) prefix is what forces case-insensitive matching; it is
        // part of the key handed to the regex cache.
        var cacheKey = "(?i)" + over.RedactSqlParamsMatching;
        return _regexCache.TryGet(cacheKey, out regex);
    }

    /// <summary>
    /// Over-redaction copy returned from the never-throws catch: suppress ALL
    /// potentially-sensitive string fields inside <c>DetailsJson</c> to a safe
    /// marker and flag <see cref="AuditDetails.PayloadTruncated"/>. "All sensitive
    /// fields" = <c>RequestSummary</c>, <c>ResponseSummary</c>, <c>ErrorDetail</c>,
    /// <c>ErrorMessage</c>, and <c>Extra</c> — all body-regex redaction targets
    /// that can carry sensitive values. Best-effort re-serialise of the existing
    /// details; if even that fails, the event is returned with a freshly built
    /// minimal details bag that carries only the markers.
    /// </summary>
    private static AuditEvent OverRedact(AuditEvent rawEvent)
    {
        try
        {
            var d = AuditDetailsCodec.Deserialize(rawEvent.DetailsJson) with
            {
                RequestSummary = OverRedactedMarker,
                ResponseSummary = OverRedactedMarker,
                ErrorDetail = OverRedactedMarker,
                ErrorMessage = OverRedactedMarker,
                Extra = OverRedactedMarker,
                PayloadTruncated = true,
            };
            return rawEvent with { DetailsJson = AuditDetailsCodec.Serialize(d) };
        }
        catch
        {
            // The existing details could not be round-tripped: discard them
            // entirely rather than risk leaking any of the original payload.
            var safe = new AuditDetails
            {
                RequestSummary = OverRedactedMarker,
                ResponseSummary = OverRedactedMarker,
                ErrorDetail = OverRedactedMarker,
                ErrorMessage = OverRedactedMarker,
                Extra = OverRedactedMarker,
                PayloadTruncated = true,
            };
            return rawEvent with { DetailsJson = AuditDetailsCodec.Serialize(safe) };
        }
    }

    /// <summary>
    /// Bumps the injected redaction-failure counter, swallowing any fault per
    /// alog.md §7. Passed as the <c>onFailure</c> callback to the shared
    /// primitives and called from the top-level catch.
    /// </summary>
    private void IncrementFailureCounter()
    {
        try { _failureCounter.Increment(); } catch { /* swallow per §7 */ }
    }
}
@@ -3,13 +3,16 @@ using Microsoft.Extensions.DependencyInjection;
using Microsoft.Extensions.DependencyInjection.Extensions;
using Microsoft.Extensions.Hosting;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
using ZB.MOM.WW.Audit;
using ZB.MOM.WW.Configuration;
using ZB.MOM.WW.ScadaBridge.AuditLog.Central;
using ZB.MOM.WW.ScadaBridge.AuditLog.Configuration;
using ZB.MOM.WW.ScadaBridge.AuditLog.Payload;
using ZB.MOM.WW.ScadaBridge.AuditLog.Redaction;
using ZB.MOM.WW.ScadaBridge.AuditLog.Site;
using ZB.MOM.WW.ScadaBridge.AuditLog.Site.Telemetry;
using ZB.MOM.WW.ScadaBridge.Commons.Interfaces.Services;
using IAuditWriter = ZB.MOM.WW.ScadaBridge.Commons.Interfaces.Services.IAuditWriter;
namespace ZB.MOM.WW.ScadaBridge.AuditLog;
@@ -62,19 +65,19 @@ public static class ServiceCollectionExtensions
ArgumentNullException.ThrowIfNull(config);
// M1: top-level AuditLogOptions + validator (redaction policy, payload caps, etc.).
services.AddOptions<AuditLogOptions>()
.Bind(config.GetSection(ConfigSectionName))
.ValidateOnStart();
services.AddSingleton<IValidateOptions<AuditLogOptions>, AuditLogOptionsValidator>();
// Collapsed onto the shared ZB.MOM.WW.Configuration helper: it binds the
// "AuditLog" section, registers the validator, and enables ValidateOnStart in
// one call. Same section path as before; AddAuditLog is call-once per
// collection, and the helper's TryAddEnumerable is idempotent for the
// validator (a strict improvement over the previous AddSingleton).
services.AddValidatedOptions<AuditLogOptions, AuditLogOptionsValidator>(config, ConfigSectionName);
// M5 Bundle A: payload filter — truncates oversized RequestSummary /
// ResponseSummary / ErrorDetail / Extra fields between event
// construction and persistence. Bundle B layers header / body /
// SQL-parameter redaction onto the same singleton; Bundle C wires it
// into the FallbackAuditWriter / CentralAuditWriter / IngestActor
// paths. Singleton — the filter is stateless and the IOptionsMonitor
// dependency picks up M5-T8 hot reloads on its own.
services.AddSingleton<IAuditPayloadFilter, DefaultAuditPayloadFilter>();
// C3 (Task 2.5): the canonical IAuditRedactor is wired as
// ScadaBridgeAuditRedactor — same truncation + header / body /
// SQL-parameter redaction as the original pipeline, applied between
// event construction and persistence. Singleton — stateless; the
// IOptionsMonitor dependency picks up hot reloads on its own.
services.AddSingleton<IAuditRedactor, ScadaBridgeAuditRedactor>();
// M5 Bundle B: per-stage redactor-failure counter. NoOp default;
// Bundle C replaces this binding with the Site Health Monitoring
@@ -113,7 +116,7 @@ public static class ServiceCollectionExtensions
// The script-thread surface is FallbackAuditWriter (primary + ring +
// counter), not the raw SqliteAuditWriter — primary failures must NEVER
// abort the user-facing action.
// Bundle C (M5-T6): the IAuditPayloadFilter singleton above is wired
// C3 (Task 2.5): the canonical IAuditRedactor singleton above is wired
// through the factory so every event written through this surface is
// truncated + redacted before it hits SQLite (and the ring on
// failure).
@@ -122,7 +125,7 @@ public static class ServiceCollectionExtensions
ring: sp.GetRequiredService<RingBufferFallback>(),
failureCounter: sp.GetRequiredService<IAuditWriteFailureCounter>(),
logger: sp.GetRequiredService<ILogger<FallbackAuditWriter>>(),
filter: sp.GetRequiredService<IAuditPayloadFilter>()));
redactor: sp.GetRequiredService<IAuditRedactor>()));
// ISiteStreamAuditClient: NoOp default. This binding remains correct for
// central/test composition roots that have no SiteCommunicationActor.
@@ -200,7 +203,7 @@ public static class ServiceCollectionExtensions
// is intentionally distinct from IAuditWriter so site composition roots
// do not accidentally bind it; central composition roots that include
// AddConfigurationDatabase get a working implementation transparently.
// Bundle C (M5-T6): wire the IAuditPayloadFilter into the factory so
// C3 (Task 2.5): wire the canonical IAuditRedactor into the factory so
// NotificationOutboxActor + Inbound API rows are truncated + redacted
// before they hit MS SQL.
// M6 Bundle E (T8): also wire the ICentralAuditWriteFailureCounter
@@ -208,7 +211,7 @@ public static class ServiceCollectionExtensions
services.AddSingleton<ICentralAuditWriter>(sp => new CentralAuditWriter(
sp,
sp.GetRequiredService<ILogger<CentralAuditWriter>>(),
sp.GetRequiredService<IAuditPayloadFilter>(),
sp.GetRequiredService<IAuditRedactor>(),
sp.GetRequiredService<ICentralAuditWriteFailureCounter>(),
// SourceNode-stamping (Task 12): wire the local node identity so
// central-origin rows (Notification Outbox dispatch, Inbound API)
@@ -228,7 +231,7 @@ public static class ServiceCollectionExtensions
/// real <see cref="HealthMetricsAuditWriteFailureCounter"/> /
/// <see cref="HealthMetricsAuditRedactionFailureCounter"/> bridges so the
/// FallbackAuditWriter primary-failure counter AND the
/// DefaultAuditPayloadFilter redactor-failure counter both surface in the
/// <see cref="ScadaBridgeAuditRedactor"/> redactor-failure counter both surface in the
/// site health report payload as
/// <c>SiteHealthReport.SiteAuditWriteFailures</c> +
/// <c>SiteHealthReport.AuditRedactionFailure</c>.
@@ -1,7 +1,8 @@
using Microsoft.Extensions.Logging;
using ZB.MOM.WW.ScadaBridge.AuditLog.Payload;
using ZB.MOM.WW.ScadaBridge.Commons.Entities.Audit;
using ZB.MOM.WW.Audit;
using ZB.MOM.WW.ScadaBridge.AuditLog.Redaction;
using ZB.MOM.WW.ScadaBridge.Commons.Interfaces.Services;
using IAuditWriter = ZB.MOM.WW.ScadaBridge.Commons.Interfaces.Services.IAuditWriter;
namespace ZB.MOM.WW.ScadaBridge.AuditLog.Site;
@@ -31,43 +32,45 @@ public sealed class FallbackAuditWriter : IAuditWriter
private readonly RingBufferFallback _ring;
private readonly IAuditWriteFailureCounter _failureCounter;
private readonly ILogger<FallbackAuditWriter> _logger;
private readonly IAuditPayloadFilter _filter;
private readonly IAuditRedactor _redactor;
private readonly SemaphoreSlim _drainGate = new(1, 1);
/// <summary>
/// Bundle C (M5-T6) wires the singleton <see cref="IAuditPayloadFilter"/>
/// Bundle C (M5-T6) wires the singleton <see cref="IAuditRedactor"/>
/// here so every event written via the site hot path is truncated +
/// header/body/SQL-param redacted before it hits both the primary SQLite
/// writer AND the ring fallback. The parameter is optional (defaults to
/// no filtering) so the long tail of test composition roots that don't
/// care about the filter need no change — the production
/// the always-safe <see cref="SafeDefaultAuditRedactor"/>) so the long
/// tail of test composition roots that don't care about the redactor need
/// no change — the production
/// <see cref="ServiceCollectionExtensions.AddAuditLog"/> registration
/// always passes the real filter through.
/// always passes the real redactor through.
/// </summary>
/// <param name="primary">The primary audit writer (typically the SQLite writer).</param>
/// <param name="ring">Drop-oldest ring buffer used to stash events when the primary fails.</param>
/// <param name="failureCounter">Counter incremented on each primary failure for health reporting.</param>
/// <param name="logger">Logger for diagnostics.</param>
/// <param name="filter">Optional payload filter applied before writing; null means no filtering.</param>
/// <param name="redactor">Optional canonical redactor applied before writing; null means the always-safe default.</param>
public FallbackAuditWriter(
IAuditWriter primary,
RingBufferFallback ring,
IAuditWriteFailureCounter failureCounter,
ILogger<FallbackAuditWriter> logger,
IAuditPayloadFilter? filter = null)
IAuditRedactor? redactor = null)
{
_primary = primary ?? throw new ArgumentNullException(nameof(primary));
_ring = ring ?? throw new ArgumentNullException(nameof(ring));
_failureCounter = failureCounter ?? throw new ArgumentNullException(nameof(failureCounter));
_logger = logger ?? throw new ArgumentNullException(nameof(logger));
// AuditLog-008: never default to a null filter — over-redact instead.
// SafeDefaultAuditPayloadFilter.Instance performs HTTP header
// redaction with the hard-coded sensitive defaults (Authorization,
// X-Api-Key, Cookie, Set-Cookie) so a test composition root that
// doesn't bind the real options never persists those headers
// verbatim. The real DefaultAuditPayloadFilter (truncation + body /
// AuditLog-008: never default to a null redactor — over-redact instead.
// C3 (Task 2.5): wired via the canonical IAuditRedactor seam.
// SafeDefaultAuditRedactor performs HTTP header redaction with the
// hard-coded sensitive defaults (Authorization, X-Api-Key, Cookie,
// Set-Cookie) on the DetailsJson summaries so a test composition root
// that doesn't bind the real options never persists those headers
// verbatim. The full ScadaBridgeAuditRedactor (truncation + body /
// SQL-param redaction) is wired by AddAuditLog and takes precedence.
_filter = filter ?? Payload.SafeDefaultAuditPayloadFilter.Instance;
_redactor = redactor ?? SafeDefaultAuditRedactor.Instance;
}
/// <inheritdoc />
@@ -75,14 +78,14 @@ public sealed class FallbackAuditWriter : IAuditWriter
{
ArgumentNullException.ThrowIfNull(evt);
// Filter once, up-front. The filtered event flows BOTH to the primary
// Redact once, up-front. The redacted event flows BOTH to the primary
// and (on failure) to the ring buffer — so a primary outage that
// drains later still hands the SqliteAuditWriter a row that has
// already been truncated and redacted. The filter contract is
// "MUST NOT throw". AuditLog-008: _filter is now non-null (defaults
// to SafeDefaultAuditPayloadFilter so header redaction is always
// applied even in composition roots that don't wire the real filter).
var filtered = _filter.Apply(evt);
// already been truncated and redacted. The redactor contract is
// "MUST NOT throw". AuditLog-008: _redactor is now non-null (defaults
// to SafeDefaultAuditRedactor so header redaction is always applied
// even in composition roots that don't wire the real redactor).
var filtered = _redactor.Apply(evt);
try
{
@@ -6,10 +6,10 @@ namespace ZB.MOM.WW.ScadaBridge.AuditLog.Site;
/// <summary>
/// Audit Log (#23) M5 Bundle C — bridges
/// <see cref="IAuditRedactionFailureCounter"/> (incremented by
/// <see cref="DefaultAuditPayloadFilter"/> every time a header / body / SQL
/// parameter redactor stage throws and the filter has to over-redact the
/// offending field) into <see cref="ISiteHealthCollector"/> so the count
/// surfaces in the site health report payload as
/// <see cref="ZB.MOM.WW.ScadaBridge.AuditLog.Redaction.ScadaBridgeAuditRedactor"/> every time
/// a header / body / SQL parameter redactor stage throws and the redactor has
/// to over-redact the offending field) into <see cref="ISiteHealthCollector"/>
/// so the count surfaces in the site health report payload as
/// <c>SiteHealthReport.AuditRedactionFailure</c>.
/// </summary>
/// <remarks>
@@ -1,6 +1,6 @@
using System.Runtime.CompilerServices;
using System.Threading.Channels;
using ZB.MOM.WW.ScadaBridge.Commons.Entities.Audit;
using ZB.MOM.WW.Audit;
namespace ZB.MOM.WW.ScadaBridge.AuditLog.Site;
@@ -96,6 +96,7 @@ public sealed class RingBufferFallback
/// must call <see cref="Complete"/> first.
/// </summary>
/// <param name="cancellationToken">Cancellation token to abort the async enumeration.</param>
/// <returns>An async sequence of buffered <see cref="AuditEvent"/> values in FIFO order.</returns>
public async IAsyncEnumerable<AuditEvent> DrainAsync(
[EnumeratorCancellation] CancellationToken cancellationToken)
{
@@ -69,7 +69,9 @@ public sealed class SiteAuditBacklogReporter : IHostedService, IDisposable
_refreshInterval = refreshInterval ?? DefaultRefreshInterval;
}
/// <inheritdoc />
/// <summary>Starts the background polling loop, running an immediate first probe before entering the timed cycle.</summary>
/// <param name="ct">Cancellation token signalling host shutdown.</param>
/// <returns>A task that represents the asynchronous operation.</returns>
public Task StartAsync(CancellationToken ct)
{
// Linked CTS lets StopAsync's cancellation AND the host's shutdown
@@ -123,14 +125,16 @@ public sealed class SiteAuditBacklogReporter : IHostedService, IDisposable
}
}
/// <inheritdoc />
/// <summary>Signals the polling loop to stop and waits for it to complete.</summary>
/// <param name="ct">Cancellation token (not used; the internal CTS governs shutdown).</param>
/// <returns>A task that represents the asynchronous operation.</returns>
public Task StopAsync(CancellationToken ct)
{
    // Request cancellation through the internal token source, if one exists.
    var cts = _cts;
    if (cts is not null)
    {
        cts.Cancel();
    }

    // Hand back the loop task so the caller can await its completion; when
    // _loop is null there is nothing to wait for.
    return _loop is { } running ? running : Task.CompletedTask;
}
/// <inheritdoc />
/// <summary>Releases the internal <see cref="CancellationTokenSource"/> used to stop the polling loop.</summary>
public void Dispose()
{
_cts?.Dispose();
@@ -2,10 +2,12 @@ using System.Threading.Channels;
using Microsoft.Data.Sqlite;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
using ZB.MOM.WW.ScadaBridge.Commons.Entities.Audit;
using ZB.MOM.WW.ScadaBridge.Commons.Interfaces.Services;
using ZB.MOM.WW.ScadaBridge.Commons.Types;
using ZB.MOM.WW.ScadaBridge.Commons.Types.Audit;
using ZB.MOM.WW.ScadaBridge.Commons.Types.Enums;
using AuditEvent = ZB.MOM.WW.Audit.AuditEvent;
using AuditOutcome = ZB.MOM.WW.Audit.AuditOutcome;
namespace ZB.MOM.WW.ScadaBridge.AuditLog.Site;
@@ -18,15 +20,27 @@ namespace ZB.MOM.WW.ScadaBridge.AuditLog.Site;
/// </summary>
/// <remarks>
/// <para>
/// The schema is bootstrapped in the constructor (Bundle B-T1). The
/// Channel-based <see cref="WriteAsync"/> hot-path + Bundle D
/// <see cref="ReadPendingAsync"/> / <see cref="MarkForwardedAsync"/> support
/// surface are wired in Bundle B-T2.
/// <b>C4 (Task 2.5) — two-table schema.</b> The site store is now two tables:
/// the append-only canonical <c>audit_event</c> (the 10 canonical
/// <see cref="AuditEvent"/> fields stored directly — NO 24-column decompose) and
/// the mutable operational <c>audit_forward_state</c> sidecar that carries the
/// forwarding lifecycle (<see cref="AuditForwardState"/>), a duplicated
/// <c>OccurredAtUtc</c> for the drain index range-scan, a precomputed
/// <c>IsCachedKind</c> flag that drives the cached/non-cached drain split without
/// re-parsing <c>DetailsJson</c> on the read hot-path, plus attempt bookkeeping.
/// </para>
/// <para>
/// <b>Ephemeral reset.</b> The site SQLite store is ephemeral (≈7-day retention,
/// recreated per deployment), so C4's schema change is an in-place RESET: the new
/// tables are created and the old single 24-column <c>AuditLog</c> table is
/// DROP-ped if present. No SQLite data migration is performed (and none is
/// needed) — any rows in a pre-C4 <c>AuditLog</c> table are within the retention
/// window and are discarded by the drop.
/// </para>
/// <para>
/// Site rows always carry <see cref="AuditForwardState.Pending"/> on first
/// insert; the central row-shape's <c>IngestedAtUtc</c> is a DetailsJson field
/// stamped by central on ingest, not a site column.
/// </para>
/// </remarks>
public class SqliteAuditWriter : IAuditWriter, ISiteAuditQueue, IAsyncDisposable, IDisposable
@@ -35,8 +49,10 @@ public class SqliteAuditWriter : IAuditWriter, ISiteAuditQueue, IAsyncDisposable
// on a PRIMARY KEY violation; the extended subcode 1555 (SQLITE_CONSTRAINT_PRIMARYKEY)
// is exposed via SqliteException.SqliteExtendedErrorCode but isn't reliably
// surfaced across all SQLite builds. We treat any constraint error on insert
// as a duplicate-eventid race and swallow it (first-write-wins) — the PRIMARY
// KEY on audit_event.EventId is the constraint that fires first, so this scope
// is precise (the sidecar insert for the same EventId is in the same
// transaction and never reached once audit_event's insert throws).
private const int SqliteErrorConstraint = 19;
private readonly SqliteConnection _connection;
@@ -97,6 +113,17 @@ public class SqliteAuditWriter : IAuditWriter, ISiteAuditQueue, IAsyncDisposable
_readConnection = new SqliteConnection(connectionString);
_readConnection.Open();
// PRAGMA foreign_keys is a per-connection setting. Set it on the read
// connection as well so that any future read-path change (e.g. a
// DELETE that may be added later) also benefits from FK enforcement.
// Pure SELECT queries are unaffected — this is defensive belt-and-
// suspenders for the read connection.
using (var pragmaCmd = _readConnection.CreateCommand())
{
pragmaCmd.CommandText = "PRAGMA foreign_keys = ON";
pragmaCmd.ExecuteNonQuery();
}
_writeQueue = Channel.CreateBounded<PendingAuditEvent>(
new BoundedChannelOptions(_options.ChannelCapacity)
{
@@ -140,110 +167,98 @@ public class SqliteAuditWriter : IAuditWriter, ISiteAuditQueue, IAsyncDisposable
pragmaCmd.ExecuteNonQuery();
}
// Enable FK enforcement on the WRITE connection. PRAGMA foreign_keys is
// a per-connection, per-session setting in SQLite — it is NOT persisted
// in the database file, so every new connection that may INSERT into
// audit_forward_state must set it for the FK
// audit_forward_state.EventId → audit_event.EventId
// to be a real runtime guard rather than decorative DDL. The write
// connection owns all INSERTs (and the MarkForwardedAsync /
// MarkReconciledAsync UPDATEs), so setting it here — after WAL is
// established, before the CREATE TABLEs — ensures the FK is live for
// every insert that follows. The existing insert order (audit_event
// first, then audit_forward_state, inside the same transaction) already
// satisfies the FK, so no pre-existing rows can violate the constraint.
using (var pragmaCmd = _connection.CreateCommand())
{
pragmaCmd.CommandText = "PRAGMA foreign_keys = ON";
pragmaCmd.ExecuteNonQuery();
}
// C4 (Task 2.5) — in-place reset. The site store is EPHEMERAL (≈7-day
// retention, recreated per deployment), so we do NOT migrate the old
// single 24-column AuditLog table to the new two-table shape: any rows
// it holds are within the retention window and discarded. DROP it if a
// pre-C4 deployment left it behind, then CREATE the two new tables. This
// is safe precisely BECAUSE the site store is ephemeral — never do this
// on a durable store (the central SQL Server side keeps its shim until
// C5 and is migrated, not reset).
using (var dropCmd = _connection.CreateCommand())
{
dropCmd.CommandText = "DROP TABLE IF EXISTS AuditLog;";
dropCmd.ExecuteNonQuery();
}
using var cmd = _connection.CreateCommand();
cmd.CommandText = """
CREATE TABLE IF NOT EXISTS AuditLog (
EventId TEXT NOT NULL,
OccurredAtUtc TEXT NOT NULL,
Channel TEXT NOT NULL,
Kind TEXT NOT NULL,
CorrelationId TEXT NULL,
SourceSiteId TEXT NULL,
SourceNode TEXT NULL,
SourceInstanceId TEXT NULL,
SourceScript TEXT NULL,
Actor TEXT NULL,
Target TEXT NULL,
Status TEXT NOT NULL,
HttpStatus INTEGER NULL,
DurationMs INTEGER NULL,
ErrorMessage TEXT NULL,
ErrorDetail TEXT NULL,
RequestSummary TEXT NULL,
ResponseSummary TEXT NULL,
PayloadTruncated INTEGER NOT NULL,
Extra TEXT NULL,
ForwardState TEXT NOT NULL,
ExecutionId TEXT NULL,
ParentExecutionId TEXT NULL,
-- Canonical, append-only / write-once: the 10 fields of the canonical
-- ZB.MOM.WW.Audit.AuditEvent stored directly (DetailsJson carries the
-- ScadaBridge domain fields). No forwarding state lives here that is
-- the audit_forward_state sidecar's concern.
CREATE TABLE IF NOT EXISTS audit_event (
EventId TEXT NOT NULL,
OccurredAtUtc TEXT NOT NULL,
Actor TEXT NOT NULL,
Action TEXT NOT NULL,
Outcome TEXT NOT NULL,
Category TEXT NULL,
Target TEXT NULL,
SourceNode TEXT NULL,
CorrelationId TEXT NULL,
DetailsJson TEXT NULL,
PRIMARY KEY (EventId)
);
CREATE INDEX IF NOT EXISTS IX_SiteAuditLog_ForwardState_Occurred
ON AuditLog (ForwardState, OccurredAtUtc);
-- Operational, mutable: the forwarding lifecycle for each canonical
-- row. OccurredAtUtc is duplicated here so the drain range-scan stays
-- on this one table's index; IsCachedKind is precomputed at insert so
-- the cached/non-cached drain split never re-parses DetailsJson on the
-- read hot-path.
CREATE TABLE IF NOT EXISTS audit_forward_state (
EventId TEXT NOT NULL,
ForwardState TEXT NOT NULL,
OccurredAtUtc TEXT NOT NULL,
IsCachedKind INTEGER NOT NULL,
AttemptCount INTEGER NOT NULL DEFAULT 0,
LastAttemptUtc TEXT NULL,
PRIMARY KEY (EventId),
FOREIGN KEY (EventId) REFERENCES audit_event(EventId)
);
-- Drain index: every read filters on (ForwardState, IsCachedKind) and
-- range-scans/orders by OccurredAtUtc, so this composite covers the
-- four reads + the backlog COUNT/MIN.
CREATE INDEX IF NOT EXISTS IX_fwd
ON audit_forward_state (ForwardState, IsCachedKind, OccurredAtUtc);
""";
cmd.ExecuteNonQuery();
// Audit Log #23 (ExecutionId): additively add the ExecutionId column.
// CREATE TABLE IF NOT EXISTS above does NOT add columns to an AuditLog
// table that already exists from a pre-ExecutionId build, so an
// auditlog.db created by an older build needs the column ALTER-ed in.
// The file is durable across restart/failover by design (7-day
// retention), so without this step every WriteAsync on an upgraded
// deployment would bind $ExecutionId against a missing column and the
// best-effort write path would silently drop every site audit row.
// SQLite has no "ADD COLUMN IF NOT EXISTS"; the column presence is
// probed first and the ALTER skipped when already there. The column is
// nullable with no default, so any row written before this migration
// reads back ExecutionId = null (back-compat).
AddColumnIfMissing("ExecutionId", "TEXT NULL");
// Audit Log #23 (ParentExecutionId): same idempotent upgrade path as
// ExecutionId above. A deployment that already ran the ExecutionId
// branch has an auditlog.db with the 21-column schema and no
// ParentExecutionId column; CREATE TABLE IF NOT EXISTS cannot add it,
// so it is ALTER-ed in here. Nullable with no default — rows written
// before this migration read back ParentExecutionId = null.
AddColumnIfMissing("ParentExecutionId", "TEXT NULL");
// SourceNode stamping: same idempotent upgrade path as ExecutionId /
// ParentExecutionId above. A deployment that already ran the
// ParentExecutionId branch has an auditlog.db with the 22-column
// schema and no SourceNode column; CREATE TABLE IF NOT EXISTS cannot
// add it, so it is ALTER-ed in here. Nullable with no default — rows
// written before this migration read back SourceNode = null.
AddColumnIfMissing("SourceNode", "TEXT NULL");
}
/// <summary>
/// Audit Log #23: additively adds a column to <c>AuditLog</c> only when
/// it is not already present (used for <c>ExecutionId</c> and
/// <c>ParentExecutionId</c>). SQLite lacks <c>ADD COLUMN IF NOT EXISTS</c>,
/// so the schema is probed via <c>PRAGMA table_info</c> first. Idempotent —
/// safe to run on every <see cref="InitializeSchema"/>. Mirrors
/// <c>StoreAndForwardStorage.AddColumnIfMissingAsync</c>; kept synchronous
/// here to match the rest of this writer's bootstrap DDL.
/// Enqueues an audit event for asynchronous batched persistence to SQLite.
/// Back-pressure is applied when the write channel is full.
/// </summary>
private void AddColumnIfMissing(string columnName, string columnDefinition)
{
    // Step 1: ask SQLite's table-info pragma how many AuditLog columns carry this name.
    // The name is bound as a parameter; only the ALTER below needs interpolation.
    using var columnLookup = _connection.CreateCommand();
    columnLookup.CommandText = "SELECT COUNT(*) FROM pragma_table_info('AuditLog') WHERE name = $name";
    columnLookup.Parameters.AddWithValue("$name", columnName);
    var matchingColumns = Convert.ToInt32(columnLookup.ExecuteScalar());

    // Step 2: only issue the ALTER when the probe found no such column, which keeps the
    // call idempotent across repeated schema initialisations.
    if (matchingColumns <= 0)
    {
        using var addColumn = _connection.CreateCommand();
        // DDL cannot take bound parameters. Both pieces are constants supplied by this
        // class's own callers (never user input), so interpolating them is safe.
        addColumn.CommandText = $"ALTER TABLE AuditLog ADD COLUMN {columnName} {columnDefinition}";
        addColumn.ExecuteNonQuery();
    }
}
/// <inheritdoc />
/// <param name="evt">The audit event to persist.</param>
/// <param name="ct">Cancellation token.</param>
/// <returns>A task that completes when the event has been persisted.</returns>
public Task WriteAsync(AuditEvent evt, CancellationToken ct = default)
{
ArgumentNullException.ThrowIfNull(evt);
// Site rows always carry a non-null ForwardState; central rows leave it
// null. Force Pending on enqueue so callers can pass a bare AuditEvent
// without thinking about site-vs-central provenance.
var siteEvt = evt.ForwardState is null
? evt with { ForwardState = AuditForwardState.Pending }
: evt;
var pending = new PendingAuditEvent(siteEvt);
// The canonical record carries no ForwardState (a site-storage-only
// concern). Site rows always start Pending; the sidecar row is written
// alongside the canonical row in the same transaction.
var pending = new PendingAuditEvent(evt, AuditForwardState.Pending);
// CreateBounded(FullMode=Wait) means WriteAsync will await room rather
// than throw when full — exactly the hot-path back-pressure semantics
@@ -316,96 +331,99 @@ public class SqliteAuditWriter : IAuditWriter, ISiteAuditQueue, IAsyncDisposable
using var transaction = _connection.BeginTransaction();
try
{
using var cmd = _connection.CreateCommand();
cmd.Transaction = transaction;
cmd.CommandText = """
INSERT INTO AuditLog (
EventId, OccurredAtUtc, Channel, Kind, CorrelationId,
SourceSiteId, SourceNode, SourceInstanceId, SourceScript, Actor, Target,
Status, HttpStatus, DurationMs, ErrorMessage, ErrorDetail,
RequestSummary, ResponseSummary, PayloadTruncated, Extra, ForwardState,
ExecutionId, ParentExecutionId
// INSERT 1: the canonical row, stored DIRECTLY (the 10 canonical
// fields straight off the AuditEvent — no Decompose; audit_event
// holds canonical shape, not the legacy 24-column shape).
using var eventCmd = _connection.CreateCommand();
eventCmd.Transaction = transaction;
eventCmd.CommandText = """
INSERT INTO audit_event (
EventId, OccurredAtUtc, Actor, Action, Outcome,
Category, Target, SourceNode, CorrelationId, DetailsJson
) VALUES (
$EventId, $OccurredAtUtc, $Channel, $Kind, $CorrelationId,
$SourceSiteId, $SourceNode, $SourceInstanceId, $SourceScript, $Actor, $Target,
$Status, $HttpStatus, $DurationMs, $ErrorMessage, $ErrorDetail,
$RequestSummary, $ResponseSummary, $PayloadTruncated, $Extra, $ForwardState,
$ExecutionId, $ParentExecutionId
$EventId, $OccurredAtUtc, $Actor, $Action, $Outcome,
$Category, $Target, $SourceNode, $CorrelationId, $DetailsJson
);
""";
var eEventId = eventCmd.Parameters.Add("$EventId", SqliteType.Text);
var eOccurredAt = eventCmd.Parameters.Add("$OccurredAtUtc", SqliteType.Text);
var eActor = eventCmd.Parameters.Add("$Actor", SqliteType.Text);
var eAction = eventCmd.Parameters.Add("$Action", SqliteType.Text);
var eOutcome = eventCmd.Parameters.Add("$Outcome", SqliteType.Text);
var eCategory = eventCmd.Parameters.Add("$Category", SqliteType.Text);
var eTarget = eventCmd.Parameters.Add("$Target", SqliteType.Text);
var eSourceNode = eventCmd.Parameters.Add("$SourceNode", SqliteType.Text);
var eCorrelationId = eventCmd.Parameters.Add("$CorrelationId", SqliteType.Text);
var eDetailsJson = eventCmd.Parameters.Add("$DetailsJson", SqliteType.Text);
var pEventId = cmd.Parameters.Add("$EventId", SqliteType.Text);
var pOccurredAt = cmd.Parameters.Add("$OccurredAtUtc", SqliteType.Text);
var pChannel = cmd.Parameters.Add("$Channel", SqliteType.Text);
var pKind = cmd.Parameters.Add("$Kind", SqliteType.Text);
var pCorrelationId = cmd.Parameters.Add("$CorrelationId", SqliteType.Text);
var pSourceSiteId = cmd.Parameters.Add("$SourceSiteId", SqliteType.Text);
var pSourceNode = cmd.Parameters.Add("$SourceNode", SqliteType.Text);
var pSourceInstanceId = cmd.Parameters.Add("$SourceInstanceId", SqliteType.Text);
var pSourceScript = cmd.Parameters.Add("$SourceScript", SqliteType.Text);
var pActor = cmd.Parameters.Add("$Actor", SqliteType.Text);
var pTarget = cmd.Parameters.Add("$Target", SqliteType.Text);
var pStatus = cmd.Parameters.Add("$Status", SqliteType.Text);
var pHttpStatus = cmd.Parameters.Add("$HttpStatus", SqliteType.Integer);
var pDurationMs = cmd.Parameters.Add("$DurationMs", SqliteType.Integer);
var pErrorMessage = cmd.Parameters.Add("$ErrorMessage", SqliteType.Text);
var pErrorDetail = cmd.Parameters.Add("$ErrorDetail", SqliteType.Text);
var pRequestSummary = cmd.Parameters.Add("$RequestSummary", SqliteType.Text);
var pResponseSummary = cmd.Parameters.Add("$ResponseSummary", SqliteType.Text);
var pPayloadTruncated = cmd.Parameters.Add("$PayloadTruncated", SqliteType.Integer);
var pExtra = cmd.Parameters.Add("$Extra", SqliteType.Text);
var pForwardState = cmd.Parameters.Add("$ForwardState", SqliteType.Text);
var pExecutionId = cmd.Parameters.Add("$ExecutionId", SqliteType.Text);
var pParentExecutionId = cmd.Parameters.Add("$ParentExecutionId", SqliteType.Text);
// INSERT 2: the operational sidecar row. ForwardState=Pending,
// OccurredAtUtc duplicated for the drain index, IsCachedKind
// precomputed (so the read split never parses DetailsJson),
// AttemptCount=0, LastAttemptUtc=NULL.
using var fwdCmd = _connection.CreateCommand();
fwdCmd.Transaction = transaction;
fwdCmd.CommandText = """
INSERT INTO audit_forward_state (
EventId, ForwardState, OccurredAtUtc, IsCachedKind, AttemptCount, LastAttemptUtc
) VALUES (
$EventId, $ForwardState, $OccurredAtUtc, $IsCachedKind, 0, NULL
);
""";
var fEventId = fwdCmd.Parameters.Add("$EventId", SqliteType.Text);
var fForwardState = fwdCmd.Parameters.Add("$ForwardState", SqliteType.Text);
var fOccurredAt = fwdCmd.Parameters.Add("$OccurredAtUtc", SqliteType.Text);
var fIsCachedKind = fwdCmd.Parameters.Add("$IsCachedKind", SqliteType.Integer);
foreach (var pending in batch)
{
var e = pending.Event;
pEventId.Value = e.EventId.ToString();
pOccurredAt.Value = e.OccurredAtUtc.ToString("o");
pChannel.Value = e.Channel.ToString();
pKind.Value = e.Kind.ToString();
pCorrelationId.Value = (object?)e.CorrelationId?.ToString() ?? DBNull.Value;
pSourceSiteId.Value = (object?)e.SourceSiteId ?? DBNull.Value;
var evt = pending.Event;
// Canonical OccurredAtUtc is UTC by construction; store the
// round-trip "o" form so string comparison stays monotonic
// (the drain range-scan and ORDER BY rely on it).
var occurredText = evt.OccurredAtUtc.UtcDateTime.ToString(
"o", System.Globalization.CultureInfo.InvariantCulture);
eEventId.Value = evt.EventId.ToString();
eOccurredAt.Value = occurredText;
// Canonical Actor is a required non-null string.
eActor.Value = evt.Actor ?? string.Empty;
eAction.Value = evt.Action;
eOutcome.Value = evt.Outcome.ToString();
eCategory.Value = (object?)evt.Category ?? DBNull.Value;
eTarget.Value = (object?)evt.Target ?? DBNull.Value;
// SourceNode-stamping: caller-provided value wins (preserves
// rows reconciled in from other nodes via the same writer);
// otherwise stamp from the local INodeIdentityProvider. The
// event record itself is NOT mutated — stamping is at write
// time only. If the provider also returns null (unconfigured
// node), the row's SourceNode stays NULL — operators see
// "needs config" via the schema, not a magic fallback string.
var sourceNode = e.SourceNode ?? _nodeIdentity.NodeName;
pSourceNode.Value = (object?)sourceNode ?? DBNull.Value;
pSourceInstanceId.Value = (object?)e.SourceInstanceId ?? DBNull.Value;
pSourceScript.Value = (object?)e.SourceScript ?? DBNull.Value;
pActor.Value = (object?)e.Actor ?? DBNull.Value;
pTarget.Value = (object?)e.Target ?? DBNull.Value;
pStatus.Value = e.Status.ToString();
pHttpStatus.Value = (object?)e.HttpStatus ?? DBNull.Value;
pDurationMs.Value = (object?)e.DurationMs ?? DBNull.Value;
pErrorMessage.Value = (object?)e.ErrorMessage ?? DBNull.Value;
pErrorDetail.Value = (object?)e.ErrorDetail ?? DBNull.Value;
pRequestSummary.Value = (object?)e.RequestSummary ?? DBNull.Value;
pResponseSummary.Value = (object?)e.ResponseSummary ?? DBNull.Value;
pPayloadTruncated.Value = e.PayloadTruncated ? 1 : 0;
pExtra.Value = (object?)e.Extra ?? DBNull.Value;
pForwardState.Value = (e.ForwardState ?? AuditForwardState.Pending).ToString();
pExecutionId.Value = (object?)e.ExecutionId?.ToString() ?? DBNull.Value;
pParentExecutionId.Value = (object?)e.ParentExecutionId?.ToString() ?? DBNull.Value;
// node), the column stays NULL — operators see "needs config"
// via the schema, not a magic fallback string.
var sourceNode = evt.SourceNode ?? _nodeIdentity.NodeName;
eSourceNode.Value = (object?)sourceNode ?? DBNull.Value;
eCorrelationId.Value = (object?)evt.CorrelationId?.ToString() ?? DBNull.Value;
eDetailsJson.Value = (object?)evt.DetailsJson ?? DBNull.Value;
fEventId.Value = evt.EventId.ToString();
fForwardState.Value = pending.ForwardState.ToString();
fOccurredAt.Value = occurredText;
fIsCachedKind.Value = IsCachedKind(evt.DetailsJson) ? 1 : 0;
try
{
cmd.ExecuteNonQuery();
eventCmd.ExecuteNonQuery();
fwdCmd.ExecuteNonQuery();
pending.Completion.TrySetResult();
}
catch (SqliteException ex) when (ex.SqliteErrorCode == SqliteErrorConstraint)
{
// Duplicate EventId — first-write-wins (alog.md §11).
// Treat as success: the lifecycle event is durably
// recorded under the first writer's payload.
// Duplicate EventId — first-write-wins (alog.md §11). The
// audit_event PRIMARY KEY throws before the sidecar insert
// runs, so neither table gains a second row. Treat as
// success: the lifecycle event is durably recorded under
// the first writer's payload.
_logger.LogDebug(ex,
"Duplicate EventId {EventId} swallowed by SqliteAuditWriter",
e.EventId);
evt.EventId);
pending.Completion.TrySetResult();
}
}
@@ -427,18 +445,43 @@ public class SqliteAuditWriter : IAuditWriter, ISiteAuditQueue, IAsyncDisposable
// AuditLog-001: cached-lifecycle audit kinds that ride the combined-telemetry
// drain (joined with the operational tracking row + pushed via
// IngestCachedTelemetryAsync into the central dual-write transaction).
// ReadPendingAsync EXCLUDES these so the audit-only drain doesn't double-emit
// them; ReadPendingCachedTelemetryAsync below is the dedicated read surface
// the new SiteAuditTelemetryActor cached-drain uses.
private static readonly string[] CachedTelemetryKindNames =
// C4: this is the SAME set the pre-C4 ReadPendingCachedTelemetryAsync query
// filtered on (Kind IN (...)); it is now precomputed into the sidecar's
// IsCachedKind flag at INSERT (see IsCachedKind) so the read split is a cheap
// integer predicate, not a JSON parse. ReadPendingAsync drains everything
// with IsCachedKind=0; ReadPendingCachedTelemetryAsync drains IsCachedKind=1.
private static readonly HashSet<AuditKind> CachedTelemetryKinds = new()
{
nameof(AuditKind.CachedSubmit),
nameof(AuditKind.ApiCallCached),
nameof(AuditKind.DbWriteCached),
nameof(AuditKind.CachedResolve),
AuditKind.CachedSubmit,
AuditKind.ApiCallCached,
AuditKind.DbWriteCached,
AuditKind.CachedResolve,
};
/// <inheritdoc />
/// <summary>
/// C4: derives the sidecar's <c>IsCachedKind</c> flag for a canonical row from its
/// <c>DetailsJson</c>. The JSON is decoded with <see cref="AuditDetailsCodec"/>, the
/// <see cref="AuditDetails.Kind"/> discriminator is parsed into an <see cref="AuditKind"/>
/// (with <see cref="AuditKind.InboundRequest"/> passed as the default), and the result is
/// <c>true</c> exactly when that kind is a member of <c>CachedTelemetryKinds</c>
/// (<see cref="AuditKind.CachedSubmit"/>, <see cref="AuditKind.ApiCallCached"/>,
/// <see cref="AuditKind.DbWriteCached"/>, <see cref="AuditKind.CachedResolve"/>).
/// Evaluated at INSERT time so the cached/non-cached drain split on the read side is an
/// integer predicate rather than a JSON parse.
/// </summary>
/// <param name="detailsJson">The canonical row's serialized details; may be <c>null</c>.</param>
/// <returns><c>true</c> when the decoded kind is one of the cached-lifecycle kinds.</returns>
private static bool IsCachedKind(string? detailsJson)
{
    // NOTE(review): presumably ParseEnum returns the supplied default when Kind is
    // missing or unrecognised, which would classify such rows as non-cached — confirm.
    var parsedKind = AuditRowProjection.ParseEnum(
        AuditDetailsCodec.Deserialize(detailsJson).Kind,
        AuditKind.InboundRequest);

    return CachedTelemetryKinds.Contains(parsedKind);
}
/// <summary>
/// Returns up to <paramref name="limit"/> non-cached pending audit events, oldest first.
/// Cached-lifecycle kinds are excluded; use <see cref="ReadPendingCachedTelemetryAsync"/> for those.
/// </summary>
/// <param name="limit">Maximum number of rows to return.</param>
/// <param name="ct">Cancellation token.</param>
/// <returns>A task that resolves to a read-only list of pending audit events.</returns>
public Task<IReadOnlyList<AuditEvent>> ReadPendingAsync(int limit, CancellationToken ct = default)
{
if (limit <= 0)
@@ -449,51 +492,45 @@ public class SqliteAuditWriter : IAuditWriter, ISiteAuditQueue, IAsyncDisposable
// AuditLog-005: read via the dedicated _readConnection so this scan
// (which can be expensive when the backlog grows under a central
// outage) does not block the batched writer on _writeLock. WAL mode
// gives us a stable snapshot of the table while writes proceed on the
// gives us a stable snapshot of the tables while writes proceed on the
// writer connection. _readLock serialises this connection across
// multiple concurrent read callers since SqliteConnection itself is
// not thread-safe.
// AuditLog-001: NOT IN ($cached1,$cached2,$cached3,$cached4) excludes the
// cached-lifecycle kinds — they flow through ReadPendingCachedTelemetryAsync
// + the combined-telemetry drain. Kind is stored as the enum's name (see
// FlushBatch's pKind.Value), so a string-IN against the constant kind
// names matches the on-disk shape exactly.
// C4: JOIN the sidecar and filter on IsCachedKind=0 — the cached-
// lifecycle kinds (IsCachedKind=1) flow through
// ReadPendingCachedTelemetryAsync + the combined-telemetry drain. The
// split is a precomputed integer predicate on the indexed sidecar, not
// a DetailsJson parse. Ordering is by the sidecar's OccurredAtUtc with
// EventId as the deterministic tiebreaker.
lock (_readLock)
{
ObjectDisposedException.ThrowIf(_disposed, this);
using var cmd = _readConnection.CreateCommand();
cmd.CommandText = """
SELECT EventId, OccurredAtUtc, Channel, Kind, CorrelationId,
SourceSiteId, SourceNode, SourceInstanceId, SourceScript, Actor, Target,
Status, HttpStatus, DurationMs, ErrorMessage, ErrorDetail,
RequestSummary, ResponseSummary, PayloadTruncated, Extra, ForwardState,
ExecutionId, ParentExecutionId
FROM AuditLog
WHERE ForwardState = $pending
AND Kind NOT IN ($k0, $k1, $k2, $k3)
ORDER BY OccurredAtUtc ASC, EventId ASC
SELECT ae.EventId, ae.OccurredAtUtc, ae.Actor, ae.Action, ae.Outcome,
ae.Category, ae.Target, ae.SourceNode, ae.CorrelationId, ae.DetailsJson
FROM audit_event ae
JOIN audit_forward_state fs ON fs.EventId = ae.EventId
WHERE fs.ForwardState = $pending
AND fs.IsCachedKind = 0
ORDER BY fs.OccurredAtUtc ASC, ae.EventId ASC
LIMIT $limit;
""";
cmd.Parameters.AddWithValue("$pending", AuditForwardState.Pending.ToString());
cmd.Parameters.AddWithValue("$k0", CachedTelemetryKindNames[0]);
cmd.Parameters.AddWithValue("$k1", CachedTelemetryKindNames[1]);
cmd.Parameters.AddWithValue("$k2", CachedTelemetryKindNames[2]);
cmd.Parameters.AddWithValue("$k3", CachedTelemetryKindNames[3]);
cmd.Parameters.AddWithValue("$limit", limit);
var rows = new List<AuditEvent>(Math.Min(limit, 256));
using var reader = cmd.ExecuteReader();
while (reader.Read())
{
rows.Add(MapRow(reader));
}
return Task.FromResult<IReadOnlyList<AuditEvent>>(rows);
return Task.FromResult(ReadRows(cmd, limit));
}
}
/// <inheritdoc />
/// <summary>
/// Returns up to <paramref name="limit"/> pending cached-lifecycle audit events, oldest first.
/// Only rows with cached-call kinds (CachedSubmit, ApiCallCached, DbWriteCached, CachedResolve) are included.
/// </summary>
/// <param name="limit">Maximum number of rows to return.</param>
/// <param name="ct">Cancellation token.</param>
/// <returns>A task that resolves to a read-only list of pending cached-telemetry audit events.</returns>
public Task<IReadOnlyList<AuditEvent>> ReadPendingCachedTelemetryAsync(
int limit, CancellationToken ct = default)
{
@@ -502,42 +539,29 @@ public class SqliteAuditWriter : IAuditWriter, ISiteAuditQueue, IAsyncDisposable
throw new ArgumentOutOfRangeException(nameof(limit), "limit must be > 0.");
}
// AuditLog-001: dedicated read surface for the cached-call lifecycle
// drain — symmetric to ReadPendingAsync but filtered to the four
// cached AuditKinds. Same _readConnection + _readLock pattern so the
// hot-path writer is not contended.
// AuditLog-001 / C4: dedicated read surface for the cached-call lifecycle
// drain — symmetric to ReadPendingAsync but filtered to IsCachedKind=1.
// Same _readConnection + _readLock pattern so the hot-path writer is not
// contended.
lock (_readLock)
{
ObjectDisposedException.ThrowIf(_disposed, this);
using var cmd = _readConnection.CreateCommand();
cmd.CommandText = """
SELECT EventId, OccurredAtUtc, Channel, Kind, CorrelationId,
SourceSiteId, SourceNode, SourceInstanceId, SourceScript, Actor, Target,
Status, HttpStatus, DurationMs, ErrorMessage, ErrorDetail,
RequestSummary, ResponseSummary, PayloadTruncated, Extra, ForwardState,
ExecutionId, ParentExecutionId
FROM AuditLog
WHERE ForwardState = $pending
AND Kind IN ($k0, $k1, $k2, $k3)
ORDER BY OccurredAtUtc ASC, EventId ASC
SELECT ae.EventId, ae.OccurredAtUtc, ae.Actor, ae.Action, ae.Outcome,
ae.Category, ae.Target, ae.SourceNode, ae.CorrelationId, ae.DetailsJson
FROM audit_event ae
JOIN audit_forward_state fs ON fs.EventId = ae.EventId
WHERE fs.ForwardState = $pending
AND fs.IsCachedKind = 1
ORDER BY fs.OccurredAtUtc ASC, ae.EventId ASC
LIMIT $limit;
""";
cmd.Parameters.AddWithValue("$pending", AuditForwardState.Pending.ToString());
cmd.Parameters.AddWithValue("$k0", CachedTelemetryKindNames[0]);
cmd.Parameters.AddWithValue("$k1", CachedTelemetryKindNames[1]);
cmd.Parameters.AddWithValue("$k2", CachedTelemetryKindNames[2]);
cmd.Parameters.AddWithValue("$k3", CachedTelemetryKindNames[3]);
cmd.Parameters.AddWithValue("$limit", limit);
var rows = new List<AuditEvent>(Math.Min(limit, 256));
using var reader = cmd.ExecuteReader();
while (reader.Read())
{
rows.Add(MapRow(reader));
}
return Task.FromResult<IReadOnlyList<AuditEvent>>(rows);
return Task.FromResult(ReadRows(cmd, limit));
}
}
@@ -554,6 +578,7 @@ public class SqliteAuditWriter : IAuditWriter, ISiteAuditQueue, IAsyncDisposable
/// </summary>
/// <param name="limit">Maximum number of rows to return.</param>
/// <param name="ct">Cancellation token.</param>
/// <returns>A task that resolves to a read-only list of forwarded audit events.</returns>
public Task<IReadOnlyList<AuditEvent>> ReadForwardedAsync(int limit, CancellationToken ct = default)
{
if (limit <= 0)
@@ -563,34 +588,27 @@ public class SqliteAuditWriter : IAuditWriter, ISiteAuditQueue, IAsyncDisposable
// AuditLog-005: mirror ReadPendingAsync — read via _readConnection /
// _readLock so this query never contends with the batched writer on
// _writeLock.
// _writeLock. C4: JOIN the sidecar and filter on ForwardState='Forwarded'
// (no IsCachedKind split — both cached and non-cached Forwarded rows are
// returned, as before).
lock (_readLock)
{
ObjectDisposedException.ThrowIf(_disposed, this);
using var cmd = _readConnection.CreateCommand();
cmd.CommandText = """
SELECT EventId, OccurredAtUtc, Channel, Kind, CorrelationId,
SourceSiteId, SourceNode, SourceInstanceId, SourceScript, Actor, Target,
Status, HttpStatus, DurationMs, ErrorMessage, ErrorDetail,
RequestSummary, ResponseSummary, PayloadTruncated, Extra, ForwardState,
ExecutionId, ParentExecutionId
FROM AuditLog
WHERE ForwardState = $forwarded
ORDER BY OccurredAtUtc ASC, EventId ASC
SELECT ae.EventId, ae.OccurredAtUtc, ae.Actor, ae.Action, ae.Outcome,
ae.Category, ae.Target, ae.SourceNode, ae.CorrelationId, ae.DetailsJson
FROM audit_event ae
JOIN audit_forward_state fs ON fs.EventId = ae.EventId
WHERE fs.ForwardState = $forwarded
ORDER BY fs.OccurredAtUtc ASC, ae.EventId ASC
LIMIT $limit;
""";
cmd.Parameters.AddWithValue("$forwarded", AuditForwardState.Forwarded.ToString());
cmd.Parameters.AddWithValue("$limit", limit);
var rows = new List<AuditEvent>(Math.Min(limit, 256));
using var reader = cmd.ExecuteReader();
while (reader.Read())
{
rows.Add(MapRow(reader));
}
return Task.FromResult<IReadOnlyList<AuditEvent>>(rows);
return Task.FromResult(ReadRows(cmd, limit));
}
}
@@ -608,11 +626,25 @@ public class SqliteAuditWriter : IAuditWriter, ISiteAuditQueue, IAsyncDisposable
ObjectDisposedException.ThrowIf(_disposed, this);
using var cmd = _connection.CreateCommand();
// Build a single IN (...) parameter list so we issue one UPDATE per
// batch regardless of size. Each id is bound as its own parameter,
// so no string concatenation of user data ever enters the SQL.
// C4: flip the sidecar — UPDATE audit_forward_state, not the canonical
// audit_event (which is append-only / write-once). Bump AttemptCount +
// stamp LastAttemptUtc so operators can see how many drain passes a row
// took to forward. Build a single IN (...) parameter list so we issue
// one UPDATE per batch regardless of size. Each id is bound as its own
// parameter, so no string concatenation of user data ever enters the SQL.
//
// Defensive state guard: only transition rows that are still Pending or
// Forwarded (i.e. not yet Reconciled). Without this guard a mis-called
// batch that includes a Reconciled EventId would silently demote it back
// to Forwarded — a state regression that would cause duplicate central
// ingestion. Symmetric with MarkReconciledAsync's
// WHERE ForwardState IN ($pending, $forwarded)
// guard. Current callers only pass Pending IDs, so normal-path behaviour
// is unchanged; the guard is purely defensive.
var sb = new System.Text.StringBuilder();
sb.Append("UPDATE AuditLog SET ForwardState = $forwarded WHERE EventId IN (");
sb.Append("UPDATE audit_forward_state SET ForwardState = $forwarded, ")
.Append("AttemptCount = AttemptCount + 1, LastAttemptUtc = $now ")
.Append("WHERE ForwardState IN ($pending, $forwarded) AND EventId IN (");
for (int i = 0; i < eventIds.Count; i++)
{
if (i > 0) sb.Append(',');
@@ -623,13 +655,24 @@ public class SqliteAuditWriter : IAuditWriter, ISiteAuditQueue, IAsyncDisposable
sb.Append(");");
cmd.CommandText = sb.ToString();
cmd.Parameters.AddWithValue("$forwarded", AuditForwardState.Forwarded.ToString());
cmd.Parameters.AddWithValue("$pending", AuditForwardState.Pending.ToString());
cmd.Parameters.AddWithValue("$now", DateTime.UtcNow.ToString(
"o", System.Globalization.CultureInfo.InvariantCulture));
cmd.ExecuteNonQuery();
return Task.CompletedTask;
}
}
/// <inheritdoc />
/// <summary>
/// Returns up to <paramref name="batchSize"/> pending or forwarded audit events
/// with <see cref="AuditEvent.OccurredAtUtc"/> &gt;= <paramref name="sinceUtc"/>, oldest first.
/// Used by the M6 reconciliation-pull handler.
/// </summary>
/// <param name="sinceUtc">Lower bound timestamp (UTC) for event occurrence.</param>
/// <param name="batchSize">Maximum number of rows to return.</param>
/// <param name="ct">Cancellation token.</param>
/// <returns>A task that resolves to a read-only list of audit events since the given timestamp.</returns>
public Task<IReadOnlyList<AuditEvent>> ReadPendingSinceAsync(
DateTime sinceUtc, int batchSize, CancellationToken ct = default)
{
@@ -639,22 +682,24 @@ public class SqliteAuditWriter : IAuditWriter, ISiteAuditQueue, IAsyncDisposable
}
// AuditLog-005: read via _readConnection / _readLock — same lock-
// decoupling as ReadPendingAsync.
// decoupling as ReadPendingAsync. C4: JOIN the sidecar; the range scan
// is on the sidecar's duplicated OccurredAtUtc so it stays on IX_fwd.
// Both Pending and Forwarded rows are returned (the central reconciliation
// puller dedups on EventId; re-shipping a Forwarded-but-not-yet-ingested
// row is safe).
lock (_readLock)
{
ObjectDisposedException.ThrowIf(_disposed, this);
using var cmd = _readConnection.CreateCommand();
cmd.CommandText = """
SELECT EventId, OccurredAtUtc, Channel, Kind, CorrelationId,
SourceSiteId, SourceNode, SourceInstanceId, SourceScript, Actor, Target,
Status, HttpStatus, DurationMs, ErrorMessage, ErrorDetail,
RequestSummary, ResponseSummary, PayloadTruncated, Extra, ForwardState,
ExecutionId, ParentExecutionId
FROM AuditLog
WHERE ForwardState IN ($pending, $forwarded)
AND OccurredAtUtc >= $since
ORDER BY OccurredAtUtc ASC, EventId ASC
SELECT ae.EventId, ae.OccurredAtUtc, ae.Actor, ae.Action, ae.Outcome,
ae.Category, ae.Target, ae.SourceNode, ae.CorrelationId, ae.DetailsJson
FROM audit_event ae
JOIN audit_forward_state fs ON fs.EventId = ae.EventId
WHERE fs.ForwardState IN ($pending, $forwarded)
AND fs.OccurredAtUtc >= $since
ORDER BY fs.OccurredAtUtc ASC, ae.EventId ASC
LIMIT $limit;
""";
cmd.Parameters.AddWithValue("$pending", AuditForwardState.Pending.ToString());
@@ -666,14 +711,7 @@ public class SqliteAuditWriter : IAuditWriter, ISiteAuditQueue, IAsyncDisposable
"o", System.Globalization.CultureInfo.InvariantCulture));
cmd.Parameters.AddWithValue("$limit", batchSize);
var rows = new List<AuditEvent>(Math.Min(batchSize, 256));
using var reader = cmd.ExecuteReader();
while (reader.Read())
{
rows.Add(MapRow(reader));
}
return Task.FromResult<IReadOnlyList<AuditEvent>>(rows);
return Task.FromResult(ReadRows(cmd, batchSize));
}
}
@@ -691,8 +729,11 @@ public class SqliteAuditWriter : IAuditWriter, ISiteAuditQueue, IAsyncDisposable
ObjectDisposedException.ThrowIf(_disposed, this);
using var cmd = _connection.CreateCommand();
// C4: flip the sidecar from Pending/Forwarded → Reconciled. Rows
// already Reconciled are left untouched (idempotent re-call), and the
// canonical audit_event row is never modified.
var sb = new System.Text.StringBuilder();
sb.Append("UPDATE AuditLog SET ForwardState = $reconciled ")
sb.Append("UPDATE audit_forward_state SET ForwardState = $reconciled ")
.Append("WHERE ForwardState IN ($pending, $forwarded) AND EventId IN (");
for (int i = 0; i < eventIds.Count; i++)
{
@@ -724,18 +765,17 @@ public class SqliteAuditWriter : IAuditWriter, ISiteAuditQueue, IAsyncDisposable
// central outage the Pending backlog can grow to hundreds of thousands
// of rows and the COUNT(*) scan correspondingly stretches; that no
// longer adds tail latency to user-facing audit writes.
// C4: count over the sidecar (audit_forward_state) — the canonical
// audit_event table carries no ForwardState. The IX_fwd index makes both
// aggregates cheap (count is a covering scan, min is the first key).
lock (_readLock)
{
ObjectDisposedException.ThrowIf(_disposed, this);
// Single round-trip — COUNT(*) + MIN(OccurredAtUtc) over the same
// index range avoids a second scan. The IX_SiteAuditLog_ForwardState_Occurred
// index makes both aggregates cheap (count is a covering scan, min
// is the first key).
using var cmd = _readConnection.CreateCommand();
cmd.CommandText = """
SELECT COUNT(*), MIN(OccurredAtUtc)
FROM AuditLog
FROM audit_forward_state
WHERE ForwardState = $pending;
""";
cmd.Parameters.AddWithValue("$pending", AuditForwardState.Pending.ToString());
@@ -786,35 +826,48 @@ public class SqliteAuditWriter : IAuditWriter, ISiteAuditQueue, IAsyncDisposable
? value
: DateTime.SpecifyKind(value.ToUniversalTime(), DateTimeKind.Utc);
/// <summary>
/// Runs <paramref name="cmd"/> and converts every row it returns into an
/// <see cref="AuditEvent"/> by handing the open reader to <see cref="MapRow"/>.
/// </summary>
/// <param name="cmd">The prepared command to execute; its result columns must be in the order <see cref="MapRow"/> reads them.</param>
/// <param name="capacityHint">Expected maximum row count. Used only to pre-size the result list, capped at 256.</param>
/// <returns>The mapped events, in the order the reader produced them.</returns>
private static IReadOnlyList<AuditEvent> ReadRows(SqliteCommand cmd, int capacityHint)
{
    // Pre-size for the expected batch, but never reserve more than 256 slots up front.
    var initialCapacity = capacityHint < 256 ? capacityHint : 256;
    var events = new List<AuditEvent>(initialCapacity);

    using (var cursor = cmd.ExecuteReader())
    {
        while (cursor.Read())
        {
            var mapped = MapRow(cursor);
            events.Add(mapped);
        }
    }

    return events;
}
/// <summary>
/// C4: builds the canonical <see cref="AuditEvent"/> DIRECTLY from the 10
/// stored <c>audit_event</c> columns — no 24-column <c>Recompose</c>, because
/// <c>audit_event</c> already holds the canonical fields + <c>DetailsJson</c>.
/// <c>Outcome</c> is stored as the enum's name; the safe
/// <see cref="AuditRowProjection.ParseEnum{TEnum}"/> degrades an unknown/renamed
/// value gracefully rather than throwing.
/// </summary>
/// <param name="reader">
/// Reader positioned on the row to map. Columns are read by ordinal, so the
/// query must project them in this order: EventId, OccurredAtUtc, Actor,
/// Action, Outcome, Category, Target, SourceNode, CorrelationId, DetailsJson.
/// </param>
/// <returns>A new <see cref="AuditEvent"/> populated from the current row.</returns>
private static AuditEvent MapRow(SqliteDataReader reader)
{
return new AuditEvent
{
EventId = Guid.Parse(reader.GetString(0)),
OccurredAtUtc = DateTime.Parse(reader.GetString(1),
System.Globalization.CultureInfo.InvariantCulture,
System.Globalization.DateTimeStyles.RoundtripKind),
Channel = Enum.Parse<AuditChannel>(reader.GetString(2)),
Kind = Enum.Parse<AuditKind>(reader.GetString(3)),
CorrelationId = reader.IsDBNull(4) ? null : Guid.Parse(reader.GetString(4)),
SourceSiteId = reader.IsDBNull(5) ? null : reader.GetString(5),
SourceNode = reader.IsDBNull(6) ? null : reader.GetString(6),
SourceInstanceId = reader.IsDBNull(7) ? null : reader.GetString(7),
SourceScript = reader.IsDBNull(8) ? null : reader.GetString(8),
Actor = reader.IsDBNull(9) ? null : reader.GetString(9),
Target = reader.IsDBNull(10) ? null : reader.GetString(10),
Status = Enum.Parse<AuditStatus>(reader.GetString(11)),
HttpStatus = reader.IsDBNull(12) ? null : reader.GetInt32(12),
DurationMs = reader.IsDBNull(13) ? null : reader.GetInt32(13),
ErrorMessage = reader.IsDBNull(14) ? null : reader.GetString(14),
ErrorDetail = reader.IsDBNull(15) ? null : reader.GetString(15),
RequestSummary = reader.IsDBNull(16) ? null : reader.GetString(16),
ResponseSummary = reader.IsDBNull(17) ? null : reader.GetString(17),
PayloadTruncated = reader.GetInt32(18) != 0,
Extra = reader.IsDBNull(19) ? null : reader.GetString(19),
ForwardState = Enum.Parse<AuditForwardState>(reader.GetString(20)),
ExecutionId = reader.IsDBNull(21) ? null : Guid.Parse(reader.GetString(21)),
ParentExecutionId = reader.IsDBNull(22) ? null : Guid.Parse(reader.GetString(22)),
// The timestamp text is parsed with RoundtripKind, the result is then stamped
// as UTC, and only afterwards wrapped in a DateTimeOffset.
OccurredAtUtc = new DateTimeOffset(DateTime.SpecifyKind(
DateTime.Parse(reader.GetString(1),
System.Globalization.CultureInfo.InvariantCulture,
System.Globalization.DateTimeStyles.RoundtripKind),
DateTimeKind.Utc)),
// Actor, Action and Outcome are read without an IsDBNull check, unlike the
// columns that follow them.
Actor = reader.GetString(2),
Action = reader.GetString(3),
// NOTE(review): AuditOutcome.Success is ParseEnum's second argument — presumably
// the fallback for unrecognised text; confirm against AuditRowProjection.
Outcome = AuditRowProjection.ParseEnum(reader.GetString(4), AuditOutcome.Success),
Category = reader.IsDBNull(5) ? null : reader.GetString(5),
Target = reader.IsDBNull(6) ? null : reader.GetString(6),
SourceNode = reader.IsDBNull(7) ? null : reader.GetString(7),
CorrelationId = reader.IsDBNull(8) ? null : Guid.Parse(reader.GetString(8)),
DetailsJson = reader.IsDBNull(9) ? null : reader.GetString(9),
};
}
@@ -841,6 +894,7 @@ public class SqliteAuditWriter : IAuditWriter, ISiteAuditQueue, IAsyncDisposable
}
/// <summary>Asynchronously disposes the audit writer and releases resources.</summary>
/// <returns>A <see cref="ValueTask"/> that completes when all resources have been released.</returns>
public async ValueTask DisposeAsync()
{
Task? writerLoop;
@@ -898,15 +952,19 @@ public class SqliteAuditWriter : IAuditWriter, ISiteAuditQueue, IAsyncDisposable
private sealed class PendingAuditEvent
{
/// <summary>Initializes a new instance of the PendingAuditEvent class.</summary>
/// <param name="evt">The audit event to persist.</param>
public PendingAuditEvent(AuditEvent evt)
/// <param name="evt">The canonical audit event to persist.</param>
/// <param name="forwardState">Initial site-local forwarding state written to the sidecar row (always Pending for fresh events).</param>
public PendingAuditEvent(AuditEvent evt, AuditForwardState forwardState)
{
Event = evt;
ForwardState = forwardState;
Completion = new TaskCompletionSource(TaskCreationOptions.RunContinuationsAsynchronously);
}
/// <summary>The audit event to persist.</summary>
/// <summary>The canonical audit event to persist.</summary>
public AuditEvent Event { get; }
/// <summary>Initial forwarding state for this row's sidecar (bound to audit_forward_state.ForwardState).</summary>
public AuditForwardState ForwardState { get; }
/// <summary>Task completion source for write completion signaling.</summary>
public TaskCompletionSource Completion { get; }
}
@@ -1,8 +1,8 @@
using Microsoft.Extensions.Logging;
using ZB.MOM.WW.ScadaBridge.Commons.Entities.Audit;
using ZB.MOM.WW.ScadaBridge.Commons.Interfaces.Services;
using ZB.MOM.WW.ScadaBridge.Commons.Messages.Integration;
using ZB.MOM.WW.ScadaBridge.Commons.Types;
using ZB.MOM.WW.ScadaBridge.Commons.Types.Audit;
using ZB.MOM.WW.ScadaBridge.Commons.Types.Enums;
namespace ZB.MOM.WW.ScadaBridge.AuditLog.Site.Telemetry;
@@ -141,37 +141,33 @@ public sealed class CachedCallLifecycleBridge : ICachedCallLifecycleObserver
var channel = ChannelStringToEnum(context.Channel);
return new CachedCallTelemetry(
Audit: new AuditEvent
{
EventId = Guid.NewGuid(),
OccurredAtUtc = DateTime.SpecifyKind(context.OccurredAtUtc, DateTimeKind.Utc),
Channel = channel,
Kind = kind,
CorrelationId = context.TrackedOperationId.Value,
Audit: ScadaBridgeAuditEventFactory.Create(
channel: channel,
kind: kind,
status: status,
occurredAtUtc: DateTime.SpecifyKind(context.OccurredAtUtc, DateTimeKind.Utc),
target: context.Target,
correlationId: context.TrackedOperationId.Value,
// Audit Log #23 (ExecutionId Task 4): the originating script
// execution's per-run correlation id, threaded through the S&F
// buffer; null on rows buffered before Task 4 (back-compat).
ExecutionId = context.ExecutionId,
executionId: context.ExecutionId,
// Audit Log #23 (ParentExecutionId Task 6): the spawning
// inbound-API request's ExecutionId, threaded through the S&F
// buffer alongside ExecutionId so the retry-loop cached rows
// correlate back to the cross-execution chain. Null for a
// non-routed run and on rows buffered before Task 6.
ParentExecutionId = context.ParentExecutionId,
SourceSiteId = string.IsNullOrEmpty(context.SourceSite) ? null : context.SourceSite,
SourceInstanceId = context.SourceInstanceId,
parentExecutionId: context.ParentExecutionId,
sourceSiteId: string.IsNullOrEmpty(context.SourceSite) ? null : context.SourceSite,
sourceInstanceId: context.SourceInstanceId,
// Audit Log #23 (ExecutionId Task 4): SourceScript is now
// threaded through the S&F buffer alongside ExecutionId — the
// retry-loop cached rows carry the same provenance the
// script-side cached rows do. Null on pre-Task-4 buffered rows.
SourceScript = context.SourceScript,
Target = context.Target,
Status = status,
HttpStatus = httpStatus,
DurationMs = context.DurationMs,
ErrorMessage = lastError,
ForwardState = AuditForwardState.Pending,
},
sourceScript: context.SourceScript,
httpStatus: httpStatus,
durationMs: context.DurationMs,
errorMessage: lastError),
Operational: new SiteCallOperational(
TrackedOperationId: context.TrackedOperationId,
Channel: context.Channel,
@@ -1,9 +1,9 @@
using Microsoft.Extensions.Logging;
using ZB.MOM.WW.ScadaBridge.Commons.Entities.Audit;
using ZB.MOM.WW.ScadaBridge.Commons.Interfaces;
using ZB.MOM.WW.ScadaBridge.Commons.Interfaces.Services;
using ZB.MOM.WW.ScadaBridge.Commons.Messages.Integration;
using ZB.MOM.WW.ScadaBridge.Commons.Types;
using ZB.MOM.WW.ScadaBridge.Commons.Types.Audit;
using ZB.MOM.WW.ScadaBridge.Commons.Types.Enums;
namespace ZB.MOM.WW.ScadaBridge.AuditLog.Site.Telemetry;
@@ -111,9 +111,11 @@ public sealed class CachedCallTelemetryForwarder : ICachedCallTelemetryForwarder
// FallbackAuditWriter) handles transient writer failures upstream;
// a throw bubbling up here means the writer's own swallow contract
// failed, which is itself best-effort-handled.
// C3: Kind/Status are domain fields carried in DetailsJson — decompose to log them.
var d = AuditRowProjection.Decompose(telemetry.Audit);
_logger.LogWarning(ex,
"CachedCallTelemetryForwarder: audit emission threw for EventId {EventId} (Kind {Kind}, Status {Status})",
telemetry.Audit.EventId, telemetry.Audit.Kind, telemetry.Audit.Status);
d.EventId, d.Kind, d.Status);
}
}
@@ -128,9 +130,12 @@ public sealed class CachedCallTelemetryForwarder : ICachedCallTelemetryForwarder
return;
}
// C3: the audit half's domain fields (Kind/SourceInstanceId/SourceScript)
// ride inside DetailsJson — decompose once for this packet.
var audit = AuditRowProjection.Decompose(telemetry.Audit);
try
{
switch (telemetry.Audit.Kind)
switch (audit.Kind)
{
case AuditKind.CachedSubmit:
// Enqueue — insert-if-not-exists with the operational
@@ -144,8 +149,8 @@ public sealed class CachedCallTelemetryForwarder : ICachedCallTelemetryForwarder
telemetry.Operational.TrackedOperationId,
telemetry.Operational.Channel,
telemetry.Operational.Target,
telemetry.Audit.SourceInstanceId,
telemetry.Audit.SourceScript,
audit.SourceInstanceId,
audit.SourceScript,
sourceNode: _nodeIdentity?.NodeName,
ct).ConfigureAwait(false);
break;
@@ -180,7 +185,7 @@ public sealed class CachedCallTelemetryForwarder : ICachedCallTelemetryForwarder
// forwarder.
_logger.LogWarning(
"CachedCallTelemetryForwarder: unexpected audit kind {Kind} on tracking emission for EventId {EventId}",
telemetry.Audit.Kind, telemetry.Audit.EventId);
audit.Kind, audit.EventId);
break;
}
}
@@ -1,5 +1,5 @@
using Akka.Actor;
using ZB.MOM.WW.ScadaBridge.Commons.Entities.Audit;
using ZB.MOM.WW.Audit;
using ZB.MOM.WW.ScadaBridge.Commons.Messages.Audit;
using ZB.MOM.WW.ScadaBridge.Communication.Grpc;
@@ -44,6 +44,9 @@ public sealed class ClusterClientSiteAuditClient : ISiteStreamAuditClient
private readonly IActorRef _siteCommunicationActor;
private readonly TimeSpan _askTimeout;
/// <summary>
/// Initializes a new instance that forwards audit telemetry to central via the site's <c>SiteCommunicationActor</c>.
/// </summary>
/// <param name="siteCommunicationActor">
/// The site's <c>SiteCommunicationActor</c> — it forwards the ingest command
/// over the registered central ClusterClient and routes the reply back to
@@ -22,6 +22,7 @@ public interface ISiteStreamAuditClient
/// </summary>
/// <param name="batch">The batch of audit events to forward.</param>
/// <param name="ct">Cancellation token for the operation.</param>
/// <returns>A task that resolves to the ingest acknowledgement containing accepted event IDs.</returns>
Task<IngestAck> IngestAuditEventsAsync(AuditEventBatch batch, CancellationToken ct);
/// <summary>
@@ -42,5 +43,6 @@ public interface ISiteStreamAuditClient
/// </remarks>
/// <param name="batch">The batch of cached-call telemetry packets to forward.</param>
/// <param name="ct">Cancellation token for the operation.</param>
/// <returns>A task that resolves to the ingest acknowledgement containing accepted event IDs.</returns>
Task<IngestAck> IngestCachedTelemetryAsync(CachedTelemetryBatch batch, CancellationToken ct);
}
@@ -2,10 +2,11 @@ using Akka.Actor;
using Google.Protobuf.WellKnownTypes;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
using ZB.MOM.WW.ScadaBridge.Commons.Entities.Audit;
using ZB.MOM.WW.Audit;
using ZB.MOM.WW.ScadaBridge.Commons.Interfaces;
using ZB.MOM.WW.ScadaBridge.Commons.Interfaces.Services;
using ZB.MOM.WW.ScadaBridge.Commons.Types;
using ZB.MOM.WW.ScadaBridge.Commons.Types.Audit;
using ZB.MOM.WW.ScadaBridge.Communication.Grpc;
namespace ZB.MOM.WW.ScadaBridge.AuditLog.Site.Telemetry;
@@ -259,8 +260,8 @@ public class SiteAuditTelemetryActor : ReceiveActor
// row stays Pending (still not in emittedEventIds) and
// central reconciliation will pick it up.
_logger.LogWarning(
"Cached-telemetry drain: audit row {EventId} ({Kind}) has no CorrelationId; skipping.",
auditRow.EventId, auditRow.Kind);
"Cached-telemetry drain: audit row {EventId} ({Action}) has no CorrelationId; skipping.",
auditRow.EventId, auditRow.Action);
continue;
}
@@ -363,10 +364,13 @@ public class SiteAuditTelemetryActor : ReceiveActor
private static CachedTelemetryPacket BuildCachedPacket(
AuditEvent auditRow, TrackingStatusSnapshot snapshot)
{
var sourceSite = auditRow.SourceSiteId ?? string.Empty;
// C3: SourceSiteId + Channel ride inside the canonical record's
// DetailsJson — decompose to read them.
var audit = AuditRowProjection.Decompose(auditRow);
var sourceSite = audit.SourceSiteId ?? string.Empty;
// Channel string form mirrors the AuditChannel-to-string convention used
// by SiteCallOperational + CachedCallLifecycleBridge.BuildPacket.
var channelString = auditRow.Channel.ToString();
var channelString = audit.Channel.ToString();
var target = auditRow.Target ?? snapshot.TargetSummary ?? string.Empty;
var operationalDto = new SiteCallOperationalDto
@@ -16,6 +16,7 @@
<PackageReference Include="Microsoft.Extensions.Logging.Abstractions" />
<PackageReference Include="Microsoft.Extensions.Options" />
<PackageReference Include="Microsoft.Extensions.Options.ConfigurationExtensions" />
<PackageReference Include="ZB.MOM.WW.Configuration" />
</ItemGroup>
<ItemGroup>
@@ -13,6 +13,7 @@ public static class ApiMethodCommands
/// <param name="formatOption">Global option for the output format.</param>
/// <param name="usernameOption">Global option for the authentication username.</param>
/// <param name="passwordOption">Global option for the authentication password.</param>
/// <returns>The configured <c>api-method</c> command with all subcommands registered.</returns>
public static Command Build(Option<string> urlOption, Option<string> formatOption, Option<string> usernameOption, Option<string> passwordOption)
{
var command = new Command("api-method") { Description = "Manage inbound API methods" };
@@ -18,6 +18,7 @@ public static class AuditCommands
/// <param name="formatOption">Global <c>--format</c> option for output format.</param>
/// <param name="usernameOption">Global <c>--username</c> option for authentication.</param>
/// <param name="passwordOption">Global <c>--password</c> option for authentication.</param>
/// <returns>The configured <c>audit</c> <see cref="Command"/> with all sub-commands attached.</returns>
public static Command Build(Option<string> urlOption, Option<string> formatOption, Option<string> usernameOption, Option<string> passwordOption)
{
var command = new Command("audit") { Description = "Query and export the centralized audit log" };
@@ -74,6 +74,7 @@ public static class AuditExportHelpers
/// </summary>
/// <param name="args">The export arguments containing filters and format.</param>
/// <param name="now">The current time for resolving relative time specifications.</param>
/// <returns>The full query string (including the leading <c>?</c>) for the export endpoint.</returns>
public static string BuildQueryString(AuditExportArgs args, DateTimeOffset now)
{
var parts = new List<string>();
@@ -116,6 +117,7 @@ public static class AuditExportHelpers
/// <param name="args">The export arguments containing filters and output file path.</param>
/// <param name="output">Text writer for command output messages.</param>
/// <param name="now">The current time for resolving relative time specifications.</param>
/// <returns>0 on success, 1 on general error, or 2 on authorization failure.</returns>
public static async Task<int> RunExportAsync(
ManagementHttpClient client, AuditExportArgs args, TextWriter output, DateTimeOffset now)
{
@@ -178,6 +180,8 @@ public static class AuditExportHelpers
/// to extract the <c>code</c> field. Returns null if the body is empty, not valid JSON, or
/// has no <c>code</c> property — callers fall back to "ERROR" in that case.
/// </summary>
/// <param name="body">The HTTP response body string to parse for an error code.</param>
/// <returns>The <c>code</c> string from the JSON error envelope, or null if absent or unparseable.</returns>
internal static string? TryExtractErrorCode(string body)
{
if (string.IsNullOrWhiteSpace(body))
@@ -43,6 +43,7 @@ public static class AuditFormatterFactory
/// </summary>
/// <param name="format">Format name; <c>table</c> selects the table formatter, any other value selects JSONL.</param>
/// <param name="notices">Writer for notice messages emitted during formatting.</param>
/// <returns>The <see cref="IAuditFormatter"/> appropriate for the requested format.</returns>
public static IAuditFormatter Create(string format, TextWriter notices)
{
if (string.Equals(format, "table", StringComparison.OrdinalIgnoreCase))
@@ -50,6 +50,7 @@ public static class AuditLogCommands
/// <param name="formatOption">Global output format option.</param>
/// <param name="usernameOption">Global username option.</param>
/// <param name="passwordOption">Global password option.</param>
/// <returns>The configured <c>audit-config</c> command with all sub-commands registered.</returns>
public static Command Build(Option<string> urlOption, Option<string> formatOption, Option<string> usernameOption, Option<string> passwordOption)
{
var command = new Command("audit-config") { Description = "Query the configuration-change audit log" };
@@ -61,6 +61,7 @@ public static class AuditQueryHelpers
/// <param name="spec">The time specification string.</param>
/// <param name="now">The current time used as reference for relative specs.</param>
/// <exception cref="FormatException">The spec is neither a known relative form nor a parseable ISO-8601 timestamp.</exception>
/// <returns>The resolved absolute <see cref="DateTimeOffset"/> in UTC.</returns>
public static DateTimeOffset ResolveTimeSpec(string spec, DateTimeOffset now)
{
if (string.IsNullOrWhiteSpace(spec))
@@ -103,6 +104,7 @@ public static class AuditQueryHelpers
/// <param name="now">The current time for resolving relative time specs.</param>
/// <param name="afterOccurredAtUtc">Optional keyset cursor timestamp.</param>
/// <param name="afterEventId">Optional keyset cursor event ID.</param>
/// <returns>A URL query string (starting with <c>?</c>) containing the encoded filter parameters, or an empty string if no parameters are set.</returns>
public static string BuildQueryString(
AuditQueryArgs args, DateTimeOffset now, DateTimeOffset? afterOccurredAtUtc, string? afterEventId)
{
@@ -169,6 +171,7 @@ public static class AuditQueryHelpers
/// <param name="formatter">The audit result formatter.</param>
/// <param name="output">The output writer for results.</param>
/// <param name="now">The current time for resolving relative time specs.</param>
/// <returns>A task that resolves to <c>0</c> on success, <c>1</c> on HTTP/transport error, or <c>2</c> on authorization failure.</returns>
public static async Task<int> RunQueryAsync(
ManagementHttpClient client,
AuditQueryArgs args,
@@ -14,6 +14,7 @@ public static class AuditVerifyChainHelpers
/// with a real month (01-12). A malformed month (e.g. <c>2026-13</c>) is rejected.
/// </summary>
/// <param name="month">The month string to validate in YYYY-MM format.</param>
/// <returns><c>true</c> if the string is a well-formed YYYY-MM value with a real month; otherwise <c>false</c>.</returns>
public static bool IsValidMonth(string? month)
=> !string.IsNullOrWhiteSpace(month)
&& DateTime.TryParseExact(month, "yyyy-MM", CultureInfo.InvariantCulture,
@@ -61,7 +61,9 @@ public static class BundleCommands
var dbConnectionsOption = NameListOption("--db-connections", "Comma-separated database-connection names");
var notificationListsOption = NameListOption("--notification-lists", "Comma-separated notification-list names");
var smtpConfigsOption = NameListOption("--smtp-configs", "Comma-separated SMTP host names");
var apiKeysOption = NameListOption("--api-keys", "Comma-separated API-key names");
// Inbound API keys are not transported between environments (re-arch C4) — no
// --api-keys option. Re-create keys and re-grant their method scopes on the
// destination via the admin UI/CLI.
var apiMethodsOption = NameListOption("--api-methods", "Comma-separated API-method names");
var includeDepsOption = new Option<bool>("--include-dependencies")
{
@@ -85,7 +87,6 @@ public static class BundleCommands
cmd.Add(dbConnectionsOption);
cmd.Add(notificationListsOption);
cmd.Add(smtpConfigsOption);
cmd.Add(apiKeysOption);
cmd.Add(apiMethodsOption);
cmd.Add(includeDepsOption);
cmd.Add(sourceEnvOption);
@@ -106,7 +107,6 @@ public static class BundleCommands
DatabaseConnectionNames: result.GetValue(dbConnectionsOption),
NotificationListNames: result.GetValue(notificationListsOption),
SmtpConfigurationNames: result.GetValue(smtpConfigsOption),
ApiKeyNames: result.GetValue(apiKeysOption),
ApiMethodNames: result.GetValue(apiMethodsOption),
IncludeDependencies: includeDeps,
Passphrase: passphrase,
@@ -307,6 +307,13 @@ public static class BundleCommands
// for the post-write summary line.
internal const int Base64StreamChunkChars = 1024 * 1024; // 1 MB of base64 chars ≈ 768 KB decoded
/// <summary>
/// Decodes a base64 string into <paramref name="outputPath"/> in chunked fashion to avoid
/// large intermediate allocations. Returns the total number of decoded bytes written.
/// </summary>
/// <param name="base64">The base64-encoded content to decode and write.</param>
/// <param name="outputPath">Destination file path; created or overwritten.</param>
/// <returns>Total number of bytes written to the output file.</returns>
internal static long StreamBase64ToFile(string base64, string outputPath)
{
if (base64 is null) throw new ArgumentNullException(nameof(base64));
@@ -17,6 +17,7 @@ internal static class CliOptions
/// typo (e.g. <c>--format tabel</c>) is rejected with a clear parse error rather
/// than silently falling through to JSON.
/// </summary>
/// <returns>The configured <c>--format</c> option constrained to "json" or "table".</returns>
internal static Option<string> CreateFormatOption()
{
var formatOption = new Option<string>("--format")

Some files were not shown because too many files have changed in this diff Show More