Compare commits
334 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 5198b114b4 | |||
| fd76c19007 | |||
| 24cdfe373c | |||
| 1ba62052d6 | |||
| cfd8f1ecf4 | |||
| 6aac4c8ed7 | |||
| 85bb61a1f3 | |||
| 705ae95404 | |||
| 6f5a35f222 | |||
| 0149ce6180 | |||
| 6b16a48886 | |||
| 990731d12f | |||
| fd12021984 | |||
| 4002f4197b | |||
| 6ffa47f258 | |||
| c9229c35fc | |||
| aadb1fd72a | |||
| 8243f61e96 | |||
| 53508c79b2 | |||
| 849a011400 | |||
| 405de525ca | |||
| 77922abb33 | |||
| 5f544bfe1e | |||
| aaa6df24cf | |||
| ae7329034f | |||
| e36f0bf9c8 | |||
| a3eb659b75 | |||
| d34f536220 | |||
| 40955bbca6 | |||
| 7a386a80ce | |||
| c503df4c4c | |||
| f1478c5a19 | |||
| f64a7aed02 | |||
| 2a76be1f94 | |||
| 37c7a0e5ac | |||
| b3b02a8cb6 | |||
| 44f1ee372a | |||
| d73b459057 | |||
| 7e9d74697b | |||
| 3cf2b4d47e | |||
| 7816b840c1 | |||
| ac1f73cf8a | |||
| e3519fdb39 | |||
| 6f0d2ca499 | |||
| fdd1a4b886 | |||
| 6f59a1b546 | |||
| de5280d1c7 | |||
| 8c78913503 | |||
| 6d073046c6 | |||
| 5fe08eaceb | |||
| 44f7aabe31 | |||
| babf5b99e7 | |||
| 194cae2fbf | |||
| 8fd0cf355b | |||
| ef5cf76026 | |||
| 80076a3951 | |||
| 1c9b2445ad | |||
| 163446948d | |||
| e58e038db9 | |||
| c66ef71017 | |||
| 399b4aac92 | |||
| ec92d55ebf | |||
| 932fda5594 | |||
| 5492c94e2f | |||
| 7a1c974839 | |||
| ff004e2e48 | |||
| 36d58e8988 | |||
| ba8ddcc032 | |||
| d40ee85e14 | |||
| 4b3a692170 | |||
| 91682cd862 | |||
| 2fa46ed400 | |||
| 3263b39477 | |||
| a1bdd94d4c | |||
| 263884fa63 | |||
| 9ba453191b | |||
| fac31c6018 | |||
| 9c955da2e7 | |||
| 6dea84cd28 | |||
| 8744630adb | |||
| 943c2ced39 | |||
| 38fc9b4102 | |||
| 1c20e81d77 | |||
| 450f8bca28 | |||
| ae4480e7aa | |||
| e052aa4ff8 | |||
| 13e84a76a7 | |||
| 12b86bea7a | |||
| a9f45b0861 | |||
| 2d13886286 | |||
| 8c2382c2bc | |||
| 6d7a03e099 | |||
| eb5fa8f2bc | |||
| 2138534581 | |||
| 66f6724c5d | |||
| ef49b55cf6 | |||
| 2744011ce9 | |||
| 70ed8d4557 | |||
| 42333a72ed | |||
| e93f655ce4 | |||
| 75b060e0a8 | |||
| cc2d6e91f1 | |||
| 660fdc4e93 | |||
| 6069a20e0f | |||
| c763bd9a04 | |||
| 640fd07454 | |||
| 25d9acbce3 | |||
| b0584f7a08 | |||
| db05af897e | |||
| adc490b690 | |||
| 1856b63f0c | |||
| 4eeda45f0e | |||
| b409afda2e | |||
| 23c0fd417e | |||
| 9b1379ed9b | |||
| 5a7f3e8bf6 | |||
| 37f17dc4a8 | |||
| ad7b330f43 | |||
| bba2ef1b4d | |||
| 25cdf857c9 | |||
| e7b40c1c50 | |||
| dae6de2c48 | |||
| ac7fc9ce4d | |||
| 065c8259ae | |||
| a7eea0a795 | |||
| 02727b3a66 | |||
| 56b26339ca | |||
| 1c862989b4 | |||
| 3c3f7770c1 | |||
| 855df759b5 | |||
| 6de377a39e | |||
| 1dfd67a90d | |||
| b31747a632 | |||
| e4d902753b | |||
| c410fc6d43 | |||
| f48efa7ca8 | |||
| d20e8f4e9d | |||
| 73a19c6f02 | |||
| c3d4e6b1e0 | |||
| f063b35633 | |||
| f4a7be4929 | |||
| a3b0fb7f08 | |||
| f81750b2aa | |||
| 6fe23a4d9b | |||
| 047988e4c8 | |||
| 63eb1f4225 | |||
| 42430dd10a | |||
| 2145b29d4d | |||
| 73719ee066 | |||
| 0a97fff906 | |||
| 2b54290c7f | |||
| de110f8b42 | |||
| bedfa6b8f3 | |||
| 6667f345fa | |||
| 3162286ade | |||
| e416b21dad | |||
| 0f28d13da7 | |||
| b86d7c61ab | |||
| 1c38dd540f | |||
| 4ca0b3ce2a | |||
| db34ba66bf | |||
| ee10f61582 | |||
| f0ee125afa | |||
| dd3351da93 | |||
| 82a8bbf225 | |||
| 9bf1497f03 | |||
| 87cae88f92 | |||
| b679430d13 | |||
| 126956eee6 | |||
| 5c3d601198 | |||
| ff8766ec8b | |||
| 55fbcce7a8 | |||
| 01480c6ea2 | |||
| 7173a79ad7 | |||
| d745ef0715 | |||
| eb22d3740f | |||
| ed442c7c8c | |||
| 6064c5c0fc | |||
| 3592e74085 | |||
| da68a2af7b | |||
| 7723bfb712 | |||
| a15ceb3ec9 | |||
| de839627ed | |||
| db32a149d3 | |||
| ce9d6301e3 | |||
| d9c99242a3 | |||
| 7d9550f779 | |||
| fb423b11ab | |||
| 08743bc42d | |||
| 8ac5ebe97e | |||
| e41a18ba7d | |||
| f80eea375c | |||
| 3f8b41182a | |||
| e8ae0e8544 | |||
| 39a3ca3347 | |||
| d3d4a5b13d | |||
| 9cc44cc8b2 | |||
| c929562e41 | |||
| 34ea97bae9 | |||
| 302380ef8e | |||
| e6e450a257 | |||
| 31befa1238 | |||
| f205746894 | |||
| 8d922391b8 | |||
| 72388a7616 | |||
| e6f7a7ff79 | |||
| 0ae1a254d7 | |||
| 61ec4161bf | |||
| c7ddfc7140 | |||
| 8a3e61c670 | |||
| 2039b1ddca | |||
| 117fa39d35 | |||
| 1517b9a03f | |||
| 1bbfad3fcd | |||
| 36a598840f | |||
| acb160ecce | |||
| 0daa63076d | |||
| 08ccd72365 | |||
| c334de03f4 | |||
| d93ca4c56e | |||
| fec0bb10ff | |||
| d08cedc8c4 | |||
| ec82027bd2 | |||
| d8d47821e3 | |||
| 38b51ef894 | |||
| d8dfbc79f4 | |||
| c2cd62e72a | |||
| e681a1f0e1 | |||
| 7589ea8da9 | |||
| e471a5a855 | |||
| 77cc4865c5 | |||
| d46cb56934 | |||
| d43d43d795 | |||
| 00ec265980 | |||
| f839f6ce45 | |||
| 354314dfe0 | |||
| 320e4d7479 | |||
| 17ef5f85de | |||
| 5efbb9a985 | |||
| 397498c120 | |||
| e0026c6da7 | |||
| 627c48c458 | |||
| a08ad09514 | |||
| e7ed858920 | |||
| f749a5f977 | |||
| 547a7b98e5 | |||
| 7ecf6448e3 | |||
| 562a1d1678 | |||
| 82745ef916 | |||
| 0be1feb561 | |||
| 8bb860ad5f | |||
| 22bac058dd | |||
| 34e464edab | |||
| 016f5d48a6 | |||
| 36332e5a94 | |||
| 0fa4ac5525 | |||
| 0f90c0ad9c | |||
| 25beb833fc | |||
| c95758c6ce | |||
| 1629a72093 | |||
| adcab9dcfc | |||
| d54c3da291 | |||
| 1bb0e62bb3 | |||
| 67b86aa683 | |||
| 0904401f1e | |||
| 113f00a6fa | |||
| 17861efa51 | |||
| 558f9ceb39 | |||
| a5653b4296 | |||
| c8b5871782 | |||
| 4b61e29e27 | |||
| 5e80f64cd8 | |||
| 213b9c7c0a | |||
| af22aa7ce1 | |||
| 9e7bc7b541 | |||
| 9b05e48ea6 | |||
| ad9872705d | |||
| afdf581e32 | |||
| 1d495d1a87 | |||
| 2ff62a2ceb | |||
| b88c75c116 | |||
| 3326bddeb0 | |||
| 05614e037a | |||
| 6a77c12735 | |||
| 703cb2d392 | |||
| 517437b0d9 | |||
| 41358c1cee | |||
| 77a05a8960 | |||
| 82e3eb0e93 | |||
| ab3721a2e8 | |||
| c41f43c87f | |||
| 4dc9f9e159 | |||
| 435c853dce | |||
| 04e00d56c6 | |||
| b8dece0e70 | |||
| 8d52890245 | |||
| fb589bf1da | |||
| c547f82957 | |||
| 6056ad58b0 | |||
| 5696a8af9f | |||
| 07cd185368 | |||
| 2c59d59b61 | |||
| 3022aa8379 | |||
| 761595309b | |||
| 87ac9b8a4d | |||
| ed7fddb0b5 | |||
| 397a62677f | |||
| 926ca902bd | |||
| f9b942bb94 | |||
| dff9e0aa76 | |||
| f2513a4ef4 | |||
| c8b8cd9150 | |||
| bb35453d58 | |||
| ba166bf503 | |||
| 7585612347 | |||
| b9e0ef4650 | |||
| aac3ee9bb3 | |||
| 097bb3361e | |||
| ef4614d710 | |||
| bb5519f6f2 | |||
| d7a16084b5 | |||
| f2d710a9dc | |||
| 0dd3be0bd8 | |||
| a59f5ec1ff | |||
| 6ccf3766dc | |||
| 0b56c809e1 | |||
| b83d16364a | |||
| 2a7e7289b3 | |||
| 0f96562bfd | |||
| a4ba2dfe01 | |||
| 0388720390 | |||
| f177b58699 | |||
| bbfa0c515e | |||
| d4e86c1b1d |
@@ -36,7 +36,7 @@ This project contains design documentation for a distributed SCADA system built
|
||||
- Use `git diff` to review changes before committing.
|
||||
- Commit related changes together with a descriptive message summarizing the design decision.
|
||||
|
||||
## Current Component List (20 components)
|
||||
## Current Component List (23 components)
|
||||
|
||||
1. Template Engine — Template modeling, inheritance, composition, validation, flattening, diffs.
|
||||
2. Deployment Manager — Central-side deployment pipeline, system-wide artifact deployment, instance lifecycle.
|
||||
@@ -45,7 +45,7 @@ This project contains design documentation for a distributed SCADA system built
|
||||
5. Central–Site Communication — Akka.NET ClusterClient (command/control) + gRPC server-streaming (real-time data), message patterns, debug streaming.
|
||||
6. Store-and-Forward Engine — Buffering, fixed-interval retry, parking, SQLite persistence, replication.
|
||||
7. External System Gateway — External system definitions, API method invocation, database connections.
|
||||
8. Notification Service — Notification lists, email delivery, store-and-forward integration.
|
||||
8. Notification Service — Central-only notification-list and SMTP definitions, per-type delivery adapters (sites no longer deliver notifications).
|
||||
9. Central UI — Web-based management interface, all workflows.
|
||||
10. Security & Auth — LDAP/AD authentication, role-based authorization, site-scoped permissions.
|
||||
11. Health Monitoring — Site health metrics collection and central reporting.
|
||||
@@ -58,6 +58,9 @@ This project contains design documentation for a distributed SCADA system built
|
||||
18. Management Service — Akka.NET actor providing programmatic access to all admin operations, ClusterClientReceptionist registration.
|
||||
19. CLI — Command-line tool using HTTP Management API, System.CommandLine, JSON/table output.
|
||||
20. Traefik Proxy — Reverse proxy/load balancer fronting central cluster, active node routing via `/health/active`, automatic failover.
|
||||
21. Notification Outbox — Central component ingesting store-and-forwarded notifications, `Notifications` audit table, dispatcher loop, retry/parking, delivery KPIs.
|
||||
22. Site Call Audit — Central component auditing site cached calls (`CachedCall`/`CachedWrite`); `SiteCalls` audit table, telemetry ingest, reconciliation, KPIs, central→site Retry/Discard relay; sites remain the source of truth.
|
||||
23. Audit Log — Central append-only AuditLog table spanning every script-trust-boundary action (outbound API sync+cached, outbound DB sync+cached, notifications, inbound API). Site SQLite hot-path + gRPC telemetry + reconciliation; combined telemetry with Site Call Audit; central direct-write for Notification Outbox dispatch + Inbound API; monthly partitioning, 365-day retention.
|
||||
|
||||
## Key Design Decisions (for context across sessions)
|
||||
|
||||
@@ -88,6 +91,9 @@ This project contains design documentation for a distributed SCADA system built
|
||||
- Dual call modes: `ExternalSystem.Call()` (synchronous) and `ExternalSystem.CachedCall()` (store-and-forward on transient failure).
|
||||
- Error classification: HTTP 5xx/408/429/connection errors = transient; other 4xx = permanent (returned to script).
|
||||
- Notification Service: SMTP with OAuth2 Client Credentials (Microsoft 365) or Basic Auth. BCC delivery, plain text.
|
||||
- Notification delivery is central-only: sites store-and-forward notifications to the central cluster (target = central, not SMTP); sites never talk to SMTP. Notification lists and SMTP config are no longer deployed to sites; recipient resolution happens at central, at delivery time.
|
||||
- Notification lists carry a `Type` discriminator (`Email` now; `Teams` and others later). `Notify.To("list")` is type-agnostic; delivery is via per-type `INotificationDeliveryAdapter` (success/transient/permanent classification, same pattern as External System Gateway).
|
||||
- `Notify.Send` is async — returns a `NotificationId` (GUID, idempotency key) status handle immediately. `Notify.Status(notificationId)` returns a status record (status, retry count, last error, key timestamps); answered site-locally as `Forwarding` while still in the site S&F buffer, otherwise round-trips to central.
|
||||
- Inbound API: `POST /api/{methodName}`, `X-API-Key` header, flat JSON, extended type system (Object, List).
|
||||
|
||||
### Templates & Deployment
|
||||
@@ -109,6 +115,31 @@ This project contains design documentation for a distributed SCADA system built
|
||||
- Async best-effort replication to standby (no ack wait).
|
||||
- Messages not cleared on instance deletion.
|
||||
- CachedCall idempotency is the caller's responsibility.
|
||||
- Notification Outbox: central `NotificationOutboxActor` singleton on the active central node — the first centrally-hosted store-and-forward component (the S&F Engine remains site-only).
|
||||
- `Notifications` table in central MS SQL is the single source of audit truth (one row per notification); type-agnostic via the `Type` discriminator.
|
||||
- Status lifecycle `Pending → Retrying → Delivered / Parked / Discarded`, plus site-local `Forwarding` (never persisted centrally).
|
||||
- Dispatcher loop polls due rows, resolves the list, delivers via the typed adapter; transient failures are retried and move to `Parked` once the retry limit is exhausted, permanent failures park immediately.
|
||||
- Site→central handoff is at-least-once: ack-after-persist plus insert-if-not-exists on `NotificationId`.
|
||||
- No Akka replication — MS SQL is the HA store; daily purge of terminal rows after a configurable window (default 365 days).
|
||||
- Notification Outbox retry reuses central SMTP max-retry-count and fixed interval.
|
||||
- Cached calls (`ExternalSystem.CachedCall`, `Database.CachedWrite`) return a `TrackedOperationId` tracking handle, unified with `Notify.Send`'s existing tracking model (`Notify.Status` retained as a thin alias).
|
||||
- A site-local operation tracking table (SQLite, alongside the S&F buffer) is the source of truth for cached-call status; `Tracking.Status(id)` reads it site-locally and authoritatively; terminal rows purged after a configurable window (default 7 days).
|
||||
- Unified tracking status lifecycle `Pending → Retrying → Delivered / Parked / Failed / Discarded`; `Failed` = permanent failure (also returned synchronously to the calling script). No `Forwarding` state for cached calls.
|
||||
- Site Call Audit (#22): central `SiteCallAuditActor` singleton with a `SiteCalls` audit table (central MS SQL) fed by best-effort site telemetry plus periodic reconciliation pulls — an eventually-consistent mirror, NOT a dispatcher; cached-call delivery stays site-local. Ingest is insert-if-not-exists then upsert-on-newer-status.
|
||||
- Central UI Site Calls page + central→site `RetryParkedOperation`/`DiscardParkedOperation` relay for parked cached calls; central never mutates the `SiteCalls` row directly.
|
||||
|
||||
### Centralized Audit Log
|
||||
- Layered design — append-only `AuditLog` (#23) sits alongside operational `Notifications` (#21) and `SiteCalls` (#22), not replacing them.
|
||||
- Scope = script trust boundary: outbound API (sync + cached), outbound DB (sync + cached), notifications, inbound API. Framework/internal traffic is explicitly excluded.
|
||||
- One row per lifecycle event; cached calls produce 4+ rows per operation (`Submitted`, `Forwarded`, `Attempted`, `Delivered`/`Parked`/`Discarded`).
|
||||
- `ExecutionId` (`uniqueidentifier NULL`) is the universal per-run correlation value — every audit row emitted by one script execution / inbound request shares it; `CorrelationId` remains the per-operation lifecycle id (NULL for sync one-shots).
|
||||
- Site SQLite hot-path first, then gRPC telemetry to central; ingest is idempotent on `EventId`; periodic reconciliation pull as fallback when telemetry is lost.
|
||||
- Cached operations: site emits a single additively-extended `CachedCallTelemetry` packet carrying both audit events and operational state; central writes `AuditLog` + `SiteCalls` in one transaction.
|
||||
- Payload cap 8 KB by default / 64 KB on error rows; auth headers redacted by default; SQL parameter values captured by default; per-target redaction opt-in.
|
||||
- Audit-write failure NEVER aborts the user-facing action — audit is best-effort, the action's own success/failure path is authoritative.
|
||||
- 365-day central retention with monthly partition-switch purge; 7-day site SQLite retention with a hard `ForwardState` invariant (no row purged until forwarded or reconciled).
|
||||
- Append-only enforced via DB roles (writer role has INSERT only, no UPDATE/DELETE); hash-chain tamper evidence and Parquet archival are deferred to v1.x.
|
||||
- Central UI: new top-level **Audit** nav group + Audit Log page, with drill-ins from Notifications, Site Calls, External Systems, Inbound API Keys, Sites, and Instances.
|
||||
|
||||
### Security & Auth
|
||||
- Authentication: direct LDAP bind (username/password), no Kerberos/NTLM. LDAPS/StartTLS required.
|
||||
@@ -130,6 +161,10 @@ This project contains design documentation for a distributed SCADA system built
|
||||
- Health reports: 30s interval, 60s offline threshold, monotonic sequence numbers, raw error counts per interval.
|
||||
- Dead letter monitoring as a health metric.
|
||||
- Site Event Logging: 30-day retention, 1GB storage cap, daily purge, paginated queries with keyword search.
|
||||
- Notification Outbox KPIs are central-computed point-in-time from the `Notifications` table (global + per-source-site): queue depth, stuck count, parked count, delivered-last-interval, oldest-pending age.
|
||||
- Stuck = `Pending`/`Retrying` older than a configurable age threshold (default 10 min) — display-only (KPI count + row badge), no escalation/alerting.
|
||||
- Headline KPI tiles surface on the Health dashboard; a new Central UI Notification Outbox page offers a queryable list with Retry/Discard actions on parked notifications.
|
||||
- Site Call Audit KPIs are central-computed point-in-time from the `SiteCalls` table (global + per-site), mirroring the Notification Outbox KPI shape; tiles surface on the Health dashboard alongside a queryable Central UI Site Calls page with Retry/Discard on parked rows.
|
||||
|
||||
### Code Organization
|
||||
- Entity classes are persistence-ignorant POCOs in Commons; EF mappings in Configuration Database.
|
||||
|
||||
@@ -29,6 +29,7 @@
|
||||
<PackageVersion Include="Microsoft.AspNetCore.DataProtection" Version="10.0.7" />
|
||||
<PackageVersion Include="Microsoft.AspNetCore.DataProtection.EntityFrameworkCore" Version="10.0.7" />
|
||||
<PackageVersion Include="Microsoft.AspNetCore.Mvc.Testing" Version="10.0.7" />
|
||||
<PackageVersion Include="Microsoft.AspNetCore.TestHost" Version="10.0.7" />
|
||||
<PackageVersion Include="Microsoft.AspNetCore.SignalR.Client" Version="9.0.3" />
|
||||
<PackageVersion Include="Microsoft.CodeAnalysis.CSharp.Scripting" Version="5.0.0" />
|
||||
<PackageVersion Include="Microsoft.CodeAnalysis.CSharp.Workspaces" Version="5.0.0" />
|
||||
@@ -64,6 +65,14 @@
|
||||
<PackageVersion Include="System.IdentityModel.Tokens.Jwt" Version="8.11.0" />
|
||||
<PackageVersion Include="xunit" Version="2.9.3" />
|
||||
<PackageVersion Include="xunit.runner.visualstudio" Version="3.1.4" />
|
||||
<!--
|
||||
Xunit.SkippableFact provides [SkippableFact] + Skip.IfNot/Skip.If for
|
||||
xunit v2. The native Skip API (Assert.Skip / Assert.SkipUnless /
|
||||
Assert.SkipWhen) only exists in xunit v3; xunit 2.9.x lacks it. Used by
|
||||
Bundle C MSSQL integration tests in ScadaLink.ConfigurationDatabase.Tests
|
||||
to mark tests as Skipped (not silently Passed) when MSSQL is unreachable.
|
||||
-->
|
||||
<PackageVersion Include="Xunit.SkippableFact" Version="1.5.61" />
|
||||
</ItemGroup>
|
||||
|
||||
</Project>
|
||||
|
||||
@@ -14,7 +14,7 @@ This document serves as the master index for the SCADA system design. The system
|
||||
| Central Database | MS SQL Server, Entity Framework Core |
|
||||
| Site Storage | SQLite (deployed configs, S&F buffer, event logs) |
|
||||
| Authentication | Direct LDAP/AD bind (LDAPS/StartTLS), JWT sessions |
|
||||
| Notifications | SMTP with OAuth2 Client Credentials (Microsoft 365) |
|
||||
| Notifications | Delivered from the central cluster (SMTP, OAuth2/Microsoft 365); store-and-forwarded from sites |
|
||||
| Hosting | Windows Server, Windows Service |
|
||||
| Cluster | Akka.NET Cluster (active/standby, keep-oldest SBR) |
|
||||
| Logging | Serilog (structured) |
|
||||
@@ -38,10 +38,10 @@ This document serves as the master index for the SCADA system design. The system
|
||||
| 2 | Deployment Manager | [docs/requirements/Component-DeploymentManager.md](docs/requirements/Component-DeploymentManager.md) | Central-side deployment pipeline with deployment ID/idempotency, per-instance operation lock, state transition matrix, all-or-nothing site apply, system-wide artifact deployment with per-site status. |
|
||||
| 3 | Site Runtime | [docs/requirements/Component-SiteRuntime.md](docs/requirements/Component-SiteRuntime.md) | Site-side actor hierarchy with explicit supervision strategies, staggered startup, script trust model (constrained APIs), Tell/Ask conventions, concurrency serialization, and site-wide Akka stream with per-subscriber backpressure. |
|
||||
| 4 | Data Connection Layer | [docs/requirements/Component-DataConnectionLayer.md](docs/requirements/Component-DataConnectionLayer.md) | Common data connection interface (OPC UA, custom), Become/Stash connection actor model, auto-reconnect, immediate bad quality on disconnect, transparent re-subscribe, synchronous write failures, tag path resolution retry. |
|
||||
| 5 | Central–Site Communication | [docs/requirements/Component-Communication.md](docs/requirements/Component-Communication.md) | Dual transport: Akka.NET ClusterClient (command/control) + gRPC server-streaming (real-time data). 8 message patterns with per-pattern timeouts, SiteStreamGrpcServer/Client, application-level correlation IDs, transport heartbeat config, gRPC keepalive, message ordering, connection failure behavior. |
|
||||
| 5 | Central–Site Communication | [docs/requirements/Component-Communication.md](docs/requirements/Component-Communication.md) | Dual transport: Akka.NET ClusterClient (command/control) + gRPC server-streaming (real-time data). 9 message patterns with per-pattern timeouts, SiteStreamGrpcServer/Client, application-level correlation IDs, transport heartbeat config, gRPC keepalive, message ordering, connection failure behavior. |
|
||||
| 6 | Store-and-Forward Engine | [docs/requirements/Component-StoreAndForward.md](docs/requirements/Component-StoreAndForward.md) | Buffering (transient failures only), fixed-interval retry, parking, async best-effort replication, SQLite persistence at sites. |
|
||||
| 7 | External System Gateway | [docs/requirements/Component-ExternalSystemGateway.md](docs/requirements/Component-ExternalSystemGateway.md) | HTTP/REST + JSON, API key/Basic Auth, per-system timeout, dual call modes (Call/CachedCall), transient/permanent error classification, dedicated blocking I/O dispatcher, ADO.NET connection pooling. |
|
||||
| 8 | Notification Service | [docs/requirements/Component-NotificationService.md](docs/requirements/Component-NotificationService.md) | SMTP with OAuth2 (M365) or Basic Auth, BCC delivery, plain text, transient/permanent SMTP error classification, store-and-forward integration. |
|
||||
| 8 | Notification Service | [docs/requirements/Component-NotificationService.md](docs/requirements/Component-NotificationService.md) | Central-only — manages typed notification-list and SMTP definitions, supplies per-type delivery adapters (SMTP with OAuth2 (M365) or Basic Auth, BCC, plain text); delivery performed by the Notification Outbox. |
|
||||
| 9 | Central UI | [docs/requirements/Component-CentralUI.md](docs/requirements/Component-CentralUI.md) | Blazor Server with SignalR real-time push, load balancer failover with JWT, all management workflows. |
|
||||
| 10 | Security & Auth | [docs/requirements/Component-Security.md](docs/requirements/Component-Security.md) | Direct LDAP bind (LDAPS/StartTLS), JWT sessions (HMAC-SHA256, 15-min refresh, 30-min idle), role-based authorization, site-scoped permissions. |
|
||||
| 11 | Health Monitoring | [docs/requirements/Component-HealthMonitoring.md](docs/requirements/Component-HealthMonitoring.md) | 30s report interval, 60s offline threshold, monotonic sequence numbers, raw error counts, tag resolution counts, dead letter monitoring. |
|
||||
@@ -54,6 +54,9 @@ This document serves as the master index for the SCADA system design. The system
|
||||
| 18 | Management Service | [docs/requirements/Component-ManagementService.md](docs/requirements/Component-ManagementService.md) | Akka.NET ManagementActor on central, ClusterClientReceptionist registration, programmatic access to all admin operations, CLI interface. |
|
||||
| 19 | CLI | [docs/requirements/Component-CLI.md](docs/requirements/Component-CLI.md) | Standalone command-line tool, System.CommandLine, HTTP transport via Management API, JSON/table output, mirrors all Management Service operations. |
|
||||
| 20 | Traefik Proxy | [docs/requirements/Component-TraefikProxy.md](docs/requirements/Component-TraefikProxy.md) | Reverse proxy/load balancer fronting central cluster, active node routing via `/health/active`, automatic failover. |
|
||||
| 21 | Notification Outbox | [docs/requirements/Component-NotificationOutbox.md](docs/requirements/Component-NotificationOutbox.md) | Central component ingesting store-and-forwarded notifications into the `Notifications` audit table, with `NotificationOutboxActor` singleton dispatcher, per-type delivery adapters, retry/parking, status tracking, daily purge, and delivery KPIs. |
|
||||
| 22 | Site Call Audit | [docs/requirements/Component-SiteCallAudit.md](docs/requirements/Component-SiteCallAudit.md) | Central component auditing site cached calls (`ExternalSystem.CachedCall`/`Database.CachedWrite`) into the `SiteCalls` audit table, with `SiteCallAuditActor` singleton, telemetry ingest, periodic reconciliation, point-in-time KPIs, daily purge, and central→site Retry/Discard relay for parked calls. |
|
||||
| 23 | Audit Log | [docs/requirements/Component-AuditLog.md](docs/requirements/Component-AuditLog.md) | Central append-only `AuditLog` table spanning every script-trust-boundary action (outbound API sync+cached, outbound DB sync+cached, notifications, inbound API). Site-local SQLite hot-path append + gRPC telemetry + central reconciliation; combined telemetry packet with Site Call Audit; central direct-write for Notification Outbox dispatch + Inbound API middleware; monthly partitioning, 365-day default retention. |
|
||||
|
||||
### Reference Documentation
|
||||
|
||||
@@ -88,6 +91,17 @@ This document serves as the master index for the SCADA system design. The system
|
||||
│ │ Mgmt │ ◄── CLI (HTTP Management API) │
|
||||
│ │ Service │ ManagementActor + Receptionist │
|
||||
│ └──────────┘ │
|
||||
│ ┌──────────┐ ┌──────────┐ ┌──────────┐ │
|
||||
│ │ Ntf │ │ Site │ │ Audit │ Observ. / │
|
||||
│ │ Outbox │ │ Call │ │ Log │ Audit area │
|
||||
│ │ (#21) │ │ Audit │ │ (#23) │ │
|
||||
│ │ │ │ (#22) │ │ │ │
|
||||
│ └────▲─────┘ └────▲─────┘ └────▲─────┘ │
|
||||
│ │ ingests │ ingests │ ingests │
|
||||
│ │ (S&F) │ (telemetry)│ (telemetry + │
|
||||
│ │ │ │ direct-write │
|
||||
│ │ │ │ from Ntf Outbox │
|
||||
│ │ │ │ & Inbound API) │
|
||||
│ ┌───────────────────────────────────┐ │
|
||||
│ │ Akka.NET Communication Layer │ │
|
||||
│ │ ClusterClient: command/control │ │
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
<Solution>
|
||||
<Folder Name="/src/">
|
||||
<Project Path="src/ScadaLink.AuditLog/ScadaLink.AuditLog.csproj" />
|
||||
<Project Path="src/ScadaLink.Commons/ScadaLink.Commons.csproj" />
|
||||
<Project Path="src/ScadaLink.Host/ScadaLink.Host.csproj" />
|
||||
<Project Path="src/ScadaLink.TemplateEngine/ScadaLink.TemplateEngine.csproj" />
|
||||
@@ -10,6 +11,8 @@
|
||||
<Project Path="src/ScadaLink.StoreAndForward/ScadaLink.StoreAndForward.csproj" />
|
||||
<Project Path="src/ScadaLink.ExternalSystemGateway/ScadaLink.ExternalSystemGateway.csproj" />
|
||||
<Project Path="src/ScadaLink.NotificationService/ScadaLink.NotificationService.csproj" />
|
||||
<Project Path="src/ScadaLink.NotificationOutbox/ScadaLink.NotificationOutbox.csproj" />
|
||||
<Project Path="src/ScadaLink.SiteCallAudit/ScadaLink.SiteCallAudit.csproj" />
|
||||
<Project Path="src/ScadaLink.CentralUI/ScadaLink.CentralUI.csproj" />
|
||||
<Project Path="src/ScadaLink.Security/ScadaLink.Security.csproj" />
|
||||
<Project Path="src/ScadaLink.HealthMonitoring/ScadaLink.HealthMonitoring.csproj" />
|
||||
@@ -21,6 +24,7 @@
|
||||
<Project Path="src/ScadaLink.CLI/ScadaLink.CLI.csproj" />
|
||||
</Folder>
|
||||
<Folder Name="/tests/">
|
||||
<Project Path="tests/ScadaLink.AuditLog.Tests/ScadaLink.AuditLog.Tests.csproj" />
|
||||
<Project Path="tests/ScadaLink.Commons.Tests/ScadaLink.Commons.Tests.csproj" />
|
||||
<Project Path="tests/ScadaLink.Host.Tests/ScadaLink.Host.Tests.csproj" />
|
||||
<Project Path="tests/ScadaLink.TemplateEngine.Tests/ScadaLink.TemplateEngine.Tests.csproj" />
|
||||
@@ -31,6 +35,8 @@
|
||||
<Project Path="tests/ScadaLink.StoreAndForward.Tests/ScadaLink.StoreAndForward.Tests.csproj" />
|
||||
<Project Path="tests/ScadaLink.ExternalSystemGateway.Tests/ScadaLink.ExternalSystemGateway.Tests.csproj" />
|
||||
<Project Path="tests/ScadaLink.NotificationService.Tests/ScadaLink.NotificationService.Tests.csproj" />
|
||||
<Project Path="tests/ScadaLink.NotificationOutbox.Tests/ScadaLink.NotificationOutbox.Tests.csproj" />
|
||||
<Project Path="tests/ScadaLink.SiteCallAudit.Tests/ScadaLink.SiteCallAudit.Tests.csproj" />
|
||||
<Project Path="tests/ScadaLink.CentralUI.Tests/ScadaLink.CentralUI.Tests.csproj" />
|
||||
<Project Path="tests/ScadaLink.Security.Tests/ScadaLink.Security.Tests.csproj" />
|
||||
<Project Path="tests/ScadaLink.HealthMonitoring.Tests/ScadaLink.HealthMonitoring.Tests.csproj" />
|
||||
|
||||
@@ -0,0 +1,604 @@
|
||||
# Centralized Audit Log — Design (Working Draft)
|
||||
|
||||
**Status:** Validated — ready for implementation planning.
|
||||
**Owner:** Joseph Doherty
|
||||
**Date:** 2026-05-20
|
||||
**Provisional component number:** #23 Audit Log
|
||||
|
||||
> A new central, append-only audit log capturing every action a script causes to cross the cluster trust boundary: outbound API calls (sync + cached), outbound DB writes (sync + cached), notifications sent, and inbound API requests that invoke a script.
|
||||
|
||||
---
|
||||
|
||||
## 1. Purpose
|
||||
|
||||
Provide a **single forensic + operational record** of every integration action initiated by, or terminating in, a script — answering both:
|
||||
|
||||
- **Compliance / forensic:** "Did instance X send notification Y on date Z? What was the body? Did external system A get called by script B last quarter, and with what result?"
|
||||
- **Operational visibility:** "Why is site S misbehaving right now? What did its scripts touch in the last 10 minutes? Which inbound API caller is hammering us?"
|
||||
|
||||
One store, rich payloads, long retention, dashboards + drilldowns + filter queries.
|
||||
|
||||
The audit log is **not** the operational state store. It does not drive dispatchers, retry loops, or Retry/Discard actions — those remain in [Notification Outbox](#21) and [Site Call Audit](#22). The audit log is the immutable history that **observes** those subsystems and adds coverage where they are silent.
|
||||
|
||||
---
|
||||
|
||||
## 2. Scope — the script trust boundary
|
||||
|
||||
The audit log captures **every action that a script causes to cross the cluster trust boundary**:
|
||||
|
||||
| Channel | Trigger | Direction | Covered today? |
|
||||
|---|---|---|---|
|
||||
| `ExternalSystem.Call(...)` | Script | Outbound | ❌ (gap) |
|
||||
| `ExternalSystem.CachedCall(...)` | Script | Outbound | ✅ `SiteCalls` (Site Call Audit) |
|
||||
| `Database.Connection().Execute*(...)` — writes | Script | Outbound | ❌ (gap) |
|
||||
| `Database.CachedWrite(...)` | Script | Outbound | ✅ `SiteCalls` (Site Call Audit) |
|
||||
| `Notify.To(list).Send(...)` | Script | Outbound | ✅ `Notifications` (Notification Outbox) |
|
||||
| `POST /api/{method}` (Inbound API) | External | Inbound (invokes a script) | ❌ (gap) |
|
||||
|
||||
**Out of scope** — framework traffic is *not* audited:
|
||||
|
||||
- Health checks, heartbeats, cluster membership messages.
|
||||
- gRPC inter-cluster real-time streams (attribute values, alarm states).
|
||||
- Data Connection Layer ↔ OPC UA / custom protocol traffic.
|
||||
- LDAP authentication probes, Traefik routing decisions.
|
||||
- Internal Configuration Database queries by the framework.
|
||||
- Site Event Log writes, audit log writes themselves.
|
||||
|
||||
This boundary is meaningful because the script trust model already controls what scripts can do; the audit log is the record of how that surface was exercised.
|
||||
|
||||
> **Note on DB reads.** Script-initiated reads via `Database.Connection().ExecuteReader(...)` count as actions from a script and ARE in scope. They are expected to be far less common than reads via DCL/subscriptions (which are framework traffic and excluded).
|
||||
|
||||
---
|
||||
|
||||
## 3. Architecture — layered, append-only
|
||||
|
||||
```
|
||||
┌──────────────────────────────────────────────────────────────────────┐
|
||||
│ Central cluster (MS SQL) │
|
||||
│ │
|
||||
│ ┌──────────────────┐ ┌───────────────┐ ┌────────────────────┐ │
|
||||
│ │ Notification │ │ Site Call │ │ Inbound API │ │
|
||||
│ │ Outbox (#21) │ │ Audit (#22) │ │ (#14) │ │
|
||||
│ │ Notifications │ │ SiteCalls │ │ (no audit today) │ │
|
||||
│ └────────┬─────────┘ └───────┬───────┘ └─────────┬──────────┘ │
|
||||
│ │ emits │ emits │ emits │
|
||||
│ ▼ ▼ ▼ │
|
||||
│ ┌────────────────────────────────────────────────────────────┐ │
|
||||
│ │ AuditLog (new, append-only, MS SQL) │ │
|
||||
│ │ one row per lifecycle event across all channels │ │
|
||||
│ └─────────────────────────▲──────────────────────────────────┘ │
|
||||
│ │ telemetry (gRPC, idempotent) │
|
||||
└─────────────────────────────────┼─────────────────────────────────────┘
|
||||
│
|
||||
│
|
||||
┌─────────────────────────────────┼─────────────────────────────────────┐
|
||||
│ Site cluster (SQLite, per active node) │
|
||||
│ │ │
|
||||
│ ┌─────────────────────────┴──────────────────────────────┐ │
|
||||
│ │ Site-local AuditLog (SQLite, hot-path append) │ │
|
||||
│ └────▲───────────────▲───────────────▲───────────────────┘ │
|
||||
│ │ │ │ │
|
||||
│ ┌─────────┴────────┐ ┌───┴──────┐ ┌─────┴────────────┐ │
|
||||
│ │ External System │ │ Database │ │ Site S&F / │ │
|
||||
│ │ Gateway (#7) │ │ Layer │ │ Notifications │ │
|
||||
│ │ sync + cached │ │ sync + │ │ (transitions) │ │
|
||||
│ └──────────────────┘ │ cached │ └──────────────────┘ │
|
||||
│ └──────────┘ │
|
||||
└───────────────────────────────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
**Key properties:**
|
||||
|
||||
- **Strictly append-only.** Once written, an AuditLog row is never updated or deleted (except by retention purge). Operational state (live status, parked-for-retry, etc.) lives in `Notifications` / `SiteCalls` — not in AuditLog.
|
||||
- **One row per lifecycle event.** A cached call that retries three times then parks produces six rows: submitted, forward-ack, attempt #1, attempt #2, attempt #3, parked (see the worked example in §7.1). A sync call produces one row. An inbound API hit produces one row.
|
||||
- **Site-local first for site-originated events.** Hot-path script calls never wait on the network for an audit write.
|
||||
- **Direct write for central-originated events.** Notification delivery attempts and inbound API hits land at central — they write directly to the central `AuditLog`. No detour through site SQLite.
|
||||
- **At-least-once telemetry, idempotent on `EventId`.** Same dedup model as Site Call Audit today.
|
||||
|
||||
---
|
||||
|
||||
## 4. Data Model (first cut)
|
||||
|
||||
Single wide table, polymorphic by `Channel` + `Kind` discriminators, JSON payload column.
|
||||
|
||||
### Central: `AuditLog`
|
||||
|
||||
| Column | Type | Notes |
|
||||
|---|---|---|
|
||||
| `EventId` | `uniqueidentifier` PK | Generated where the event originates (site or central). Idempotency key. |
|
||||
| `OccurredAtUtc` | `datetime2` | When the event happened (call returned, retry attempted, etc.). |
|
||||
| `IngestedAtUtc` | `datetime2` | When central persisted the row (lags `OccurredAtUtc` for site-originated rows). |
|
||||
| `Channel` | `varchar(32)` | `ApiOutbound` \| `DbOutbound` \| `Notification` \| `ApiInbound`. |
|
||||
| `Kind` | `varchar(32)` | Event kind discriminator (see kinds list below). |
|
||||
| `CorrelationId` | `uniqueidentifier` NULL | Ties multi-event operations together. `TrackedOperationId` for cached calls, `NotificationId` for notifications, request-id for inbound API. NULL for sync one-shot calls. |
|
||||
| `SourceSiteId` | `varchar(64)` NULL | NULL for central-originated events (inbound API, central notification dispatch). |
|
||||
| `SourceInstanceId` | `varchar(128)` NULL | Instance whose script initiated the action (when applicable). |
|
||||
| `SourceScript` | `varchar(128)` NULL | Script name within the instance. |
|
||||
| `Actor` | `varchar(128)` NULL | Inbound API: API key name. Outbound: script identity. Central: system user. |
|
||||
| `Target` | `varchar(256)` NULL | Outbound API: external system + method. DB: connection name. Notification: list name. Inbound API: method name. |
|
||||
| `Status` | `varchar(32)` | Outcome of *this event*: `Submitted`, `Forwarded`, `Attempted`, `Delivered`, `Failed`, `Parked`, `Discarded`, `Skipped`. |
|
||||
| `HttpStatus` | `int` NULL | HTTP-bearing events only. |
|
||||
| `DurationMs` | `int` NULL | Call/attempt duration. |
|
||||
| `ErrorMessage` | `nvarchar(1024)` NULL | Truncated; `ErrorDetail` for full text. |
|
||||
| `ErrorDetail` | `nvarchar(max)` NULL | Optional full exception/text on failures. |
|
||||
| `RequestSummary` | `nvarchar(max)` NULL | Truncated request payload (configurable cap, default 8 KB). Headers redacted. |
|
||||
| `ResponseSummary` | `nvarchar(max)` NULL | Truncated response payload (same cap). On error rows the cap is raised (default 64 KB, see §8.1) rather than removed. |
|
||||
| `PayloadTruncated` | `bit` | Set if either summary was truncated. |
|
||||
| `Extra` | `nvarchar(max)` NULL | Channel-specific JSON for fields we don't promote to columns. |
|
||||
|
||||
**Indexes (first cut):**
|
||||
|
||||
- `IX_AuditLog_OccurredAtUtc` — primary time-range index for global scans.
|
||||
- `IX_AuditLog_Site_Occurred (SourceSiteId, OccurredAtUtc)` — per-site filters.
|
||||
- `IX_AuditLog_Correlation (CorrelationId)` — drilldown from a single operation.
|
||||
- `IX_AuditLog_Channel_Status_Occurred (Channel, Status, OccurredAtUtc)` — KPI / dashboard tiles.
|
||||
- `IX_AuditLog_Target_Occurred (Target, OccurredAtUtc)` — "what did we send to system X."
|
||||
- Partitioning by month on `OccurredAtUtc` from day one (purge becomes a partition switch instead of a delete storm).
|
||||
|
||||
**`Kind` values (flat — 10 discriminators across all channels):**
|
||||
|
||||
| Kind | Fires when |
|
||||
|---|---|
|
||||
| `ApiCall` | Sync `ExternalSystem.Call(...)` returns (success or permanent failure). One row per call. |
|
||||
| `ApiCallCached` | A cached outbound-API attempt records its forward-ack (`Forwarded`) or each retry (`Attempted`). |
|
||||
| `DbWrite` | Sync `Database.Connection().Execute*(...)` / `ExecuteReader(...)` completes. One row per call. |
|
||||
| `DbWriteCached` | A cached outbound-DB attempt records its forward-ack (`Forwarded`) or each retry (`Attempted`). |
|
||||
| `NotifySend` | Script's `Notify.Send(...)` is enqueued on the site — first row in a notification's lifecycle (`Status=Submitted`). |
|
||||
| `NotifyDeliver` | Central Notification Outbox dispatcher records a delivery attempt (`Attempted`) or terminal outcome (`Delivered`/`Parked`/`Discarded`). |
|
||||
| `InboundRequest` | An inbound API request completes — one row per request, written at request end with final status. |
|
||||
| `InboundAuthFailure` | An inbound API request was rejected at the auth boundary (bad/missing key). One row, `Status=Failed`, `HttpStatus=401`. |
|
||||
| `CachedSubmit` | Script-side enqueue of a cached call (`ExternalSystem.CachedCall` / `Database.CachedWrite`); first row in the cached-call lifecycle, written to site SQLite before any forward attempt. |
|
||||
| `CachedResolve` | Terminal row for a cached operation — `Status` = `Delivered` / `Failed` / `Parked` / `Discarded`. |
|
||||
|
||||
### Site: `AuditLog` (SQLite)
|
||||
|
||||
Same shape minus `IngestedAtUtc` (irrelevant at the source) plus a local `ForwardState` column:
|
||||
|
||||
- `ForwardState`: `Pending` | `Forwarded` | `Reconciled`. Drives the telemetry loop and reconciliation pull.
|
||||
|
||||
**Site SQLite retention rule (hard invariant):**
|
||||
|
||||
A row is eligible for purge only when **both** conditions hold:
|
||||
|
||||
1. `OccurredAtUtc` is older than the configured site retention window (default **7 days**); AND
|
||||
2. `ForwardState IN ('Forwarded', 'Reconciled')` — i.e., central has acknowledged receipt.
|
||||
|
||||
Rows still in `ForwardState = 'Pending'` are **never** purged on the basis of age. A prolonged central outage will grow the site audit table indefinitely until central is reachable again. This is intentional — losing audit rows to make room is a compliance violation, not a self-healing behavior.
|
||||
|
||||
Nothing caps that growth automatically, so to keep it visible and actionable the site emits a **`SiteAuditBacklog`** health metric (pending row count, oldest pending age, bytes on disk). Crossing operator-configured thresholds surfaces as a Health dashboard warning on the relevant site tile. This is the same pattern used by the Store-and-Forward Engine's backlog metric.
|
||||
|
||||
Central is the durable home; site SQLite is a write-buffer with a forwarding guarantee.
|
||||
|
||||
---
|
||||
|
||||
## 5. Where this fits in the existing component matrix
|
||||
|
||||
This work probably becomes **component #23: Audit Log**, with edges into:
|
||||
|
||||
- **#7 External System Gateway** — emits audit events for sync `Call()`, sync DB writes (and reads from scripts), and cached operations.
|
||||
- **#14 Inbound API** — emits one row per request (success or failure) at request completion.
|
||||
- **#21 Notification Outbox** — emits an audit row on enqueue, on each delivery attempt, and on terminal status.
|
||||
- **#22 Site Call Audit** — emits an audit row on each lifecycle transition (enqueue, attempt, terminal). `SiteCalls` remains the operational state store; AuditLog is the immutable shadow.
|
||||
- **#3 Site Runtime / #16 Commons** — script-trust-boundary call paths gain a thin audit interface.
|
||||
- **#17 Configuration Database** — Audit Log is a separate concern from `IAuditService` (which stays config-change-only). Both coexist.
|
||||
|
||||
---
|
||||
|
||||
## 6. Ingestion paths
|
||||
|
||||
There are three write paths into the central `AuditLog`, all converging on the same table.
|
||||
|
||||
### 6.1 Site hot-path append (site-originated events)
|
||||
|
||||
1. Script issues an action across the trust boundary (`ExternalSystem.Call`, `Database` write/read, `Notify.Send`, etc.).
|
||||
2. The component completing the action (External System Gateway, Database Layer, S&F Engine) builds an `AuditEvent` value object with a fresh `EventId` (Guid v4) and `OccurredAtUtc = UtcNow`.
|
||||
3. The component appends the event to the site-local SQLite `AuditLog` via the `ISiteAuditWriter` interface. Single-statement `INSERT`, `ForwardState = 'Pending'`. The caller awaits only the local write (the await returns once the row is durable in SQLite, typically microseconds) and never waits on central.
|
||||
4. Control returns to the script. No central round-trip on the hot path.
|
||||
|
||||
Failure modes on the hot path:
|
||||
|
||||
- **SQLite write fails** (disk full, IO error): the audit writer logs a critical event to the Site Event Log, surfaces a `SiteAuditWriteFailures` health metric, and *the action proceeds*. We do not fail user-facing actions because the audit write failed — but the operator must be told loudly. (Open question: do we want a "strict mode" where audit-write failure aborts the action? Default off.)
|
||||
- **Audit writer not yet bootstrapped** (very early startup): events buffer in-memory bounded by a small ring; oldest discarded with a warning if it overflows. This window is normally sub-second.
|
||||
|
||||
### 6.2 Telemetry forward (site → central)
|
||||
|
||||
A `SiteAuditTelemetryActor` runs as a singleton on the active site node and drives the forwarding loop:
|
||||
|
||||
1. Selects up to N `Pending` rows from local `AuditLog` ordered by `OccurredAtUtc`.
|
||||
2. Sends them in a batched gRPC `IngestAuditEvents(events)` call to central (over the existing `SiteStream` channel — same transport as cached-call telemetry today).
|
||||
3. Central performs **insert-if-not-exists** on `EventId` (idempotent) and returns the accepted IDs.
|
||||
4. Site flips `ForwardState = 'Forwarded'` for accepted IDs. Rejected IDs (transient central error) stay `Pending` for the next sweep.
|
||||
|
||||
Cadence: short polling interval (default 5s) when the queue is non-empty, longer (default 30s) when idle. Telemetry runs on a dedicated dispatcher so it doesn't compete with the script blocking-I/O dispatcher.
|
||||
|
||||
### 6.3 Reconciliation pull (self-healing for missed telemetry)
|
||||
|
||||
A central `SiteAuditReconciliationActor` periodically (default every 5 minutes per site) asks each site: *"What's the highest `OccurredAtUtc` among your rows with `ForwardState = 'Pending'`? And how many pending rows do you have?"* If central sees a non-empty pending backlog that hasn't drained on its own (e.g., the telemetry actor is wedged), it issues a `PullAuditEvents(sinceUtc, batchSize)` request that returns rows directly. Central inserts-if-not-exists and acks them — the site then flips those rows to `ForwardState = 'Reconciled'`.
|
||||
|
||||
This is the same self-healing pattern Site Call Audit uses for `SiteCalls`.
|
||||
|
||||
### 6.4 Central direct-write (central-originated events)
|
||||
|
||||
Events that originate at central never touch site SQLite:
|
||||
|
||||
- **Inbound API** — request completed at central; one `ApiInbound`/`InboundRequest` row written via `ICentralAuditWriter` synchronously inside the request handler middleware before the HTTP response is flushed. Auth failures emit `ApiInbound`/`InboundAuthFailure` instead.
|
||||
- **Notification Outbox dispatcher** — each delivery attempt writes a `Notification`/`NotifyDeliver` row with `Status=Attempted`; terminal status writes a `Notification`/`NotifyDeliver` row with `Status=Delivered`/`Parked`/`Discarded`. (The site-originated `Notification`/`NotifySend` row, `Status=Submitted`, arrives via §6.2.)
|
||||
Central direct-writes use the same insert-if-not-exists semantics keyed on `EventId`, so a retried request handler can't produce duplicates.
|
||||
|
||||
### 6.5 Cached operations — site emits, central writes twice
|
||||
|
||||
For `ExternalSystem.CachedCall` and `Database.CachedWrite`, the **site** is the source of truth for every audit row. The site writes each lifecycle event — `CachedSubmit` (`Status=Submitted`), then `ApiCallCached`/`DbWriteCached` rows for the forward-ack (`Status=Forwarded`) and each retry (`Status=Attempted`), then a terminal `CachedResolve` row (`Status=Delivered`/`Failed`/`Parked`/`Discarded`) — to its local SQLite `AuditLog` on the hot path (or on the retry tick for `Attempted` rows), then forwards via the same telemetry channel described in §6.2. The telemetry message format gains the audit-row fields additively — one packet per lifecycle transition carries both the operational state update AND the audit row content.
|
||||
|
||||
On receipt, central does two things in **one transaction**:
|
||||
|
||||
1. Insert-if-not-exists the immutable `AuditLog` row, keyed on `EventId`.
|
||||
2. Upsert the operational `SiteCalls` row (existing Site Call Audit behavior — status, retry count, last error, timestamps).
|
||||
|
||||
This collapses what would otherwise be two telemetry concerns into one, keeps site SQLite as the single local source of truth for audit content, and preserves the existing operational `SiteCalls` shape for the dispatcher / UI. No central-side derivation; no double-emission from the site.
|
||||
|
||||
---
|
||||
|
||||
## 7. Per-channel event mapping
|
||||
|
||||
Worked examples — what each `Channel`/`Kind` row actually looks like. (Other columns omitted for brevity unless interesting.)
|
||||
|
||||
### 7.1 `ApiOutbound` — outbound HTTP via External System Gateway
|
||||
|
||||
**Sync call** (`ExternalSystem.Call("Weather", "GetForecast", { city: "Dublin" })` succeeds):
|
||||
|
||||
```
|
||||
EventId = <new guid>
|
||||
Channel = ApiOutbound
|
||||
Kind = ApiCall
|
||||
CorrelationId = NULL -- one-shot, no operation to correlate
|
||||
SourceSiteId = "site-01"
|
||||
SourceInstance = "Plant1.Boiler"
|
||||
SourceScript = "OnHourly"
|
||||
Target = "Weather/GetForecast"
|
||||
Status = Delivered
|
||||
HttpStatus = 200
|
||||
DurationMs = 142
|
||||
RequestSummary = '{"city":"Dublin"}' -- truncated to cap
|
||||
ResponseSummary= '{"tempC":11.4,...}' -- truncated to cap
|
||||
```
|
||||
|
||||
**Cached call** (`ExternalSystem.CachedCall(...)`, hits a 500, retries, succeeds on attempt 3):
|
||||
|
||||
```
|
||||
1. Kind=CachedSubmit Status=Submitted CorrelationId=<tracked-op-id>
|
||||
2. Kind=ApiCallCached Status=Forwarded CorrelationId=<same>
|
||||
3. Kind=ApiCallCached Status=Attempted HttpStatus=500 CorrelationId=<same>
|
||||
4. Kind=ApiCallCached Status=Attempted HttpStatus=500 CorrelationId=<same>
|
||||
5. Kind=ApiCallCached Status=Attempted HttpStatus=200 CorrelationId=<same>
|
||||
6. Kind=CachedResolve Status=Delivered CorrelationId=<same>
|
||||
```
|
||||
|
||||
The shadow of the `SiteCalls` row's lifecycle, but immutable and time-ordered.
|
||||
|
||||
### 7.2 `DbOutbound` — outbound DB via Database layer
|
||||
|
||||
**Sync write** (`db.Execute("INSERT INTO Readings ...", new {...})`):
|
||||
|
||||
```
|
||||
Channel = DbOutbound
|
||||
Kind = DbWrite
|
||||
Target = "PlantDB" -- connection name only, not server
|
||||
CorrelationId = NULL
|
||||
Status = Delivered
|
||||
DurationMs = 9
|
||||
RequestSummary = "INSERT INTO Readings(ts,val) VALUES (@p0,@p1)" -- SQL text
|
||||
Extra = '{"rowsAffected":1,"params":{"p0":"2026-05-20T14:00Z","p1":42.7}}' -- values captured by default
|
||||
|
||||
```
|
||||
|
||||
**Sync read** (`db.Query<...>(...)`):
|
||||
|
||||
```
|
||||
Channel = DbOutbound
|
||||
Kind = DbWrite
|
||||
Status = Delivered
|
||||
DurationMs = 31
|
||||
RequestSummary = "SELECT id, value FROM Readings WHERE ts > @p0"
|
||||
Extra = '{"rowsReturned":42}'
|
||||
ResponseSummary= NULL -- rows not captured by default; opt-in per connection
|
||||
```
|
||||
|
||||
(Reads and writes share the `DbWrite` kind — the kind distinguishes the trust-boundary call shape, not the SQL verb. Distinguish by `RequestSummary` / `Extra.rowsAffected` vs `Extra.rowsReturned` when needed.)
|
||||
|
||||
**Cached write** — same multi-row lifecycle as the cached API example, using `Kind=DbWriteCached` for the `Forwarded` / `Attempted` rows in place of `ApiCallCached`.
|
||||
|
||||
### 7.3 `Notification` — outbound notifications
|
||||
|
||||
```
|
||||
1. Kind=NotifySend Status=Submitted CorrelationId=<NotificationId> SourceSiteId="site-01" SourceInstance="Plant1.Boiler"
|
||||
2. Kind=NotifyDeliver Status=Attempted ErrorMessage="SMTP 451 ..." CorrelationId=<same> SourceSiteId=NULL (dispatch is central)
|
||||
3. Kind=NotifyDeliver Status=Attempted CorrelationId=<same>
|
||||
4. Kind=NotifyDeliver Status=Delivered CorrelationId=<same>
|
||||
Target = "OpsTeamEmail" -- notification list name
|
||||
Extra = '{"resolvedTargets":["a@x.com","b@x.com"], "subject":"Boiler high temp"}'
|
||||
RequestSummary = '...body, truncated...'
|
||||
```
|
||||
|
||||
Note the site→central handoff is implicit: row 1 arrives via §6.2 telemetry (it originated at the site script); rows 2–4 are written by the central dispatcher directly via §6.4.
|
||||
|
||||
### 7.4 `ApiInbound` — inbound API
|
||||
|
||||
One row per request, written at request completion:
|
||||
|
||||
```
|
||||
Channel = ApiInbound
|
||||
Kind = InboundRequest
|
||||
CorrelationId = <request-id> -- the request's correlation header (or generated)
|
||||
SourceSiteId = NULL -- central-originated event
|
||||
Actor = "AcmeSCADA" -- API key name (NOT the key itself)
|
||||
Target = "RecordReading" -- inbound method name
|
||||
Status = Delivered | Failed -- mapped from final HTTP outcome
|
||||
HttpStatus = 200 | 400 | 500
|
||||
DurationMs = 73
|
||||
RequestSummary = '{"siteId":"...","value":12.4}' -- truncated; secrets/PII per redaction policy
|
||||
ResponseSummary= '{"ok":true}' -- on 5xx, captured up to the error cap (64 KB, §8.1)
|
||||
Extra = '{"remoteIp":"203.0.113.42","userAgent":"...","scriptInvoked":"RecordReading.Handle"}'
|
||||
```
|
||||
|
||||
A bad API key → separate kind: `Kind=InboundAuthFailure`, `Status=Failed`, `HttpStatus=401`, `Actor=NULL`, `Extra` carries `remoteIp` for abuse triage.
|
||||
|
||||
---
|
||||
|
||||
## 8. Payload capture policy
|
||||
|
||||
### 8.1 Truncation
|
||||
- Default cap: **8 KB** for each of `RequestSummary` and `ResponseSummary`. Configurable globally; per-target overrides allowed (§8.4).
|
||||
- On any error row (`Status IN ('Failed', 'Parked', 'Discarded')`), the cap is raised to **64 KB** for that row — error context is precious.
|
||||
- When a body is truncated, `PayloadTruncated = 1` and the captured prefix is preserved verbatim (UTF-8 byte-safe truncation, no mid-character cuts).
|
||||
- Bodies exceeding the larger cap are still truncated; nothing beyond the applicable cap is ever stored.
|
||||
|
||||
### 8.2 Redaction
|
||||
Redaction happens **at the write site**, before the row touches SQLite (or central MS SQL for §6.4 events). Unredacted secrets never persist.
|
||||
|
||||
- **HTTP headers** — `Authorization`, `Cookie`, `Set-Cookie`, `X-API-Key`, and any header matching the configured redact-list (regex) become `<redacted>`. List is operator-extensible.
|
||||
- **HTTP bodies** — captured verbatim by default. Operators can register per-External-System / per-Inbound-Method body redactors (regex → replacement) for known secret fields (e.g., `"password"\s*:\s*"[^"]+"`).
|
||||
- **SQL** — statement text and parameter values captured verbatim by default. Per-connection redaction opt-in (e.g., redact parameters whose name matches `@apikey|@token|@password`).
|
||||
- **Notification bodies** — captured per the existing notification rules (no behavioral change from today).
|
||||
- **Safety net** — if a configured redactor throws, the affected payload becomes `"<redacted: redactor error>"` and an `AuditRedactionFailure` health metric increments. We over-redact, never under-redact, on configuration faults.
|
||||
|
||||
### 8.3 Never captured
|
||||
- Raw API key material (only the key *name* via `Actor`).
|
||||
- LDAP bind credentials, cluster secrets, Configuration DB connection strings.
|
||||
- Framework traffic per §2 (out of scope by construction, not by redaction).
|
||||
|
||||
### 8.4 Configurability
|
||||
Bound from `appsettings.json` (new `AuditLog` options class owned by the Audit Log component):
|
||||
|
||||
```jsonc
|
||||
"AuditLog": {
|
||||
"DefaultCapBytes": 8192,
|
||||
"ErrorCapBytes": 65536,
|
||||
"HeaderRedactList": [ "Authorization", "Cookie", "Set-Cookie", "X-API-Key" ],
|
||||
"GlobalBodyRedactors": [
|
||||
{ "Pattern": "\"password\"\\s*:\\s*\"[^\"]+\"", "Replacement": "\"password\":\"<redacted>\"" }
|
||||
],
|
||||
"PerTargetOverrides": {
|
||||
"Weather/GetForecast": { "CapBytes": 4096 },
|
||||
"PlantDB": { "RedactSqlParamsMatching": "@apikey|@token" }
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
Per-target keys bind by External System / Inbound Method / Notification List / Database Connection name.
|
||||
|
||||
---
|
||||
|
||||
## 9. Failure handling & idempotency
|
||||
|
||||
### 9.1 `EventId` is the dedup key
|
||||
- Generated at the originator (site for §6.1/§6.5, central for §6.4). Guid v4.
|
||||
- Central ingest is `INSERT … WHERE NOT EXISTS (SELECT 1 FROM AuditLog WHERE EventId = @id)`, executed under the PK constraint.
|
||||
- Idempotent across telemetry retries, reconciliation pulls, and any combination thereof.
|
||||
|
||||
### 9.2 Central MS SQL outage
|
||||
- Site telemetry calls fail; `ForwardState` stays `Pending`; backlog grows.
|
||||
- Reconciliation pulls also fail.
|
||||
- Site SQLite continues to absorb hot-path writes (no upstream dependency on the hot path).
|
||||
- `SiteAuditBacklog` health metric crosses thresholds → Health dashboard surfaces it on the affected site tile.
|
||||
- On recovery, telemetry drains; insert-if-not-exists handles any overlap.
|
||||
|
||||
### 9.3 Site SQLite write failure
|
||||
- Audit writer fails to append (disk full, schema lock, transient IO error).
|
||||
- **The action proceeds** — we do not fail script-initiated work because the audit write failed.
|
||||
- `SiteAuditWriteFailures` health metric increments; critical-severity Site Event Log entry.
|
||||
- A small in-memory ring (default 1024 rows) buffers events while the local writer is unhealthy; on ring overflow, oldest events are dropped with a Site Event Log warning per drop.
|
||||
|
||||
### 9.4 Telemetry actor wedged
|
||||
- Reconciliation pull (§6.3) is the fallback. If two consecutive reconciliation cycles report a non-draining backlog, the supervisor restarts the telemetry actor and a `SiteAuditTelemetryStalled` event fires.
|
||||
|
||||
### 9.5 Central direct-write failure
|
||||
- Inbound API: middleware audit failure is logged + metricked but never affects the HTTP response.
|
||||
- Notification Outbox dispatcher: audit failure logs critical and increments `CentralAuditWriteFailures`; the operational `Notifications` row update proceeds.
|
||||
|
||||
### 9.6 Dedup horizon — there isn't one
|
||||
`EventId` PK enforces uniqueness as long as a row exists in the table. Purge (§12) removes rows by `OccurredAtUtc`, not `EventId`; a stale telemetry retry arriving after the original was purged will insert a "new" row. Acceptable — a retry that arrives more than a year late is vanishingly rare and an extra row is harmless.
|
||||
|
||||
---
|
||||
|
||||
## 10. UI & query surface
|
||||
|
||||
### 10.1 Audit Log page (new, top-level)
|
||||
Lives under a new **Audit** nav group in Central UI (sibling to **Notifications**). Standard Blazor Server + Bootstrap, custom components per the project UI rules.
|
||||
|
||||
**Filter bar (top of page, collapses to one row when not focused):**
|
||||
- Time range (relative: 15m / 1h / 24h / 7d / custom).
|
||||
- Channel (multi-select: `ApiOutbound`, `DbOutbound`, `Notification`, `ApiInbound`).
|
||||
- Kind (filtered by selected channels).
|
||||
- Status (multi-select).
|
||||
- Site (multi-select, scoped to user's authorized sites).
|
||||
- Instance / Script (text search with autocomplete).
|
||||
- Target (text search — system+method, DB connection, list name).
|
||||
- Actor (text search — inbound API key name).
|
||||
- CorrelationId (paste a `TrackedOperationId` / `NotificationId` / request-id to see its full event sequence).
|
||||
- "Errors only" toggle (`Status IN ('Failed', 'Parked', 'Discarded')`).
|
||||
|
||||
**Results grid:**
|
||||
- Columns (resizable, reorderable, persisted per user): `OccurredAtUtc`, `Site`, `Channel`, `Kind`, `Status`, `Target`, `Actor`, `DurationMs`, `HttpStatus`, `ErrorMessage`.
|
||||
- Keyset pagination on `(OccurredAtUtc desc, EventId desc)`. Default page 100.
|
||||
- Click row → drilldown drawer.
|
||||
|
||||
**Drilldown drawer:**
|
||||
- Pretty-prints `RequestSummary` / `ResponseSummary` (JSON auto-detected; SQL syntax-highlighted).
|
||||
- Redaction indicators where headers/fields were stripped.
|
||||
- "Copy as cURL" for `ApiOutbound` / `ApiInbound` rows.
|
||||
- "Show all events for this operation" link → filters by `CorrelationId`.
|
||||
|
||||
### 10.2 Drill-in links from existing pages
|
||||
- **Notifications** row → "View audit history" → Audit Log filtered to `CorrelationId = NotificationId`.
|
||||
- **Site Calls** row → "View audit history" → Audit Log filtered to `CorrelationId = TrackedOperationId`.
|
||||
- **External Systems** detail → "Recent activity" → Audit Log filtered to `Target starts-with <system>`.
|
||||
- **Inbound API keys** detail → "Recent calls" → Audit Log filtered to `Actor = <key name>` AND `Channel = ApiInbound`.
|
||||
- **Sites** detail → new "Audit feed" tab.
|
||||
- **Instances** detail → new "Audit feed" tab.
|
||||
|
||||
### 10.3 Health dashboard tiles
|
||||
Three new tiles in an "Audit" KPI group:
|
||||
- **Audit volume** — events/min global + per-site sparkline.
|
||||
- **Audit error rate** — % rows where `Status IN ('Failed', 'Parked', 'Discarded')`, rolling 5 min.
|
||||
- **Audit backlog** — sum of `Pending` site rows; click → per-site breakdown.
|
||||
|
||||
### 10.4 Export
|
||||
Audit Log page **Export** button streams CSV (current filter) server-side. Default cap 100k rows; larger exports use the CLI (§15).
|
||||
|
||||
---
|
||||
|
||||
## 11. Security & tamper-evidence
|
||||
|
||||
### 11.1 Append-only enforcement
|
||||
- Application accesses `AuditLog` via a dedicated DB role `scadalink_audit_writer` granted `INSERT` + `SELECT` only — no `UPDATE`, no `DELETE`.
|
||||
- Purge runs under a separate role `scadalink_audit_purger` whose permissions are limited to the partition-switch operation (§12.2). Row-level `DELETE` is not granted even to purge.
|
||||
- A CI guard greps the data layer for any `UPDATE … AuditLog` or `DELETE … AuditLog` text and fails the build.
|
||||
|
||||
### 11.2 Authorization
|
||||
- Reading the Audit Log requires the existing **Audit** role (today used for the `IAuditService` config-change log) extended with a new **OperationalAudit** permission.
|
||||
- Per-site row scoping reuses the existing site-permission model from Security & Auth — a user sees only rows for sites they are authorized to operate.
|
||||
- Bulk export (UI button + CLI) requires an additional **AuditExport** permission.
|
||||
|
||||
### 11.3 Payload redaction at write
|
||||
See §8.2. Contract: unredacted secrets never persist. Safety net over-redacts on misconfiguration.
|
||||
|
||||
### 11.4 Tamper-evidence hash chain (deferred, v1.x)
|
||||
- Each row gains a `RowHash` column.
|
||||
- `RowHash = SHA-256(prev.RowHash || canonical(row))` per partition.
|
||||
- Computed by a chaining job that runs after each monthly partition closes.
|
||||
- Verifiable offline via `scadalink audit verify-chain --month YYYY-MM`.
|
||||
- Default **off** in v1 to avoid operational burden. Flag for v1.x.
|
||||
|
||||
### 11.5 Site SQLite security
|
||||
- File permissions: read/write by the ScadaLink service account only.
|
||||
- Not backed up off-machine — site SQLite is a buffer with a forwarding guarantee, not a record. Central is the durable home.
|
||||
|
||||
---
|
||||
|
||||
## 12. Retention & purge mechanics
|
||||
|
||||
### 12.1 Central retention defaults
|
||||
- **365 days** based on `OccurredAtUtc`. Configurable via `AuditLog:RetentionDays` (min 7, max 3650, validated at startup).
|
||||
- **Single global retention in v1** — no per-channel/Kind overrides. Deferred to v1.x once production cost data shows whether overrides are needed.
|
||||
|
||||
### 12.2 Partition strategy
|
||||
- Monthly partitions on `OccurredAtUtc`. Partition function `pf_AuditLog_Month`, scheme `ps_AuditLog_Month`, created in the EF Core migration.
|
||||
- Purge by partition switch: move the eligible partition to a staging table, then drop. No row-by-row delete; no log bloat.
|
||||
- Partition-maintenance job rolls forward each month (creates the next month's partition ahead of time).
|
||||
|
||||
### 12.3 Purge job
|
||||
- Singleton actor `AuditLogPurgeActor` on the active central node, runs daily.
|
||||
- Switches out any partition whose latest `OccurredAtUtc` is older than the global retention window. Pure partition-switch; no row-level deletes.
|
||||
- Emits an `AuditLog:Purged` event (partition range, row count, duration).
|
||||
|
||||
### 12.4 Site SQLite purge
|
||||
- Daily site job, hard invariant per §4: purge only `OccurredAtUtc < threshold AND ForwardState IN ('Forwarded','Reconciled')`.
|
||||
- Default site retention **7 days** (configurable, min 1, max 90).
|
||||
- Backlog metric (§9.2) provides visibility into "central outage → site bloat" before disk-full.
|
||||
|
||||
---
|
||||
|
||||
## 13. Performance & sizing
|
||||
|
||||
Rough back-of-envelope; load testing will confirm.
|
||||
|
||||
### 13.1 Per-site event rate (assumed nominal site)
|
||||
| Channel/Kind | Typ events/min | Peak events/min |
|
||||
|---|---:|---:|
|
||||
| `ApiOutbound.ApiCall` | 10 | 100 |
|
||||
| `ApiOutbound.ApiCallCached` (~4 rows/op incl. `CachedSubmit`/`CachedResolve`) | 4 | 20 |
|
||||
| `DbOutbound.DbWrite` (writes) | 30 | 300 |
|
||||
| `DbOutbound.DbWrite` (reads) | 60 | 600 |
|
||||
| `DbOutbound.DbWriteCached` (~4 rows/op incl. `CachedSubmit`/`CachedResolve`) | 4 | 20 |
|
||||
| `Notification.NotifySend` (site-emit) | 1 | 10 |
|
||||
| **Per-site total** | **~110** | **~1,050** |
|
||||
|
||||
### 13.2 Central total (50-site deployment)
|
||||
- Typical: ~5,500 events/min = ~**92 events/sec**.
|
||||
- Peak: ~52,500 events/min = ~**875 events/sec**.
|
||||
- Plus central-originated (Notification dispatch + Inbound API): assume ~30 events/sec typical.
|
||||
|
||||
MS SQL handles this with batched ingest and the time-aligned indexes.
|
||||
|
||||
### 13.3 Row size
|
||||
- Fixed columns: ~250 bytes.
|
||||
- Average captured payload: ~1 KB.
|
||||
- Per row: **~1.3 KB**.
|
||||
|
||||
### 13.4 Yearly central footprint
|
||||
- Typical: 100 events/sec × 86,400 × 365 × 1.3 KB ≈ **~4 TB** at default cap.
|
||||
- Cap reduction (8 KB → 2 KB) or per-channel retention shaves this multi-fold.
|
||||
|
||||
### 13.5 Site SQLite footprint
|
||||
- 110/min × 60 × 24 × 7 ≈ 1.1M rows × 1.3 KB ≈ **~1.4 GB / site** at the 7-day window. Still modest.
|
||||
|
||||
### 13.6 Levers
|
||||
- Reduce `DefaultCapBytes` per §8.1.
|
||||
- Tighten per-channel retention once overrides land in v1.x (§12.1; v1 has a single global `RetentionDays`) — especially for `DbOutbound.DbWrite` read traffic.
|
||||
- Defer to v1.x: Parquet archival to object storage before purge (§15.2).
|
||||
|
||||
---
|
||||
|
||||
## 14. KPI surface & relationship to existing KPIs
|
||||
|
||||
### 14.1 New Audit Log KPIs
|
||||
- **Volume** — events/min, global + per-site.
|
||||
- **Error rate** — % rows where `Status IN ('Failed', 'Parked', 'Discarded')`, rolling 5 min.
|
||||
- **Backlog** — sum of `Pending` site rows.
|
||||
- **Top inbound callers** — top-10 `Actor` by request count, last 1h.
|
||||
- **Top outbound 5xx** — top-10 `Target` by 5xx-status count, last 1h.
|
||||
|
||||
### 14.2 Relationship to existing KPIs
|
||||
- **Notification Outbox KPIs** (queue depth, parked, delivered-last-interval, etc.) — unchanged, sourced from `Notifications`. Audit Log KPIs describe the audit table itself, not the notification subsystem.
|
||||
- **Site Call Audit KPIs** — unchanged, sourced from `SiteCalls`.
|
||||
- Audit Log KPIs occupy their own group on the Health dashboard. Nothing is collapsed or superseded.
|
||||
|
||||
---
|
||||
|
||||
## 15. CLI & external access
|
||||
|
||||
### 15.1 CLI commands
|
||||
New `scadalink audit` command group:
|
||||
- `scadalink audit query --site <s> --since <t> --kind <k> [...]` — same filter set as the UI.
|
||||
- `scadalink audit export --since <t> --until <t> --format csv|jsonl|parquet --output <path>` — bulk export, server-side streaming.
|
||||
- `scadalink audit verify-chain --month <YYYY-MM>` — hash-chain verification (when §11.4 is enabled).
|
||||
|
||||
Requires the same **OperationalAudit** / **AuditExport** permissions as the UI.
|
||||
|
||||
### 15.2 Object-storage archival (deferred, v1.x)
|
||||
A monthly job dumps the closing partition to Parquet on operator-configured object storage before central purge — enabling indefinite cold retention without bloating MS SQL. Flag for v1.x; not in initial scope.
|
||||
|
||||
---
|
||||
|
||||
## 16. Locked decisions
|
||||
|
||||
| # | Question | Decision |
|
||||
|---|---|---|
|
||||
| 1 | Component number | **#23 Audit Log** (README matrix + HighLevelReqs). |
|
||||
| 2 | Nav placement | New top-level **Audit** nav group in Central UI. |
|
||||
| 3 | Hash-chain tamper evidence (§11.4) | Deferred to v1.x. v1 enforces append-only via DB grants only. |
|
||||
| 4 | Parquet archival to object storage (§15.2) | Deferred to v1.x. |
|
||||
| 5 | Per-channel retention overrides (§12.1) | Deferred to v1.x. v1 uses a single global `RetentionDays`. |
|
||||
| 6 | Default payload cap | **8 KB** for `RequestSummary` / `ResponseSummary`; **64 KB** on error rows (`Status IN ('Failed', 'Parked', 'Discarded')`). |
|
||||
|
||||
All earlier design decisions (purpose, topology, scope, payload depth, lifecycle granularity, retention default, site→central path, UI shape, cached-call audit emission, SQL parameter capture, never-fail-on-audit-failure) are also locked. See §1–§15.
|
||||
@@ -53,6 +53,10 @@
|
||||
"AuthMode": "None",
|
||||
"FromAddress": "scada-notifications@company.com"
|
||||
},
|
||||
"NotificationOutbox": {
|
||||
"DispatchInterval": "00:00:05",
|
||||
"DispatchBatchSize": 1000
|
||||
},
|
||||
"Logging": {
|
||||
"MinimumLevel": "Information"
|
||||
}
|
||||
|
||||
@@ -53,6 +53,10 @@
|
||||
"AuthMode": "None",
|
||||
"FromAddress": "scada-notifications@company.com"
|
||||
},
|
||||
"NotificationOutbox": {
|
||||
"DispatchInterval": "00:00:05",
|
||||
"DispatchBatchSize": 1000
|
||||
},
|
||||
"Logging": {
|
||||
"MinimumLevel": "Information"
|
||||
}
|
||||
|
||||
@@ -0,0 +1,380 @@
|
||||
# Notification Outbox Implementation Plan
|
||||
|
||||
> **For Claude:** REQUIRED SUB-SKILL: Use superpowers-extended-cc:executing-plans to implement this plan task-by-task.
|
||||
|
||||
**Goal:** Update the ScadaLink design documentation to introduce a central Notification Outbox — a new component #21 that receives store-and-forwarded notifications from sites, logs them to a type-agnostic `Notifications` table, and delivers them with retry, parking, status handles, and KPIs.
|
||||
|
||||
**Architecture:** This is a documentation-only change. The source of truth for the design is `docs/plans/notif.md` (the approved, refined design). Each task updates one or more existing component documents (or creates the new one) so the documentation set is internally consistent. There is no code and no test suite — verification for each task is a cross-reference consistency check (grep for stale references) plus a read-through against `notif.md`.
|
||||
|
||||
**Tech Stack:** Markdown design documents under `docs/requirements/`, the master `README.md` index, and `CLAUDE.md`. Conventions are defined in `CLAUDE.md` (Document Conventions, Editing Rules).
|
||||
|
||||
**Source of truth:** `docs/plans/notif.md` — every task below implements a slice of it. Read it first.
|
||||
|
||||
---
|
||||
|
||||
## Task 1: Create Component-NotificationOutbox.md
|
||||
|
||||
**Files:**
|
||||
- Create: `docs/requirements/Component-NotificationOutbox.md`
|
||||
- Reference: `docs/plans/notif.md` (full design)
|
||||
- Reference: `docs/requirements/Component-StoreAndForward.md` (structure template — closest analogue)
|
||||
|
||||
**Step 1: Read the source and a template**
|
||||
|
||||
Read `docs/plans/notif.md` in full, and `docs/requirements/Component-StoreAndForward.md` for the standard component-document structure (Purpose, Location, Responsibilities, design sections, Dependencies, Interactions).
|
||||
|
||||
**Step 2: Write the component document**
|
||||
|
||||
Create `Component-NotificationOutbox.md` following the standard structure. Required sections:
|
||||
|
||||
- **Purpose** — central component that receives store-and-forwarded notifications from sites, logs every one to the `Notifications` table (single audit source), and delivers them via per-type adapters with retry, parking, status tracking, and KPIs.
|
||||
- **Location** — Central cluster. `NotificationOutboxActor` is a singleton on the active central node.
|
||||
- **Responsibilities** — owns the durable central queue (`Notifications` table), the dispatcher loop, retry scheduling, parking, per-notification status tracking, KPI computation. Delivery work runs on a dedicated blocking-I/O dispatcher.
|
||||
- **End-to-end flow** — reproduce the flow diagram from `notif.md` (script → site S&F → central ingest → outbox dispatch → adapter).
|
||||
- **The `Notifications` table** — the full field table from `notif.md`, the status lifecycle (`Forwarding` → `Pending` → `Retrying` → `Delivered` / `Parked` / `Discarded`, where `Forwarding` is site-local and never stored centrally), retry policy (reuses SMTP config max-retry + fixed interval), and retention (daily purge of terminal rows after ~1-year configurable window).
|
||||
- **Ingest & idempotency** — insert-if-not-exists on `NotificationId`; ack-after-persist; at-least-once site→central delivery.
|
||||
- **Dispatcher** — polls due rows, resolves the list, delivers via the matching adapter.
|
||||
- **Delivery adapters** — `INotificationDeliveryAdapter` per `Type` returning `success | transient | permanent`; Email adapter implemented now, Teams/others future.
|
||||
- **Active/standby** — singleton on active central node; state in MS SQL; no Akka replication; resumes from the table on failover.
|
||||
- **Monitoring** — KPIs (queue depth, stuck count, parked count, delivered-last-interval, oldest-pending age); stuck = `Pending`/`Retrying` older than 10 min (configurable), display-only.
|
||||
- **Configuration** — `NotificationOutboxOptions`: dispatch interval, stuck-age threshold, terminal-row retention window.
|
||||
- **Dependencies** — Notification Service (definitions + delivery adapters), Configuration Database (`Notifications` table), Central–Site Communication (notification submission), Health Monitoring (KPIs), Central UI (Outbox page).
|
||||
- **Interactions** — Site S&F Engine (inbound submissions), Notification Service, Central UI, Health Monitoring.
|
||||
|
||||
Match the prose density and heading style of the neighbouring component docs. Edit in place; no backup files (per `CLAUDE.md` Editing Rules).
|
||||
|
||||
**Step 3: Verify consistency**
|
||||
|
||||
Run: `grep -n "Notification Outbox\|NotificationOutboxActor\|Notifications table" docs/requirements/Component-NotificationOutbox.md`
|
||||
Expected: the new terms appear; the document reads as a peer of the other component docs.
|
||||
|
||||
Run: `grep -rn "Forwarding" docs/requirements/Component-NotificationOutbox.md`
|
||||
Expected: the `Forwarding` status is documented as site-local, never stored centrally.
|
||||
|
||||
**Step 4: Commit**
|
||||
|
||||
```bash
|
||||
git add docs/requirements/Component-NotificationOutbox.md
|
||||
git commit -m "docs(notification-outbox): add Component-NotificationOutbox design doc"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Task 2: Revise Component-NotificationService.md
|
||||
|
||||
**Files:**
|
||||
- Modify: `docs/requirements/Component-NotificationService.md`
|
||||
- Reference: `docs/plans/notif.md`
|
||||
|
||||
**Step 1: Read the current document**
|
||||
|
||||
Read `docs/requirements/Component-NotificationService.md` in full. It currently describes site-side SMTP delivery.
|
||||
|
||||
**Step 2: Apply the revisions**
|
||||
|
||||
- **Purpose / Location** — the service is now central-only: manages notification-list and SMTP definitions, and provides the delivery adapters. It no longer delivers from sites.
|
||||
- **Definitions** — notification lists gain a **`Type`** field (`Email` now; `Teams`/others later) plus type-specific targets. Definitions and SMTP config are **no longer deployed to sites** (remove the deploy-to-sites / local-SQLite responsibilities).
|
||||
- **Delivery** — replace the site-side delivery section. Delivery is performed centrally by the Notification Outbox calling an `INotificationDeliveryAdapter` per type. The Email adapter is the relocated SMTP composition/send logic. Recipient resolution happens at central at delivery time.
|
||||
- **Script API** — `Notify.To("list").Send(subject, body)` is now **async with a status handle**: it returns a `NotificationId` immediately. Add `Notify.Status(notificationId)` which returns a status record (status, retry count, last error, key timestamps). Note the site-local `Forwarding` status.
|
||||
- **Error classification** — there is no synchronous permanent-failure return to the script anymore; permanent failures result in a `Parked` row. Reframe accordingly.
|
||||
- **Dependencies / Interactions** — depends on Notification Outbox (not the site S&F Engine directly); remove the local-SQLite dependency.
|
||||
|
||||
**Step 3: Verify consistency**
|
||||
|
||||
Run: `grep -n "site\|SQLite\|deploy" docs/requirements/Component-NotificationService.md`
|
||||
Expected: no remaining claims that sites send email or that lists/SMTP config deploy to site SQLite.
|
||||
|
||||
**Step 4: Commit**
|
||||
|
||||
```bash
|
||||
git add docs/requirements/Component-NotificationService.md
|
||||
git commit -m "docs(notification-outbox): central-only Notification Service, typed lists, async API"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Task 3: Revise Component-StoreAndForward.md
|
||||
|
||||
**Files:**
|
||||
- Modify: `docs/requirements/Component-StoreAndForward.md`
|
||||
- Reference: `docs/plans/notif.md`
|
||||
|
||||
**Step 1: Read the current document**
|
||||
|
||||
Read `docs/requirements/Component-StoreAndForward.md`. It lists three categories including notifications, with SMTP as the notification delivery target.
|
||||
|
||||
**Step 2: Apply the revisions**
|
||||
|
||||
- The notification category is **retained**, but its delivery *target* changes from SMTP to **the central cluster**. "Delivering" a buffered notification now means handing it to the Communication Layer for central and clearing it on central's ack.
|
||||
- The site→central forward uses a fixed retry interval configured in the host `appsettings.json` (it concerns reaching central, not any notification list — distinct from the per-entity retry settings used by the other two categories).
|
||||
- Update the Message Lifecycle, Retry Policy, Message Format, Dependencies, and Interactions sections so the notification category is described in terms of central delivery, not SMTP.
|
||||
- External-system-call and cached-database-write categories are unchanged.
|
||||
|
||||
**Step 3: Verify consistency**
|
||||
|
||||
Run: `grep -n -i "notif" docs/requirements/Component-StoreAndForward.md`
|
||||
Expected: notification references describe central as the target; no SMTP claims remain.
|
||||
|
||||
**Step 4: Commit**
|
||||
|
||||
```bash
|
||||
git add docs/requirements/Component-StoreAndForward.md
|
||||
git commit -m "docs(notification-outbox): retarget S&F notification category to central"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Task 4: Revise Component-HealthMonitoring.md
|
||||
|
||||
**Files:**
|
||||
- Modify: `docs/requirements/Component-HealthMonitoring.md`
|
||||
- Reference: `docs/plans/notif.md` (Monitoring section)
|
||||
|
||||
**Step 1: Read the current document**
|
||||
|
||||
Read `docs/requirements/Component-HealthMonitoring.md`. The Monitored Metrics table includes "Store-and-forward buffer depth" with a notification sub-category.
|
||||
|
||||
**Step 2: Apply the revisions**
|
||||
|
||||
- Add **Notification Outbox KPIs** as **central-computed headline metrics** (queue depth, stuck count, parked count) — distinct from the site-reported metrics, since the outbox is central-side and not part of the site health report.
|
||||
- Clarify that the S&F buffer-depth notification metric now means "notifications awaiting forward to central" (the site→central leg), still site-reported.
|
||||
- Note that outbox KPIs are point-in-time, computed on demand from the `Notifications` table (no time-series store), consistent with the existing "current status only" philosophy.
|
||||
|
||||
**Step 3: Verify consistency**
|
||||
|
||||
Run: `grep -n -i "notif\|outbox" docs/requirements/Component-HealthMonitoring.md`
|
||||
Expected: outbox KPIs present as central-computed; S&F notification metric reworded to the site→central leg.
|
||||
|
||||
**Step 4: Commit**
|
||||
|
||||
```bash
|
||||
git add docs/requirements/Component-HealthMonitoring.md
|
||||
git commit -m "docs(notification-outbox): add central-computed outbox KPIs to Health Monitoring"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Task 5: Revise Component-SiteEventLogging.md
|
||||
|
||||
**Files:**
|
||||
- Modify: `docs/requirements/Component-SiteEventLogging.md`
|
||||
- Reference: `docs/plans/notif.md` (Refinement decisions — site-side diagnostics)
|
||||
|
||||
**Step 1: Read the current document**
|
||||
|
||||
Read `docs/requirements/Component-SiteEventLogging.md`. The "Events Logged" table has categories for Script, Alarm, Deployment, Connection, S&F, Instance Lifecycle.
|
||||
|
||||
**Step 2: Apply the revisions**
|
||||
|
||||
- Add a **Notification** category to the Events Logged table. It records **forward failures and long-buffered notifications only** — not routine enqueue/forward success events (central holds the authoritative record; site logging covers the in-transit blind spot).
|
||||
- Add `notification` to the Event Type enumeration in the Event Entry Schema.
|
||||
- Update Dependencies/Interactions to mention the site S&F notification path as a source.
|
||||
|
||||
**Step 3: Verify consistency**
|
||||
|
||||
Run: `grep -n -i "notif" docs/requirements/Component-SiteEventLogging.md`
|
||||
Expected: the Notification category appears and is scoped to forward failures / long-buffered only.
|
||||
|
||||
**Step 4: Commit**
|
||||
|
||||
```bash
|
||||
git add docs/requirements/Component-SiteEventLogging.md
|
||||
git commit -m "docs(notification-outbox): add Notification category to Site Event Logging"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Task 6: Revise Component-Communication.md
|
||||
|
||||
**Files:**
|
||||
- Modify: `docs/requirements/Component-Communication.md`
|
||||
- Reference: `docs/plans/notif.md` (end-to-end flow, ingest & idempotency)
|
||||
|
||||
**Step 1: Read the current document**
|
||||
|
||||
Read `docs/requirements/Component-Communication.md`. It has a numbered "Communication Patterns" section (1–8, with 6a) and a "Message Timeouts" table.
|
||||
|
||||
**Step 2: Apply the revisions**
|
||||
|
||||
- Add a new communication pattern — **Notification Submission (Site → Central)** — pattern 9. Pattern: fire-and-forget with acknowledgment (like pattern 5). The site S&F engine sends a `NotificationSubmit` message to central; central acks after persisting the row to the `Notifications` table. The `NotificationId` GUID is the idempotency key.
|
||||
- Update the Purpose sentence's list of what the transport carries to include notification submission.
|
||||
- Add a row to the Message Timeouts table for the new pattern (if it uses ask-with-ack).
|
||||
|
||||
**Step 3: Verify consistency**
|
||||
|
||||
Run: `grep -n -i "notification" docs/requirements/Component-Communication.md`
|
||||
Expected: the new pattern and the `NotificationSubmit` message appear.
|
||||
|
||||
**Step 4: Commit**
|
||||
|
||||
```bash
|
||||
git add docs/requirements/Component-Communication.md
|
||||
git commit -m "docs(notification-outbox): add Notification Submission communication pattern"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Task 7: Revise Component-CentralUI.md
|
||||
|
||||
**Files:**
|
||||
- Modify: `docs/requirements/Component-CentralUI.md`
|
||||
- Reference: `docs/plans/notif.md` (Monitoring — Surfacing)
|
||||
|
||||
**Step 1: Read the current document**
|
||||
|
||||
Read `docs/requirements/Component-CentralUI.md`. The "Workflows / Pages" section already has a "Notification List Management" page and a "Parked Message Management" page; the Health Monitoring Dashboard is also listed.
|
||||
|
||||
**Step 2: Apply the revisions**
|
||||
|
||||
- Add a new **Notification Outbox** page section (Deployment Role) under Workflows / Pages, near Health Monitoring Dashboard / Parked Message Management. Contents: KPI tiles (queue depth, stuck, parked, delivered-last-interval, oldest-pending age); a queryable notification list filterable by status, type, source site, list, and time range, with a stuck-only toggle and keyword search on subject; Retry and Discard actions on parked notifications; badged stuck rows.
|
||||
- Note the typed-list change in the Notification List Management page (lists now have a `Type`).
|
||||
- Note the Health Monitoring Dashboard now shows headline outbox KPI tiles.
|
||||
- Follow the established UI conventions (Blazor Server + Bootstrap, custom components, clean corporate design — per `CLAUDE.md`).
|
||||
|
||||
**Step 3: Verify consistency**
|
||||
|
||||
Run: `grep -n -i "outbox\|notification" docs/requirements/Component-CentralUI.md`
|
||||
Expected: the Notification Outbox page is present and consistent with the design.
|
||||
|
||||
**Step 4: Commit**
|
||||
|
||||
```bash
|
||||
git add docs/requirements/Component-CentralUI.md
|
||||
git commit -m "docs(notification-outbox): add Notification Outbox page to Central UI"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Task 8: Revise Component-ConfigurationDatabase.md and Component-Commons.md
|
||||
|
||||
**Files:**
|
||||
- Modify: `docs/requirements/Component-ConfigurationDatabase.md`
|
||||
- Modify: `docs/requirements/Component-Commons.md`
|
||||
- Reference: `docs/plans/notif.md` (Notifications table)
|
||||
|
||||
**Step 1: Read the current documents**
|
||||
|
||||
Read both documents. Configuration Database describes EF Core repositories, migrations, audit logging. Commons describes POCO entities, repository interfaces, and message contracts.
|
||||
|
||||
**Step 2: Apply the revisions**
|
||||
|
||||
In `Component-ConfigurationDatabase.md`:
|
||||
- Add the **`Notifications` table** to the data access layer — a new EF Core entity, repository, and migration. Note the type-agnostic schema (`Type` discriminator + `TypeData` JSON), the status enum, and the daily purge job for terminal rows after the configurable retention window.
|
||||
|
||||
In `Component-Commons.md`:
|
||||
- Add the **`Notification` entity POCO** (persistence-ignorant, in `Entities/`) and its **repository interface** (in `Interfaces/`).
|
||||
- Add the **`NotificationSubmit`** message contract (and its ack) under `Messages/`, following the additive-only versioning rule.
|
||||
- Add the typed notification-list fields if notification-list entities live in Commons.
|
||||
|
||||
**Step 3: Verify consistency**
|
||||
|
||||
Run: `grep -n -i "notification" docs/requirements/Component-ConfigurationDatabase.md docs/requirements/Component-Commons.md`
|
||||
Expected: the `Notifications` table, `Notification` entity, repository interface, and message contracts are documented in their respective layers.
|
||||
|
||||
**Step 4: Commit**
|
||||
|
||||
```bash
|
||||
git add docs/requirements/Component-ConfigurationDatabase.md docs/requirements/Component-Commons.md
|
||||
git commit -m "docs(notification-outbox): add Notifications table, entity, and message contracts"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Task 9: Update README.md
|
||||
|
||||
**Files:**
|
||||
- Modify: `README.md`
|
||||
|
||||
**Step 1: Read the current document**
|
||||
|
||||
Read `README.md`. It has a Technology Stack table, a Component Design Documents table (20 rows), and architecture diagrams.
|
||||
|
||||
**Step 2: Apply the revisions**
|
||||
|
||||
- Add **row 21 — Notification Outbox** to the Component Design Documents table, linking `docs/requirements/Component-NotificationOutbox.md`, with a one-line description.
|
||||
- Update the Notification Service row (#8) description if it no longer reflects central-only delivery / typed lists.
|
||||
- Update the Technology Stack "Notifications" row to reflect central delivery.
|
||||
- If any architecture diagram shows the notification path, update it; otherwise leave diagrams unchanged.
|
||||
|
||||
**Step 3: Verify consistency**
|
||||
|
||||
Run: `grep -c "Component-" README.md` and confirm the component table has 21 component rows.
|
||||
Run: `grep -n "Notification Outbox" README.md`
|
||||
Expected: the new component is row 21 and linked correctly.
|
||||
|
||||
**Step 4: Commit**
|
||||
|
||||
```bash
|
||||
git add README.md
|
||||
git commit -m "docs(notification-outbox): add Notification Outbox to README component index"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Task 10: Update CLAUDE.md
|
||||
|
||||
**Files:**
|
||||
- Modify: `CLAUDE.md`
|
||||
|
||||
**Step 1: Read the current document**
|
||||
|
||||
Read `CLAUDE.md`. It has "Current Component List (20 components)" and a "Key Design Decisions" section.
|
||||
|
||||
**Step 2: Apply the revisions**
|
||||
|
||||
- Change the heading to "Current Component List (21 components)" and add **21. Notification Outbox** with a one-line description.
|
||||
- Update the Notification Service entry (#8) to reflect central-only definitions + delivery adapters.
|
||||
- Under "Key Design Decisions → External Integrations" (or a suitable subsection), add entries capturing: notifications store-and-forwarded site→central; central `Notifications` table as single audit source; type-agnostic table (email now, Teams later); async `Notify.Send` returning a status handle; central Notification Outbox singleton; site-local `Forwarding` status.
|
||||
|
||||
**Step 3: Verify consistency**
|
||||
|
||||
Run: `grep -n "21\|Notification Outbox" CLAUDE.md`
|
||||
Expected: component count is 21 and the new component + design decisions are listed.
|
||||
|
||||
**Step 4: Commit**
|
||||
|
||||
```bash
|
||||
git add CLAUDE.md
|
||||
git commit -m "docs(notification-outbox): update component list and design decisions in CLAUDE.md"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Task 11: Cross-reference consistency sweep
|
||||
|
||||
**Files:**
|
||||
- Verify: all of `docs/requirements/`, `README.md`, `CLAUDE.md`
|
||||
|
||||
**Step 1: Check for stale notification-delivery claims**
|
||||
|
||||
Run: `grep -rn -i "site.*send.*email\|email.*via SMTP\|SMTP.*at sites\|notification list.*deploy" docs/ README.md CLAUDE.md`
|
||||
Expected: no results claiming sites send email directly or that notification lists deploy to sites.
|
||||
|
||||
**Step 2: Check cross-references resolve**
|
||||
|
||||
Run: `grep -rn "Notification Outbox\|NotificationOutbox" docs/requirements/ README.md CLAUDE.md`
|
||||
Expected: every reference points to a real component / document; no dangling references.
|
||||
|
||||
**Step 3: Check component count agreement**
|
||||
|
||||
Confirm `README.md` component table and `CLAUDE.md` component list both say 21 components and list Notification Outbox identically.
|
||||
|
||||
**Step 4: Review the full diff**
|
||||
|
||||
Run: `git diff main --stat` (or review the branch's commits)
|
||||
Read through the changes and confirm the documentation set is internally consistent with `docs/plans/notif.md`.
|
||||
|
||||
**Step 5: Commit any fixes**
|
||||
|
||||
```bash
|
||||
git add -A
|
||||
git commit -m "docs(notification-outbox): fix stale cross-references after outbox addition"
|
||||
```
|
||||
|
||||
(Skip the commit if the sweep found nothing to fix.)
|
||||
|
||||
---
|
||||
|
||||
## Done
|
||||
|
||||
All component documents, the README index, and CLAUDE.md reflect the central Notification Outbox design. The documentation set is internally consistent with `docs/plans/notif.md`.
|
||||
@@ -0,0 +1,17 @@
|
||||
{
|
||||
"planPath": "docs/plans/2026-05-18-notification-outbox.md",
|
||||
"tasks": [
|
||||
{"id": 7, "subject": "Task 1: Create Component-NotificationOutbox.md", "status": "pending"},
|
||||
{"id": 8, "subject": "Task 2: Revise Component-NotificationService.md", "status": "pending", "blockedBy": [7]},
|
||||
{"id": 9, "subject": "Task 3: Revise Component-StoreAndForward.md", "status": "pending", "blockedBy": [7]},
|
||||
{"id": 10, "subject": "Task 4: Revise Component-HealthMonitoring.md", "status": "pending", "blockedBy": [7]},
|
||||
{"id": 11, "subject": "Task 5: Revise Component-SiteEventLogging.md", "status": "pending", "blockedBy": [7]},
|
||||
{"id": 12, "subject": "Task 6: Revise Component-Communication.md", "status": "pending", "blockedBy": [7]},
|
||||
{"id": 13, "subject": "Task 7: Revise Component-CentralUI.md", "status": "pending", "blockedBy": [7]},
|
||||
{"id": 14, "subject": "Task 8: Revise Component-ConfigurationDatabase.md and Component-Commons.md", "status": "pending", "blockedBy": [7]},
|
||||
{"id": 15, "subject": "Task 9: Update README.md", "status": "pending", "blockedBy": [7, 8, 9, 10, 11, 12, 13, 14]},
|
||||
{"id": 16, "subject": "Task 10: Update CLAUDE.md", "status": "pending", "blockedBy": [7, 8, 9, 10, 11, 12, 13, 14]},
|
||||
{"id": 17, "subject": "Task 11: Cross-reference consistency sweep", "status": "pending", "blockedBy": [7, 8, 9, 10, 11, 12, 13, 14, 15, 16]}
|
||||
],
|
||||
"lastUpdated": "2026-05-18"
|
||||
}
|
||||
@@ -0,0 +1,217 @@
|
||||
# Cached Call Tracking — Design
|
||||
|
||||
**Date**: 2026-05-19
|
||||
**Status**: Approved
|
||||
**Topic**: Trackable IDs for cached external system calls and cached database writes
|
||||
|
||||
## Problem
|
||||
|
||||
`ExternalSystem.CachedCall()` and `Database.CachedWrite()` are fire-and-forget: a
|
||||
script gets no handle back, cannot confirm delivery, and an operator cannot tie a
|
||||
parked S&F message to a known business operation. `Notify.Send()` already returns a
|
||||
trackable `NotificationId`. The goal is to give cached external/database calls the
|
||||
same first-class traceability, under a tracking model unified across all three
|
||||
store-and-forward producers.
|
||||
|
||||
## Decision
|
||||
|
||||
Add a trackable ID to cached calls via **Approach B — a sibling central component
|
||||
(`Site Call Audit`) plus shared tracking contracts in Commons**. The Notification
|
||||
Outbox is left unchanged; unification lives in shared types and a consistent script
|
||||
API, not in a merged table or component.
|
||||
|
||||
### Why a sibling, not a merged component
|
||||
|
||||
Delivery locality is the decisive constraint:
|
||||
|
||||
- **Notifications** are *central-delivered*: sites store-and-forward them to the
|
||||
central cluster, which delivers via SMTP. The `NotificationOutboxActor` runs a
|
||||
dispatcher loop. Central becomes the source of truth after handoff.
|
||||
- **Cached calls / DB writes** are *site-delivered*: the external system or database
|
||||
often sits on the site's own network and is unreachable from central. The site's
|
||||
S&F Engine must always own delivery, and the **site remains the source of truth**
|
||||
for status. Central audit is an eventually-consistent mirror.
|
||||
|
||||
Merging both into one component (Approach A) would put a dispatcher loop that is live
|
||||
for some rows and dormant for others into a single component, hiding a real
|
||||
architectural difference. Approach B expresses the difference honestly while still
|
||||
giving scripts a unified ID model and `Status()` API.
|
||||
|
||||
## Unified tracking model
|
||||
|
||||
### `TrackedOperationId`
|
||||
|
||||
A GUID, defined in Commons, generated **caller-side at the site at call time**. It is
|
||||
both the tracking handle returned to the script and the idempotency key for telemetry
|
||||
sent to central. `Notify.Send()`'s existing `NotificationId` is the notification-domain
|
||||
name for this same type — no behavior change for notifications.
|
||||
|
||||
### Script API
|
||||
|
||||
| Call | Returns |
|
||||
|---|---|
|
||||
| `ExternalSystem.CachedCall(system, method, params)` | `TrackedOperationId` |
|
||||
| `Database.CachedWrite(name, sql, params)` | `TrackedOperationId` |
|
||||
| `Notify.Send(...)` | `TrackedOperationId` (unchanged) |
|
||||
| `Tracking.Status(id)` | unified status record (status, retry count, last error, key timestamps) |
|
||||
|
||||
`Tracking.Status(id)` is the unified accessor. `Notify.Status(id)` is retained as a
|
||||
thin alias for backward compatibility.
|
||||
|
||||
### Status lifecycle
|
||||
|
||||
`Pending → Retrying → Delivered / Parked / Failed / Discarded`
|
||||
|
||||
- **Delivered** — succeeded. A cached call that succeeds on its first immediate
|
||||
attempt goes straight here and never enters the S&F buffer.
|
||||
- **Parked** — transient retries exhausted; awaiting manual action.
|
||||
- **Failed** — permanent failure (e.g. HTTP 4xx). The error is *also* returned
|
||||
synchronously to the calling script, exactly as today; the record captures it.
|
||||
This is the one state beyond the notification lifecycle.
|
||||
- **Discarded** — operator discarded a parked operation.
|
||||
|
||||
There is no `Forwarding` state for cached calls — that exists only because
|
||||
notifications hand off to central. For cached calls, `Tracking.Status(id)` is always
|
||||
answered site-locally and authoritatively.
|
||||
|
||||
## Site-side architecture
|
||||
|
||||
### Site-local operation tracking table
|
||||
|
||||
A new SQLite table alongside the existing S&F buffer DB. One row per
|
||||
`TrackedOperationId`, created the moment the script issues the cached call,
|
||||
regardless of outcome:
|
||||
|
||||
- Fields: kind, target summary (system+method, or DB name), status, retry count,
|
||||
last error, created/updated/terminal timestamps, source provenance
|
||||
(instance/script).
|
||||
- This table is the **status record**. The S&F buffer remains purely the **retry
|
||||
mechanism**; a buffered message references its `TrackedOperationId`.
|
||||
- Immediate success writes a terminal `Delivered` row directly here, with nothing
|
||||
placed in the S&F buffer.
|
||||
- `Tracking.Status(id)` reads this table — local, authoritative, available even when
|
||||
central is unreachable.
|
||||
- Retention: terminal rows purged after a configurable window (default 7 days; the
|
||||
site holds live operational state, central holds long-term audit).
|
||||
|
||||
### Telemetry to central
|
||||
|
||||
On every lifecycle transition (`Pending → Retrying → Delivered/Parked/Failed/
|
||||
Discarded`) the site emits a telemetry event over the existing site→central channel:
|
||||
`TrackedOperationId`, kind, summary, status, retry count, last error, timestamps,
|
||||
source site. Best-effort, at-least-once, idempotent on the ID.
|
||||
|
||||
### Reconciliation
|
||||
|
||||
Because telemetry is best-effort, the central side periodically (and on reconnect)
|
||||
pulls "all tracking rows changed since cursor X" per site. Missed telemetry
|
||||
self-heals. The site never depends on central; central converges to the site.
|
||||
|
||||
### Carried-over rules (unchanged)
|
||||
|
||||
- Tracking rows, like buffered messages, are not cleared on instance deletion.
|
||||
- Cached-call idempotency remains the caller's responsibility — a retry can still
|
||||
double-deliver.
|
||||
|
||||
## Central — Site Call Audit component (new component #22)
|
||||
|
||||
### `SiteCalls` table (central MS SQL)
|
||||
|
||||
Sibling of the `Notifications` table. One row per `TrackedOperationId`: source site,
|
||||
kind, target summary, status, retry count, last error, created/updated/terminal
|
||||
timestamps. Fed only by site telemetry and reconciliation pulls.
|
||||
|
||||
Ingestion is **insert-if-not-exists**, then **upsert-on-newer-status**. The lifecycle
|
||||
is monotonic, so status only advances, never regresses — making at-least-once and
|
||||
out-of-order telemetry harmless. Daily purge of terminal rows after a configurable
|
||||
window (default 365 days, mirroring `Notifications`).
|
||||
|
||||
### `SiteCallAuditActor`
|
||||
|
||||
Singleton on the active central node. Ingests telemetry, runs the periodic
|
||||
reconciliation pulls, computes KPIs, and relays Retry/Discard commands to sites.
|
||||
|
||||
It is **not a dispatcher** — the crucial difference from `NotificationOutboxActor`.
|
||||
Central has no path to a site's external systems or databases; this component is an
|
||||
audit sink, a query surface, and a command relay only.
|
||||
|
||||
### KPIs
|
||||
|
||||
Point-in-time from the `SiteCalls` table, global and per-site, mirroring the
|
||||
Notification Outbox KPI shape: buffered count (`Pending`+`Retrying`), parked count,
|
||||
failed-last-interval, delivered-last-interval, oldest-pending age, and stuck count
|
||||
(`Pending`/`Retrying` older than a configurable threshold, default 10 minutes —
|
||||
display-only, no alerting).
|
||||
|
||||
## Central→site command path (Retry / Discard)
|
||||
|
||||
Parked operations live in the site's S&F buffer, so Retry/Discard from the Central UI
|
||||
must travel down to the owning site:
|
||||
|
||||
- New ClusterClient command/control messages, central→site:
|
||||
`RetryParkedOperation(TrackedOperationId)` and
|
||||
`DiscardParkedOperation(TrackedOperationId)`, riding the existing per-site
|
||||
ClusterClient.
|
||||
- The site applies the command to its S&F buffer / tracking table, then emits normal
|
||||
telemetry reflecting the new state (`Retrying`, or `Discarded`).
|
||||
- Central never directly mutates the `SiteCalls` row. It sends the command and lets
|
||||
the resulting telemetry update the audit row — the site stays the single source of
|
||||
truth.
|
||||
- If the site is offline, the command fails fast and the UI surfaces a
|
||||
"site unreachable" message.
|
||||
|
||||
## Central UI
|
||||
|
||||
New page — **Site Calls** — in the same nav group as the Notification Outbox page:
|
||||
|
||||
- Covers cached calls only: `ExternalCall` + `DatabaseWrite`. Notifications keep their
|
||||
existing dedicated Notification Outbox page.
|
||||
- Queryable list filtered by site, kind, status, and time range. Columns: timestamp,
|
||||
site, kind, target summary, status badge, retry count, last error.
|
||||
- Retry / Discard actions on `Parked` rows, issuing the central→site commands above.
|
||||
- Headline KPI tiles on the Health dashboard alongside the existing Notification
|
||||
Outbox tiles. Stuck rows get a display-only badge — no escalation.
|
||||
- Custom Blazor Server + Bootstrap components, consistent with the rest of the
|
||||
Central UI.
|
||||
|
||||
## Error handling & edge cases
|
||||
|
||||
- **Telemetry loss** — reconciliation pull self-heals; central is explicitly
|
||||
eventually-consistent.
|
||||
- **Out-of-order / duplicate telemetry** — monotonic-status upsert keyed on
|
||||
`TrackedOperationId` makes both harmless.
|
||||
- **Permanent failure on a cached call** — error returned synchronously to the script
|
||||
(unchanged) and recorded as terminal `Failed`.
|
||||
- **Site offline during Retry/Discard** — command fails fast; UI says so; the audit
|
||||
row is unchanged until confirming telemetry arrives.
|
||||
- **Cached-call double-delivery** — still the caller's responsibility; the idempotency
|
||||
note stays in the ESG doc.
|
||||
- **Instance deletion** — tracking rows and buffered messages survive, per the
|
||||
existing S&F rule.
|
||||
|
||||
## Affected documents
|
||||
|
||||
- **New**: `docs/requirements/Component-SiteCallAudit.md`
|
||||
- `Component-ExternalSystemGateway.md` — `CachedCall`/`CachedWrite` return
|
||||
`TrackedOperationId`; `Failed` state; `Tracking.Status`.
|
||||
- `Component-StoreAndForward.md` — site-local tracking table, telemetry emission,
|
||||
reconciliation, `TrackedOperationId` on buffer entries.
|
||||
- `Component-SiteRuntime.md` — Script Runtime API: return types and
|
||||
`Tracking.Status(id)`.
|
||||
- `Component-Communication.md` — telemetry channel and
|
||||
`RetryParkedOperation`/`DiscardParkedOperation` commands.
|
||||
- `Component-Commons.md` — `TrackedOperationId`, unified status enum, telemetry
|
||||
message contracts.
|
||||
- `Component-ConfigurationDatabase.md` — `SiteCalls` table, EF mapping, migration.
|
||||
- `Component-CentralUI.md` — new Site Calls page.
|
||||
- `Component-HealthMonitoring.md` — KPI tiles on the dashboard.
|
||||
- `Component-NotificationService.md` / `Component-NotificationOutbox.md` — note the
|
||||
shared `TrackedOperationId` model and `Notify.Status` alias.
|
||||
- `README.md` — component table updated to 22 components.
|
||||
- `CLAUDE.md` — component list and Key Design Decisions.
|
||||
|
||||
## Out of scope
|
||||
|
||||
- A CLI surface for site-local Retry/Discard (can be added later if needed).
|
||||
- Merging notifications into the Site Calls page or a unified outbox component.
|
||||
- Routing cached-call delivery through central.
|
||||
@@ -0,0 +1,566 @@
|
||||
# Cached Call Tracking Implementation Plan
|
||||
|
||||
> **For Claude:** REQUIRED SUB-SKILL: Use superpowers-extended-cc:executing-plans to implement this plan task-by-task.
|
||||
|
||||
**Goal:** Give cached external system calls and cached database writes a trackable `TrackedOperationId`, backed by a site-local tracking table and a new central `Site Call Audit` component, under a tracking model unified with `Notify.Send`.
|
||||
|
||||
**Architecture:** Approach B from the design doc — a sibling central component (`Site Call Audit`), not a merged outbox. The site stays the source of truth for cached-call status; central audit is an eventually-consistent mirror fed by best-effort telemetry plus a reconciliation pull. Delivery of cached calls remains site-local.
|
||||
|
||||
**Tech Stack:** This is a design-documentation change. "Implementation" means editing Markdown design documents under `docs/requirements/`, plus `README.md` and `CLAUDE.md`. No source code is touched. The authoritative design is `docs/plans/2026-05-19-cached-call-tracking-design.md` — read it before starting.
|
||||
|
||||
**Working conventions (from `CLAUDE.md`):**
|
||||
- Edit documents in place; no copies or backups.
|
||||
- Component docs follow: Purpose, Location, Responsibilities, design sections, Dependencies, Interactions.
|
||||
- Keep cross-references accurate across all docs.
|
||||
- Use `git diff` to review before committing.
|
||||
|
||||
**Per-task workflow (replaces TDD for this docs project):**
|
||||
1. Read the target file in full first.
|
||||
2. Make the edits described.
|
||||
3. **Verify**: run `git diff <file>` and confirm the change reads correctly and matches the design doc.
|
||||
4. **Cross-reference check**: run the grep given in the task (if the task provides one); confirm no stale references.
|
||||
5. **Commit** with the given message.
|
||||
|
||||
---
|
||||
|
||||
### Task 1: Create the Site Call Audit component document
|
||||
|
||||
**Files:**
|
||||
- Create: `docs/requirements/Component-SiteCallAudit.md`
|
||||
|
||||
**Step 1: Write the new component doc**
|
||||
|
||||
Create the file following the standard component structure. Content:
|
||||
|
||||
```markdown
|
||||
# Component: Site Call Audit
|
||||
|
||||
## Purpose
|
||||
|
||||
Provides central, queryable audit and operational visibility for cached calls
|
||||
made by site scripts — `ExternalSystem.CachedCall()` and `Database.CachedWrite()`.
|
||||
Each such call carries a `TrackedOperationId`; sites report lifecycle telemetry
|
||||
to this component, which maintains a central audit record, computes KPIs, and
|
||||
relays Retry/Discard actions back to the owning site.
|
||||
|
||||
This is the second centrally-hosted observability component for site
|
||||
store-and-forward activity (the Notification Outbox is the first). Unlike the
|
||||
Notification Outbox, Site Call Audit is **not a dispatcher** — it never delivers
|
||||
anything. Cached calls are delivered by the site's Store-and-Forward Engine
|
||||
against site-local external systems and databases, which central cannot reach.
|
||||
|
||||
## Location
|
||||
|
||||
Central cluster only. A singleton actor (`SiteCallAuditActor`) on the active
|
||||
central node. Registered as component #22 in the Host role configuration.
|
||||
|
||||
## Responsibilities
|
||||
|
||||
- Ingest cached-call lifecycle telemetry from sites into the central `SiteCalls`
|
||||
table.
|
||||
- Run periodic per-site reconciliation pulls so missed telemetry self-heals.
|
||||
- Compute point-in-time KPIs (global and per-site) from the `SiteCalls` table.
|
||||
- Relay operator Retry/Discard actions for parked cached calls to the owning
|
||||
site over the command/control channel.
|
||||
- Purge terminal audit rows after a configurable retention window.
|
||||
|
||||
## The `SiteCalls` Table
|
||||
|
||||
Lives in the central MS SQL configuration database — a sibling of the
|
||||
`Notifications` table. One row per `TrackedOperationId`:
|
||||
|
||||
- **TrackedOperationId** — GUID, primary key. Generated site-side at call time.
|
||||
- **SourceSite** — site that issued the call.
|
||||
- **Kind** — `ExternalCall` or `DatabaseWrite`.
|
||||
- **TargetSummary** — external system + method name, or database connection name.
|
||||
- **Status** — `Pending`, `Retrying`, `Delivered`, `Parked`, `Failed`, `Discarded`.
|
||||
- **RetryCount** — attempts so far.
|
||||
- **LastError** — most recent error detail, if any.
|
||||
- **Provenance** — source instance / script.
|
||||
- **CreatedAtUtc**, **UpdatedAtUtc**, **TerminalAtUtc** — key timestamps.
|
||||
|
||||
## Status Lifecycle
|
||||
|
||||
`Pending → Retrying → Delivered / Parked / Failed / Discarded`
|
||||
|
||||
- **Delivered** — succeeded. A cached call that succeeds on its first immediate
|
||||
attempt is recorded directly as `Delivered`.
|
||||
- **Parked** — transient retries exhausted; awaiting manual action.
|
||||
- **Failed** — permanent failure (e.g. HTTP 4xx). The error was also returned
|
||||
synchronously to the calling script; the record captures it.
|
||||
- **Discarded** — an operator discarded a parked operation.
|
||||
|
||||
The site is the source of truth. The `SiteCalls` row is an eventually-consistent
|
||||
mirror — never queried by scripts (`Tracking.Status()` is answered site-locally).
|
||||
|
||||
## Ingest & Idempotency
|
||||
|
||||
Telemetry ingestion is **insert-if-not-exists** keyed on `TrackedOperationId`,
|
||||
then **upsert-on-newer-status**. The lifecycle is monotonic, so status only
|
||||
advances and never regresses; at-least-once and out-of-order telemetry are
|
||||
therefore harmless.
|
||||
|
||||
## Reconciliation
|
||||
|
||||
Because telemetry is best-effort, `SiteCallAuditActor` periodically — and on site
|
||||
reconnect — pulls "all tracking rows changed since cursor X" from each site.
|
||||
Gaps left by lost telemetry self-heal. Central converges to the site; the site
|
||||
never depends on central.
|
||||
|
||||
## Retry / Discard Relay
|
||||
|
||||
Parked cached calls live in the owning site's S&F buffer. Operator Retry/Discard
|
||||
from the Central UI is relayed to that site as a `RetryParkedOperation` /
|
||||
`DiscardParkedOperation` command over the command/control channel. The site
|
||||
applies the change and emits telemetry reflecting the new state; central never
|
||||
mutates the `SiteCalls` row directly. If the site is offline the command fails
|
||||
fast and the UI surfaces a "site unreachable" message.
|
||||
|
||||
## KPIs
|
||||
|
||||
Point-in-time, computed from the `SiteCalls` table, global and per-source-site,
|
||||
mirroring the Notification Outbox KPI shape:
|
||||
|
||||
- Buffered count (`Pending` + `Retrying`)
|
||||
- Parked count
|
||||
- Failed-last-interval
|
||||
- Delivered-last-interval
|
||||
- Oldest-pending age
|
||||
- Stuck count — `Pending`/`Retrying` older than a configurable threshold
|
||||
(default 10 minutes); display-only, no escalation.
|
||||
|
||||
## Retention
|
||||
|
||||
Daily purge of terminal rows (`Delivered`, `Failed`, `Discarded`) after a
|
||||
configurable window (default 365 days), matching the `Notifications` purge.
|
||||
|
||||
## Dependencies
|
||||
|
||||
- **Configuration Database**: hosts the `SiteCalls` table and its repository.
|
||||
- **Central–Site Communication**: receives cached-call telemetry and reconciliation
|
||||
responses; sends Retry/Discard commands.
|
||||
- **Store-and-Forward Engine**: the site-side origin of cached-call telemetry and
|
||||
the executor of relayed Retry/Discard commands.
|
||||
- **Commons**: `TrackedOperationId`, status enum, telemetry message contracts.
|
||||
|
||||
## Interactions
|
||||
|
||||
- **Central UI**: the Site Calls page queries this component and issues
|
||||
Retry/Discard actions.
|
||||
- **Health Monitoring**: surfaces Site Call Audit KPI tiles on the dashboard.
|
||||
- **Cluster Infrastructure**: hosts the `SiteCallAuditActor` singleton with
|
||||
active/standby failover.
|
||||
```
|
||||
|
||||
**Step 2: Verify**
|
||||
|
||||
Run: `git status --short` (a newly created, untracked file does not appear in `git diff --stat`) and open the new file.
|
||||
Expected: structure matches other `Component-*.md` files (Purpose → Interactions).
|
||||
|
||||
**Step 3: Commit**
|
||||
|
||||
```bash
|
||||
git add docs/requirements/Component-SiteCallAudit.md
|
||||
git commit -m "docs(requirements): add Site Call Audit component (#22)"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### Task 2: Add shared tracking contracts to Commons
|
||||
|
||||
**Files:**
|
||||
- Modify: `docs/requirements/Component-Commons.md` — sections `REQ-COM-1` (data types), `REQ-COM-5` (message contracts)
|
||||
|
||||
**Step 1: Edit the doc**
|
||||
|
||||
In `### REQ-COM-1: Shared Data Type System`, add `TrackedOperationId` as a shared
|
||||
type: a GUID identifying any tracked store-and-forward operation
|
||||
(`CachedCall`, `CachedWrite`, `Notify.Send`), generated caller-side at the site
|
||||
at call time, doubling as the telemetry idempotency key. Note that the existing
|
||||
`NotificationId` is the notification-domain name for this same concept.
|
||||
|
||||
Add a shared `TrackedOperationStatus` enum:
|
||||
`Pending`, `Retrying`, `Delivered`, `Parked`, `Failed`, `Discarded`.
|
||||
|
||||
In `### REQ-COM-5: Cross-Component Message Contracts`, add the cached-call
|
||||
telemetry and command contracts (additive-only, per REQ-COM-5a):
|
||||
- `CachedCallTelemetry` — `TrackedOperationId`, source site, `Kind`,
|
||||
target summary, status, retry count, last error, timestamps, provenance.
|
||||
- `CachedCallReconcileRequest` / `CachedCallReconcileResponse` — cursor-based
|
||||
per-site pull of changed tracking rows.
|
||||
- `RetryParkedOperation` / `DiscardParkedOperation` — central→site commands
|
||||
keyed by `TrackedOperationId` (generalize naming so they cover cached calls,
|
||||
not only legacy "parked message" wording).
|
||||
|
||||
**Step 2: Verify**
|
||||
|
||||
Run: `git diff docs/requirements/Component-Commons.md`
|
||||
Expected: additive only; no existing type or contract removed/renamed.
|
||||
|
||||
**Step 3: Commit**
|
||||
|
||||
```bash
|
||||
git add docs/requirements/Component-Commons.md
|
||||
git commit -m "docs(requirements): add TrackedOperationId and cached-call contracts to Commons"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### Task 3: Update the Store-and-Forward Engine doc
|
||||
|
||||
**Files:**
|
||||
- Modify: `docs/requirements/Component-StoreAndForward.md` — `Responsibilities`,
|
||||
`Message Lifecycle`, `Persistence`, `Parked Message Management`, `Message Format`
|
||||
|
||||
**Step 1: Edit the doc**
|
||||
|
||||
- **Responsibilities / Persistence**: introduce the **site-local operation
|
||||
tracking table** — a SQLite table alongside the S&F buffer DB, holding one row
|
||||
per `TrackedOperationId` for cached calls regardless of outcome. It is the
|
||||
status record; the S&F buffer remains only the retry mechanism. State that
|
||||
`Tracking.Status(id)` reads this table, that it is the source of truth, and
|
||||
that terminal rows are purged after a configurable window (default 7 days).
|
||||
- **Message Lifecycle**: a cached call that succeeds on its first immediate
|
||||
attempt is written directly as a terminal `Delivered` tracking row and never
|
||||
enters the S&F buffer. A buffered cached-call message references its
|
||||
`TrackedOperationId`.
|
||||
- Add a **telemetry emission** note: on every lifecycle transition the site emits
|
||||
`CachedCallTelemetry` to central (best-effort, at-least-once, idempotent on the
|
||||
ID) and responds to `CachedCallReconcileRequest` pulls.
|
||||
- **Parked Message Management**: note that Retry/Discard of parked cached calls
|
||||
can be driven by central via `RetryParkedOperation`/`DiscardParkedOperation`,
|
||||
after which the site emits telemetry reflecting the new state.
|
||||
- **Message Format**: add `TrackedOperationId` to the listed per-message fields.
|
||||
|
||||
Leave the notification category behavior unchanged.
|
||||
|
||||
**Step 2: Verify**
|
||||
|
||||
Run: `git diff docs/requirements/Component-StoreAndForward.md`
|
||||
Expected: cached-call and DB-write categories gain tracking; notification flow untouched.
|
||||
|
||||
**Step 3: Commit**
|
||||
|
||||
```bash
|
||||
git add docs/requirements/Component-StoreAndForward.md
|
||||
git commit -m "docs(requirements): add site-local tracking table and telemetry to Store-and-Forward"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### Task 4: Update the External System Gateway doc
|
||||
|
||||
**Files:**
|
||||
- Modify: `docs/requirements/Component-ExternalSystemGateway.md` — `Cached Write`,
|
||||
`External System Call Modes`, `Call Timeout & Error Handling`
|
||||
|
||||
**Step 1: Edit the doc**
|
||||
|
||||
- `### Cached (Store-and-Forward)` and `### Cached Write (Store-and-Forward)`:
|
||||
state that `CachedCall`/`CachedWrite` now return a `TrackedOperationId`. They
|
||||
are no longer "fire-and-forget" with no handle — replace that wording with
|
||||
"deferred-delivery, returns a tracking handle". Immediate success → terminal
|
||||
`Delivered` record; transient failure → buffered, `Pending`/`Retrying`.
|
||||
- Permanent failure: the error is still returned synchronously to the script
|
||||
(unchanged) **and** recorded as a terminal `Failed` tracking record.
|
||||
- Keep the idempotency note — duplicate delivery on retry is still the caller's
|
||||
responsibility.
|
||||
- Add a one-line pointer that status is observable via `Tracking.Status(id)` and
|
||||
centrally via the Site Call Audit component.
|
||||
|
||||
**Step 2: Verify**
|
||||
|
||||
Run: `grep -n "fire-and-forget\|TrackedOperationId" docs/requirements/Component-ExternalSystemGateway.md`
|
||||
Expected: "fire-and-forget" no longer describes cached calls; `TrackedOperationId` present.
|
||||
|
||||
**Step 3: Commit**
|
||||
|
||||
```bash
|
||||
git add docs/requirements/Component-ExternalSystemGateway.md
|
||||
git commit -m "docs(requirements): cached calls return TrackedOperationId in ESG"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### Task 5: Update the Site Runtime Script Runtime API
|
||||
|
||||
**Files:**
|
||||
- Modify: `docs/requirements/Component-SiteRuntime.md` — `### External Systems`,
|
||||
`### Notifications`, `### Database Access` under `## Script Runtime API`
|
||||
|
||||
**Step 1: Edit the doc**
|
||||
|
||||
- `### External Systems`: `ExternalSystem.CachedCall(...)` now returns a
|
||||
`TrackedOperationId`; drop "fire-and-forget", say it returns a tracking handle.
|
||||
- `### Database Access`: `Database.CachedWrite(...)` now returns a
|
||||
`TrackedOperationId`.
|
||||
- Add the unified accessor `Tracking.Status("trackedOperationId")` — returns a
|
||||
status record (status, retry count, last error, key timestamps) for any tracked
|
||||
operation, answered site-locally and authoritatively for cached calls.
|
||||
- `### Notifications`: note that `Notify.Status(...)` is retained as a thin alias
|
||||
of `Tracking.Status(...)`; `Notify.Send` returns a `TrackedOperationId`
|
||||
(the value historically called `NotificationId`).
|
||||
|
||||
**Step 2: Verify**
|
||||
|
||||
Run: `git diff docs/requirements/Component-SiteRuntime.md`
|
||||
Expected: all three cached/async producers return `TrackedOperationId`; `Tracking.Status` documented.
|
||||
|
||||
**Step 3: Commit**
|
||||
|
||||
```bash
|
||||
git add docs/requirements/Component-SiteRuntime.md
|
||||
git commit -m "docs(requirements): add Tracking.Status and cached-call handles to Script Runtime API"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### Task 6: Update the Central–Site Communication doc
|
||||
|
||||
**Files:**
|
||||
- Modify: `docs/requirements/Component-Communication.md` — `### 8. Remote Queries`,
|
||||
and add a new pattern for cached-call telemetry
|
||||
|
||||
**Step 1: Edit the doc**
|
||||
|
||||
- Add a new communication pattern (e.g. `### 10. Cached Call Telemetry (Site → Central)`):
|
||||
the site S&F Engine pushes `CachedCallTelemetry` on every lifecycle transition;
|
||||
best-effort, at-least-once, idempotent on `TrackedOperationId`; transport is
|
||||
ClusterClient command/control. Also describe the reconciliation pull
|
||||
(`CachedCallReconcileRequest`/`Response`) initiated by `SiteCallAuditActor`.
|
||||
- `### 8. Remote Queries (Central → Site)`: generalize the "Retry or discard
|
||||
parked messages" command line to also cover cached calls keyed by
|
||||
`TrackedOperationId` (`RetryParkedOperation` / `DiscardParkedOperation`).
|
||||
|
||||
**Step 2: Verify**
|
||||
|
||||
Run: `grep -n "Telemetry\|RetryParkedOperation" docs/requirements/Component-Communication.md`
|
||||
Expected: new telemetry pattern and generalized command present.
|
||||
|
||||
**Step 3: Commit**
|
||||
|
||||
```bash
|
||||
git add docs/requirements/Component-Communication.md
|
||||
git commit -m "docs(requirements): add cached-call telemetry pattern to Communication"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### Task 7: Update the Configuration Database doc
|
||||
|
||||
**Files:**
|
||||
- Modify: `docs/requirements/Component-ConfigurationDatabase.md` — `## Database Schema`
|
||||
(add a `### Site Calls` subsection), `## Scheduled Maintenance`
|
||||
|
||||
**Step 1: Edit the doc**
|
||||
|
||||
- Under `## Database Schema`, add a `### Site Calls` subsection describing the
|
||||
`SiteCalls` table (columns per Task 1's "The `SiteCalls` Table" list), noting
|
||||
it is populated only by Site Call Audit telemetry/reconciliation, and that
|
||||
ingestion is insert-if-not-exists + upsert-on-newer-status.
|
||||
- Under `## Scheduled Maintenance`, add a `### SiteCalls Table Purge` subsection
|
||||
mirroring the `### Notifications Table Purge` wording: daily purge of terminal
|
||||
rows after a configurable window (default 365 days).
|
||||
|
||||
**Step 2: Verify**
|
||||
|
||||
Run: `grep -n "SiteCalls" docs/requirements/Component-ConfigurationDatabase.md`
|
||||
Expected: schema subsection and purge subsection both present.
|
||||
|
||||
**Step 3: Commit**
|
||||
|
||||
```bash
|
||||
git add docs/requirements/Component-ConfigurationDatabase.md
|
||||
git commit -m "docs(requirements): add SiteCalls table and purge to Configuration Database"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### Task 8: Update the Central UI doc
|
||||
|
||||
**Files:**
|
||||
- Modify: `docs/requirements/Component-CentralUI.md` — `## Workflows / Pages`
|
||||
|
||||
**Step 1: Edit the doc**
|
||||
|
||||
Add a `### Site Calls (Deployment Role)` page after the
|
||||
`### Notification Outbox (Deployment Role)` section:
|
||||
- Queryable list of cached calls (`ExternalCall` + `DatabaseWrite` only —
|
||||
notifications keep their own Notification Outbox page).
|
||||
- Filters: site, kind, status, time range.
|
||||
- Columns: timestamp, site, kind, target summary, status badge, retry count,
|
||||
last error.
|
||||
- Retry / Discard actions on `Parked` rows; "site unreachable" handling when the
|
||||
owning site is offline.
|
||||
- Custom Blazor Server + Bootstrap components, no third-party frameworks.
|
||||
|
||||
**Step 2: Verify**
|
||||
|
||||
Run: `grep -n "Site Calls" docs/requirements/Component-CentralUI.md`
|
||||
Expected: new page section present, scoped to cached calls.
|
||||
|
||||
**Step 3: Commit**
|
||||
|
||||
```bash
|
||||
git add docs/requirements/Component-CentralUI.md
|
||||
git commit -m "docs(requirements): add Site Calls page to Central UI"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### Task 9: Update the Health Monitoring doc
|
||||
|
||||
**Files:**
|
||||
- Modify: `docs/requirements/Component-HealthMonitoring.md` — add a
|
||||
`## Site Call Audit KPIs` section after `## Notification Outbox KPIs`
|
||||
|
||||
**Step 1: Edit the doc**
|
||||
|
||||
Add a `## Site Call Audit KPIs` section mirroring `## Notification Outbox KPIs`:
|
||||
the dashboard surfaces Site Call Audit headline KPI tiles (buffered, parked,
|
||||
failed-last-interval, delivered-last-interval, oldest-pending age, stuck count),
|
||||
computed point-in-time by the Site Call Audit component, global and per-site.
|
||||
Stuck is display-only.
|
||||
|
||||
**Step 2: Verify**
|
||||
|
||||
Run: `grep -n "Site Call Audit KPIs" docs/requirements/Component-HealthMonitoring.md`
|
||||
Expected: section present.
|
||||
|
||||
**Step 3: Commit**
|
||||
|
||||
```bash
|
||||
git add docs/requirements/Component-HealthMonitoring.md
|
||||
git commit -m "docs(requirements): add Site Call Audit KPIs to Health Monitoring"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### Task 10: Note the shared model in Notification docs
|
||||
|
||||
**Files:**
|
||||
- Modify: `docs/requirements/Component-NotificationService.md` — `## Script API`
|
||||
- Modify: `docs/requirements/Component-NotificationOutbox.md` — `## Purpose` or
|
||||
`### Status Lifecycle`
|
||||
|
||||
**Step 1: Edit the doc**
|
||||
|
||||
- `Component-NotificationService.md` `## Script API`: note that `Notify.Send`'s
|
||||
`NotificationId` is a `TrackedOperationId` (shared Commons type) and
|
||||
`Notify.Status` is an alias of the unified `Tracking.Status`.
|
||||
- `Component-NotificationOutbox.md`: add a sentence that the Notification Outbox
|
||||
and the Site Call Audit component share the `TrackedOperationId` tracking
|
||||
model and status lifecycle, but differ in delivery locality — the Notification
|
||||
Outbox delivers; Site Call Audit only audits.
|
||||
|
||||
Do not change any notification behavior.
|
||||
|
||||
**Step 2: Verify**
|
||||
|
||||
Run: `git diff docs/requirements/Component-NotificationService.md docs/requirements/Component-NotificationOutbox.md`
|
||||
Expected: additive notes only, no behavior change.
|
||||
|
||||
**Step 3: Commit**
|
||||
|
||||
```bash
|
||||
git add docs/requirements/Component-NotificationService.md docs/requirements/Component-NotificationOutbox.md
|
||||
git commit -m "docs(requirements): note shared TrackedOperationId model in notification docs"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### Task 11: Update the README component table
|
||||
|
||||
**Files:**
|
||||
- Modify: `README.md` — component table and any architecture diagram component count
|
||||
|
||||
**Step 1: Edit the doc**
|
||||
|
||||
Add row 22 — **Site Call Audit** — to the component table:
|
||||
"Central component auditing site cached calls (`CachedCall`/`CachedWrite`);
|
||||
`SiteCalls` table, telemetry ingest, reconciliation, KPIs, central→site
|
||||
Retry/Discard relay." Update any "21 components" count to 22.
|
||||
|
||||
**Step 2: Verify**
|
||||
|
||||
Run: `grep -rn "21 component\|22 component" README.md`
|
||||
Expected: count reads 22; no stale "21".
|
||||
|
||||
**Step 3: Commit**
|
||||
|
||||
```bash
|
||||
git add README.md
|
||||
git commit -m "docs: add Site Call Audit to README component table"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### Task 12: Update CLAUDE.md
|
||||
|
||||
**Files:**
|
||||
- Modify: `CLAUDE.md` — `## Current Component List`, `## Key Design Decisions`
|
||||
|
||||
**Step 1: Edit the doc**
|
||||
|
||||
- Change the heading `## Current Component List (21 components)` to `(22 components)`
|
||||
and add item 22 — **Site Call Audit** — with a one-line description.
|
||||
- Under `## Key Design Decisions`, in `### Store-and-Forward` (or `### UI & Monitoring`),
|
||||
add bullets summarizing: cached calls return a `TrackedOperationId`; site-local
|
||||
tracking table is the status source of truth; new central Site Call Audit
|
||||
component mirrors status via best-effort telemetry + reconciliation; cached-call
|
||||
delivery stays site-local; unified `Tracking.Status` accessor; `Failed` terminal
|
||||
state for permanent failures.
|
||||
|
||||
**Step 2: Verify**
|
||||
|
||||
Run: `grep -n "22 components\|Site Call Audit" CLAUDE.md`
|
||||
Expected: count is 22; component listed; design decisions present.
|
||||
|
||||
**Step 3: Commit**
|
||||
|
||||
```bash
|
||||
git add CLAUDE.md
|
||||
git commit -m "docs: record cached-call tracking in CLAUDE.md"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### Task 13: Final cross-reference consistency pass
|
||||
|
||||
**Files:**
|
||||
- Potentially any `docs/requirements/Component-*.md`, `README.md`, `CLAUDE.md`
|
||||
|
||||
**Step 1: Sweep for stale or missing references**
|
||||
|
||||
Run each and review:
|
||||
```bash
|
||||
grep -rn "fire-and-forget" docs/requirements/
|
||||
grep -rn "21 component" README.md CLAUDE.md
|
||||
grep -rln "Site Call Audit" docs/requirements/ README.md CLAUDE.md
|
||||
grep -rn "TrackedOperationId" docs/requirements/
|
||||
```
|
||||
Expected: no "fire-and-forget" describing cached calls; no "21 component" left;
|
||||
Site Call Audit referenced by its dependents (Communication, Configuration
|
||||
Database, Central UI, Health Monitoring, Commons); `TrackedOperationId` used
|
||||
consistently.
|
||||
|
||||
**Step 2: Confirm new component's Dependencies/Interactions are reciprocated**
|
||||
|
||||
Verify each component named in `Component-SiteCallAudit.md` Dependencies/Interactions
|
||||
also references Site Call Audit where appropriate.
|
||||
|
||||
**Step 3: Fix any gaps found, then commit**
|
||||
|
||||
```bash
|
||||
git add -A
|
||||
git commit -m "docs(requirements): reconcile cross-references for Site Call Audit"
|
||||
```
|
||||
|
||||
If no gaps are found, skip the commit and note the plan is complete.
|
||||
|
||||
---
|
||||
|
||||
## Done
|
||||
|
||||
All cached-call tracking design changes are recorded. The design rationale lives
|
||||
in `docs/plans/2026-05-19-cached-call-tracking-design.md`.
|
||||
@@ -0,0 +1,19 @@
|
||||
{
|
||||
"planPath": "docs/plans/2026-05-19-cached-call-tracking.md",
|
||||
"tasks": [
|
||||
{"id": 6, "subject": "Task 1: Create Site Call Audit component doc", "status": "pending"},
|
||||
{"id": 7, "subject": "Task 2: Add tracking contracts to Commons", "status": "pending", "blockedBy": [6]},
|
||||
{"id": 8, "subject": "Task 3: Update Store-and-Forward doc", "status": "pending", "blockedBy": [6, 7]},
|
||||
{"id": 9, "subject": "Task 4: Update External System Gateway doc", "status": "pending", "blockedBy": [6, 7]},
|
||||
{"id": 10, "subject": "Task 5: Update Site Runtime Script Runtime API", "status": "pending", "blockedBy": [6, 7]},
|
||||
{"id": 11, "subject": "Task 6: Update Communication doc", "status": "pending", "blockedBy": [6, 7]},
|
||||
{"id": 12, "subject": "Task 7: Update Configuration Database doc", "status": "pending", "blockedBy": [6, 7]},
|
||||
{"id": 13, "subject": "Task 8: Update Central UI doc", "status": "pending", "blockedBy": [6, 7]},
|
||||
{"id": 14, "subject": "Task 9: Update Health Monitoring doc", "status": "pending", "blockedBy": [6, 7]},
|
||||
{"id": 15, "subject": "Task 10: Note shared model in notification docs", "status": "pending", "blockedBy": [6, 7]},
|
||||
{"id": 16, "subject": "Task 11: Update README component table", "status": "pending", "blockedBy": [6]},
|
||||
{"id": 17, "subject": "Task 12: Update CLAUDE.md", "status": "pending", "blockedBy": [6]},
|
||||
{"id": 18, "subject": "Task 13: Final cross-reference consistency pass", "status": "pending", "blockedBy": [6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17]}
|
||||
],
|
||||
"lastUpdated": "2026-05-19"
|
||||
}
|
||||
@@ -0,0 +1,717 @@
|
||||
# Notification Outbox — Code Implementation Plan
|
||||
|
||||
> **For Claude:** REQUIRED SUB-SKILL: Use superpowers-extended-cc:executing-plans to implement this plan task-by-task.
|
||||
|
||||
**Goal:** Build the central Notification Outbox feature in the ScadaLink `src/` codebase — sites store-and-forward notifications to the central cluster, which logs each to a `Notifications` table and delivers it via per-type adapters with retry, parking, status handles, and KPIs.
|
||||
|
||||
**Architecture:** A new `ScadaLink.NotificationOutbox` project hosts a `NotificationOutboxActor` cluster singleton on the active central node. Sites enqueue notifications into the existing site Store-and-Forward Engine (notification category, retargeted from SMTP to "central"); the S&F engine forwards them to central via `ClusterClient`; the `CentralCommunicationActor` routes each `NotificationSubmit` to the outbox singleton, which inserts a row into the central MS SQL `Notifications` table (insert-if-not-exists on a site-generated `NotificationId` GUID) and acks. A timer-driven dispatcher polls due rows and delivers them through an `INotificationDeliveryAdapter` (Email adapter now; Teams later). A Blazor page surfaces KPIs and a queryable list with Retry/Discard.
|
||||
|
||||
**Tech Stack:** .NET 10, Akka.NET (cluster singletons, ClusterClient, TestKit), EF Core (MS SQL; SQLite in-memory for tests), Blazor Server + Bootstrap, xUnit + NSubstitute + bUnit. Solution: `ScadaLink.slnx`.
|
||||
|
||||
**Authoritative design:** `docs/plans/notif.md` and `docs/requirements/Component-NotificationOutbox.md`. Read both before starting.
|
||||
|
||||
---
|
||||
|
||||
## Conventions (read once, applies to every task)
|
||||
|
||||
These were confirmed by exploring the existing codebase. Follow them in every task.
|
||||
|
||||
- **Entities (Commons):** POCOs in `src/ScadaLink.Commons/Entities/<Area>/`. Auto-properties, parameterized constructor with null checks, navigation collections initialised to `new List<T>()`. No data annotations.
|
||||
- **EF mapping (ConfigurationDatabase):** Fluent `IEntityTypeConfiguration<T>` classes in `src/ScadaLink.ConfigurationDatabase/Configurations/`, auto-applied by `ApplyConfigurationsFromAssembly`. Enums stored as strings via `.HasConversion<string>()`. Add a `DbSet<T>` to `ScadaLinkDbContext`.
|
||||
- **Repositories:** Interface in `src/ScadaLink.Commons/Interfaces/Repositories/`, implementation in `src/ScadaLink.ConfigurationDatabase/Repositories/`. Inject `ScadaLinkDbContext`, use `_context.Set<T>()`, expose explicit `SaveChangesAsync`. Register in `ConfigurationDatabase/ServiceCollectionExtensions.cs` with `AddScoped`.
|
||||
- **Migrations:** `dotnet ef migrations add <Name> --project src/ScadaLink.ConfigurationDatabase` — timestamp-named. Applied via `MigrationHelper.ApplyOrValidateMigrationsAsync` (auto in dev).
|
||||
- **Message contracts (Commons):** `record` types in `src/ScadaLink.Commons/Messages/<Area>/`, named positional params, additive-only evolution.
|
||||
- **Options pattern:** `<Component>Options` class owned by the component project; component's `ServiceCollectionExtensions.Add<Component>()` calls `services.AddOptions<T>().BindConfiguration("ScadaLink:<Section>")`; Host also `services.Configure<T>(...)`. Config lives in `appsettings.Central.json` / `appsettings.Site.json`.
|
||||
- **Actors:** No Akka.DI framework. Dependencies passed via `Props.Create(() => new XActor(...))`. Actors that need scoped services take `IServiceProvider` and call `CreateScope()`. Cluster singletons use `ClusterSingletonManager.Props` + `ClusterSingletonProxy.Props`, created in `src/ScadaLink.Host/Actors/AkkaHostedService.cs`.
|
||||
- **Tests:** xUnit, NSubstitute, built-in `Assert`. One `tests/ScadaLink.<Component>.Tests/` project per `src/` project. Actor tests inherit `Akka.TestKit.Xunit2.TestKit`. Repository tests use SQLite in-memory (`DataSource=:memory:`, `OpenConnection()` + `EnsureCreated()`, `IDisposable`). Blazor tests inherit bUnit `BunitContext`. Test naming: `Method_Scenario_Result`.
|
||||
- **Run tests:** whole suite `dotnet test ScadaLink.slnx`; single project `dotnet test tests/ScadaLink.<X>.Tests/ScadaLink.<X>.Tests.csproj`; single test `--filter "FullyQualifiedName~<Class>.<Method>"`.
|
||||
- **Build:** `dotnet build ScadaLink.slnx`.
|
||||
- **TDD:** every task writes the failing test first, runs it red, implements, runs it green, commits. Use the superpowers-extended-cc:test-driven-development discipline.
|
||||
- **Commits:** one per task, message `feat(notification-outbox): <task summary>`.
|
||||
|
||||
**Status lifecycle** (central `Notifications` table — `Forwarding` is site-local, never stored centrally):
|
||||
`Pending → Retrying → Delivered | Parked`, plus `Discarded` (operator action only).
|
||||
|
||||
---
|
||||
|
||||
## Phase A — Data layer (Commons + ConfigurationDatabase)
|
||||
|
||||
### Task 1: Notification enums
|
||||
|
||||
**Files:**
|
||||
- Create: `src/ScadaLink.Commons/Types/Enums/NotificationType.cs`
|
||||
- Create: `src/ScadaLink.Commons/Types/Enums/NotificationStatus.cs`
|
||||
- Test: `tests/ScadaLink.Commons.Tests/Types/NotificationEnumTests.cs` (create the `Types/` folder if the test project does not already have one)
|
||||
|
||||
**Step 1 — failing test.** Assert the enums expose exactly the expected members:
|
||||
```csharp
|
||||
[Fact]
|
||||
public void NotificationStatus_HasExactlyTheCentralStates()
|
||||
{
|
||||
var names = Enum.GetNames<NotificationStatus>();
|
||||
Assert.Equal(
|
||||
new[] { "Pending", "Retrying", "Delivered", "Parked", "Discarded" },
|
||||
names);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void NotificationType_HasEmail()
|
||||
{
|
||||
Assert.True(Enum.IsDefined(NotificationType.Email));
|
||||
}
|
||||
```
|
||||
Note: `Forwarding` is intentionally NOT a `NotificationStatus` member — it is a site-local concept (Task 19), never persisted centrally.
|
||||
|
||||
**Step 2 — run red:** `dotnet test tests/ScadaLink.Commons.Tests/ScadaLink.Commons.Tests.csproj --filter "FullyQualifiedName~NotificationEnumTests"` → FAIL (types don't exist).
|
||||
|
||||
**Step 3 — implement.**
|
||||
```csharp
|
||||
// NotificationType.cs — namespace ScadaLink.Commons.Types.Enums
|
||||
public enum NotificationType { Email } // Teams and others added later
|
||||
|
||||
// NotificationStatus.cs — namespace ScadaLink.Commons.Types.Enums
|
||||
public enum NotificationStatus { Pending, Retrying, Delivered, Parked, Discarded }
|
||||
```
|
||||
|
||||
**Step 4 — run green.** Same filter → PASS.
|
||||
|
||||
**Step 5 — commit:**
|
||||
```bash
|
||||
git add src/ScadaLink.Commons/Types/Enums/NotificationType.cs src/ScadaLink.Commons/Types/Enums/NotificationStatus.cs tests/ScadaLink.Commons.Tests/Types/NotificationEnumTests.cs
|
||||
git commit -m "feat(notification-outbox): add NotificationType and NotificationStatus enums"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### Task 2: `Notification` entity POCO
|
||||
|
||||
**Files:**
|
||||
- Create: `src/ScadaLink.Commons/Entities/Notifications/Notification.cs`
|
||||
- Test: `tests/ScadaLink.Commons.Tests/Entities/NotificationEntityTests.cs`
|
||||
|
||||
**Step 1 — failing test.** Verify the constructor sets required fields, defaults `Status` to `Pending` and `RetryCount` to 0, and rejects nulls:
|
||||
```csharp
|
||||
[Fact]
|
||||
public void Constructor_SetsDefaults()
|
||||
{
|
||||
var n = new Notification("id-1", NotificationType.Email, "ops-team", "subj", "body", "SiteA");
|
||||
Assert.Equal(NotificationStatus.Pending, n.Status);
|
||||
Assert.Equal(0, n.RetryCount);
|
||||
Assert.Equal("id-1", n.NotificationId);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void Constructor_NullListName_Throws()
|
||||
=> Assert.Throws<ArgumentNullException>(
|
||||
() => new Notification("id", NotificationType.Email, null!, "s", "b", "SiteA"));
|
||||
```
|
||||
|
||||
**Step 2 — run red.**
|
||||
|
||||
**Step 3 — implement.** Match the `Notifications` table schema in `notif.md`:
|
||||
```csharp
|
||||
namespace ScadaLink.Commons.Entities.Notifications;
|
||||
|
||||
public class Notification
|
||||
{
|
||||
public string NotificationId { get; set; } // GUID PK, generated at site
|
||||
public NotificationType Type { get; set; }
|
||||
public string ListName { get; set; }
|
||||
public string Subject { get; set; }
|
||||
public string Body { get; set; }
|
||||
public string? TypeData { get; set; } // JSON extensibility hook
|
||||
public NotificationStatus Status { get; set; } = NotificationStatus.Pending;
|
||||
public int RetryCount { get; set; }
|
||||
public string? LastError { get; set; }
|
||||
public string? ResolvedTargets { get; set; } // snapshotted at delivery, for audit
|
||||
public string SourceSiteId { get; set; }
|
||||
public string? SourceInstanceId { get; set; }
|
||||
public string? SourceScript { get; set; }
|
||||
public DateTimeOffset SiteEnqueuedAt { get; set; }
|
||||
public DateTimeOffset CreatedAt { get; set; } // central ingest time
|
||||
public DateTimeOffset? LastAttemptAt { get; set; }
|
||||
public DateTimeOffset? NextAttemptAt { get; set; }
|
||||
public DateTimeOffset? DeliveredAt { get; set; }
|
||||
|
||||
public Notification(string notificationId, NotificationType type, string listName,
|
||||
string subject, string body, string sourceSiteId)
|
||||
{
|
||||
NotificationId = notificationId ?? throw new ArgumentNullException(nameof(notificationId));
|
||||
Type = type;
|
||||
ListName = listName ?? throw new ArgumentNullException(nameof(listName));
|
||||
Subject = subject ?? throw new ArgumentNullException(nameof(subject));
|
||||
Body = body ?? throw new ArgumentNullException(nameof(body));
|
||||
SourceSiteId = sourceSiteId ?? throw new ArgumentNullException(nameof(sourceSiteId));
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
**Step 4 — run green. Step 5 — commit** (`feat(notification-outbox): add Notification entity`).
|
||||
|
||||
---
|
||||
|
||||
### Task 3: `Type` field on `NotificationList`
|
||||
|
||||
**Files:**
|
||||
- Modify: `src/ScadaLink.Commons/Entities/Notifications/NotificationList.cs`
|
||||
- Modify: `src/ScadaLink.ConfigurationDatabase/Configurations/NotificationConfiguration.cs` (`NotificationListConfiguration`)
|
||||
- Test: `tests/ScadaLink.ConfigurationDatabase.Tests/RepositoryTests.cs` (add a test to the notification repository tests)
|
||||
|
||||
**Step 1 — failing test.** A `NotificationList` round-trips its `Type` through the repository:
|
||||
```csharp
|
||||
[Fact]
|
||||
public async Task NotificationList_PersistsType()
|
||||
{
|
||||
var list = new NotificationList("ops") { Type = NotificationType.Email };
|
||||
await _notificationRepo.AddNotificationListAsync(list);
|
||||
await _notificationRepo.SaveChangesAsync();
|
||||
_context.ChangeTracker.Clear();
|
||||
var loaded = await _notificationRepo.GetListByNameAsync("ops");
|
||||
Assert.Equal(NotificationType.Email, loaded!.Type);
|
||||
}
|
||||
```
|
||||
|
||||
**Step 2 — run red.**
|
||||
|
||||
**Step 3 — implement.** Add to `NotificationList`: `public NotificationType Type { get; set; } = NotificationType.Email;`. In `NotificationListConfiguration.Configure`, add `builder.Property(n => n.Type).HasConversion<string>().HasMaxLength(32).IsRequired();`.
|
||||
|
||||
**Step 4 — run green. Step 5 — commit** (`feat(notification-outbox): add Type field to NotificationList`).
|
||||
|
||||
---
|
||||
|
||||
### Task 4: `Notification` EF configuration + DbSet
|
||||
|
||||
**Files:**
|
||||
- Create: `src/ScadaLink.ConfigurationDatabase/Configurations/NotificationOutboxConfiguration.cs`
|
||||
- Modify: `src/ScadaLink.ConfigurationDatabase/ScadaLinkDbContext.cs` (add `DbSet<Notification>`)
|
||||
- Test: `tests/ScadaLink.ConfigurationDatabase.Tests/RepositoryTests.cs`
|
||||
|
||||
**Step 1 — failing test.** A `Notification` round-trips all fields through the `DbContext` (use the SQLite in-memory fixture pattern). Assert the `Status`/`Type` enums persist as strings and the row is found by `NotificationId`.
|
||||
|
||||
**Step 2 — run red.**
|
||||
|
||||
**Step 3 — implement.** Configuration:
|
||||
```csharp
|
||||
public class NotificationOutboxConfiguration : IEntityTypeConfiguration<Notification>
|
||||
{
|
||||
public void Configure(EntityTypeBuilder<Notification> builder)
|
||||
{
|
||||
builder.ToTable("Notifications");
|
||||
builder.HasKey(n => n.NotificationId);
|
||||
builder.Property(n => n.NotificationId).HasMaxLength(64);
|
||||
builder.Property(n => n.Type).HasConversion<string>().HasMaxLength(32).IsRequired();
|
||||
builder.Property(n => n.Status).HasConversion<string>().HasMaxLength(32).IsRequired();
|
||||
builder.Property(n => n.ListName).HasMaxLength(200).IsRequired();
|
||||
builder.Property(n => n.Subject).HasMaxLength(1000).IsRequired();
|
||||
builder.Property(n => n.Body).IsRequired(); // nvarchar(max)
|
||||
builder.Property(n => n.TypeData); // nvarchar(max), nullable
|
||||
builder.Property(n => n.ResolvedTargets); // nvarchar(max), nullable
|
||||
builder.Property(n => n.LastError).HasMaxLength(4000);
|
||||
builder.Property(n => n.SourceSiteId).HasMaxLength(100).IsRequired();
|
||||
builder.Property(n => n.SourceInstanceId).HasMaxLength(200);
|
||||
builder.Property(n => n.SourceScript).HasMaxLength(200);
|
||||
builder.HasIndex(n => new { n.Status, n.NextAttemptAt }); // dispatcher polling
|
||||
builder.HasIndex(n => new { n.SourceSiteId, n.CreatedAt }); // KPIs / UI query
|
||||
}
|
||||
}
|
||||
```
|
||||
Add `public DbSet<Notification> Notifications => Set<Notification>();` to `ScadaLinkDbContext`.
|
||||
|
||||
**Step 4 — run green. Step 5 — commit** (`feat(notification-outbox): add Notification EF configuration and DbSet`).
|
||||
|
||||
---
|
||||
|
||||
### Task 5: `INotificationOutboxRepository` + implementation
|
||||
|
||||
**Files:**
|
||||
- Create: `src/ScadaLink.Commons/Interfaces/Repositories/INotificationOutboxRepository.cs`
|
||||
- Create: `src/ScadaLink.ConfigurationDatabase/Repositories/NotificationOutboxRepository.cs`
|
||||
- Modify: `src/ScadaLink.ConfigurationDatabase/ServiceCollectionExtensions.cs` (register `AddScoped`)
|
||||
- Test: `tests/ScadaLink.ConfigurationDatabase.Tests/RepositoryTests.cs`
|
||||
|
||||
**Step 1 — failing tests.** Cover the operations the outbox actor needs:
|
||||
- `InsertIfNotExistsAsync` inserts a new row and returns `true`; a second call with the same `NotificationId` returns `false` and does not duplicate (idempotency key).
|
||||
- `GetDueAsync(now, batchSize)` returns `Pending` rows and `Retrying` rows with `NextAttemptAt <= now`, ordered by `CreatedAt`, capped at `batchSize`.
|
||||
- `UpdateAsync` persists status transitions.
|
||||
- `GetByIdAsync` returns a row or null.
|
||||
- `QueryAsync(filter, page, pageSize)` filters by status/type/source site and paginates.
|
||||
- `DeleteTerminalOlderThanAsync(cutoff)` bulk-deletes `Delivered`/`Parked`/`Discarded` rows older than `cutoff` and returns the count; leaves non-terminal rows.
|
||||
- `ComputeKpisAsync` returns queue depth, stuck count, parked count, delivered-last-window, oldest-pending age.
|
||||
|
||||
**Step 2 — run red.**
|
||||
|
||||
**Step 3 — implement.** Interface:
|
||||
```csharp
|
||||
public interface INotificationOutboxRepository
|
||||
{
|
||||
Task<bool> InsertIfNotExistsAsync(Notification n, CancellationToken ct = default);
|
||||
Task<IReadOnlyList<Notification>> GetDueAsync(DateTimeOffset now, int batchSize, CancellationToken ct = default);
|
||||
Task<Notification?> GetByIdAsync(string notificationId, CancellationToken ct = default);
|
||||
Task UpdateAsync(Notification n, CancellationToken ct = default);
|
||||
Task<(IReadOnlyList<Notification> Rows, int TotalCount)> QueryAsync(
|
||||
NotificationOutboxFilter filter, int pageNumber, int pageSize, CancellationToken ct = default);
|
||||
Task<int> DeleteTerminalOlderThanAsync(DateTimeOffset cutoff, CancellationToken ct = default);
|
||||
Task<NotificationKpiSnapshot> ComputeKpisAsync(DateTimeOffset stuckCutoff, DateTimeOffset deliveredSince, CancellationToken ct = default);
|
||||
Task<int> SaveChangesAsync(CancellationToken ct = default);
|
||||
}
|
||||
```
|
||||
`NotificationOutboxFilter` (a `record` in Commons `Types/`) and `NotificationKpiSnapshot` (a `record`) are created in this task alongside the interface. `InsertIfNotExistsAsync`: check `await _context.Notifications.FindAsync(...)`, if present return false, else `AddAsync` + `SaveChangesAsync`, return true. `DeleteTerminalOlderThanAsync`: use `ExecuteDeleteAsync` with a `Where` on terminal statuses and `CreatedAt < cutoff`. Register in `ServiceCollectionExtensions.AddConfigurationDatabase`.
|
||||
|
||||
**Step 4 — run green. Step 5 — commit** (`feat(notification-outbox): add NotificationOutbox repository`).
|
||||
|
||||
---
|
||||
|
||||
### Task 6: EF migration
|
||||
|
||||
**Files:**
|
||||
- Create: `src/ScadaLink.ConfigurationDatabase/Migrations/<timestamp>_AddNotificationsTable.cs` (generated)
|
||||
|
||||
**Step 1 — generate:**
|
||||
```bash
|
||||
dotnet ef migrations add AddNotificationsTable --project src/ScadaLink.ConfigurationDatabase
|
||||
```
|
||||
This also picks up the `NotificationList.Type` column from Task 3.
|
||||
|
||||
**Step 2 — verify.** Inspect the generated migration: confirm a `CreateTable` for `Notifications` with the columns and two indexes from Task 4, and an `AddColumn` for `NotificationLists.Type` (the column is new in Task 3, so expect `AddColumn`, not `AlterColumn`). Then confirm `dotnet build ScadaLink.slnx` succeeds and run the ConfigurationDatabase test project. Note that the SQLite `EnsureCreated()` fixture builds its schema from the model, so those tests validate the model but do not execute the migration itself.
|
||||
|
||||
**Step 3 — run:** `dotnet test tests/ScadaLink.ConfigurationDatabase.Tests/ScadaLink.ConfigurationDatabase.Tests.csproj` → PASS.
|
||||
|
||||
**Step 4 — commit** (`feat(notification-outbox): add Notifications table migration`).
|
||||
|
||||
---
|
||||
|
||||
## Phase B — Message contracts (Commons)
|
||||
|
||||
### Task 7: Site↔central notification message contracts
|
||||
|
||||
**Files:**
|
||||
- Create: `src/ScadaLink.Commons/Messages/Notification/NotificationMessages.cs`
|
||||
- Test: `tests/ScadaLink.Commons.Tests/Messages/NotificationMessagesTests.cs`
|
||||
|
||||
**Step 1 — failing test.** A trivial construction/round-trip test (these are records — assert positional construction and value equality; if the project has a serialization test helper, round-trip through it).
|
||||
|
||||
**Step 2 — run red.**
|
||||
|
||||
**Step 3 — implement.** Namespace `ScadaLink.Commons.Messages.Notification`:
|
||||
```csharp
|
||||
// Site → Central: submit a notification for central delivery (acknowledged via NotificationSubmitAck; safe to re-send until acked).
|
||||
public record NotificationSubmit(
|
||||
string NotificationId, string ListName, string Subject, string Body,
|
||||
string SourceSiteId, string? SourceInstanceId, string? SourceScript,
|
||||
DateTimeOffset SiteEnqueuedAt);
|
||||
|
||||
// Central → Site: ack after the row is persisted (idempotent — safe to re-send).
|
||||
public record NotificationSubmitAck(string NotificationId, bool Accepted, string? Error);
|
||||
|
||||
// Site → Central: query delivery status for a NotificationId.
|
||||
public record NotificationStatusQuery(string CorrelationId, string NotificationId);
|
||||
|
||||
public record NotificationStatusResponse(
|
||||
string CorrelationId, bool Found, string Status,
|
||||
int RetryCount, string? LastError,
|
||||
DateTimeOffset? DeliveredAt);
|
||||
```
|
||||
|
||||
**Step 4 — run green. Step 5 — commit** (`feat(notification-outbox): add site/central notification message contracts`).
|
||||
|
||||
---
|
||||
|
||||
### Task 8: Outbox UI query/action contracts
|
||||
|
||||
**Files:**
|
||||
- Create: `src/ScadaLink.Commons/Messages/Notification/NotificationOutboxQueries.cs`
|
||||
- Test: `tests/ScadaLink.Commons.Tests/Messages/NotificationOutboxQueriesTests.cs`
|
||||
|
||||
**Step 1 — failing test.** Construction test as in Task 7.
|
||||
|
||||
**Step 2 — run red.**
|
||||
|
||||
**Step 3 — implement.** Records the Central UI / `CommunicationService` use to talk to the outbox actor:
|
||||
```csharp
|
||||
public record NotificationOutboxQueryRequest(
|
||||
string CorrelationId, string? StatusFilter, string? TypeFilter, string? SourceSiteFilter,
|
||||
string? ListNameFilter, bool StuckOnly, string? SubjectKeyword,
|
||||
DateTimeOffset? From, DateTimeOffset? To, int PageNumber, int PageSize);
|
||||
|
||||
public record NotificationSummary(
|
||||
string NotificationId, string Type, string ListName, string Subject, string Status,
|
||||
int RetryCount, string? LastError, string SourceSiteId, string? SourceInstanceId,
|
||||
DateTimeOffset CreatedAt, DateTimeOffset? DeliveredAt, bool IsStuck);
|
||||
|
||||
public record NotificationOutboxQueryResponse(
|
||||
string CorrelationId, bool Success, string? ErrorMessage,
|
||||
IReadOnlyList<NotificationSummary> Notifications, int TotalCount);
|
||||
|
||||
public record RetryNotificationRequest(string CorrelationId, string NotificationId);
|
||||
public record RetryNotificationResponse(string CorrelationId, bool Success, string? ErrorMessage);
|
||||
public record DiscardNotificationRequest(string CorrelationId, string NotificationId);
|
||||
public record DiscardNotificationResponse(string CorrelationId, bool Success, string? ErrorMessage);
|
||||
|
||||
public record NotificationKpiRequest(string CorrelationId);
|
||||
public record NotificationKpiResponse(
|
||||
string CorrelationId, int QueueDepth, int StuckCount, int ParkedCount,
|
||||
int DeliveredLastInterval, TimeSpan? OldestPendingAge);
|
||||
```
|
||||
|
||||
**Step 4 — run green. Step 5 — commit** (`feat(notification-outbox): add outbox query and action contracts`).
|
||||
|
||||
---
|
||||
|
||||
## Phase C — NotificationOutbox project + delivery
|
||||
|
||||
### Task 9: Create the `ScadaLink.NotificationOutbox` project
|
||||
|
||||
**Files:**
|
||||
- Create: `src/ScadaLink.NotificationOutbox/ScadaLink.NotificationOutbox.csproj`
|
||||
- Create: `tests/ScadaLink.NotificationOutbox.Tests/ScadaLink.NotificationOutbox.Tests.csproj`
|
||||
- Modify: `ScadaLink.slnx` (add both projects)
|
||||
|
||||
**Step 1 — create the projects.** Copy the `.csproj` shape from `src/ScadaLink.NotificationService/ScadaLink.NotificationService.csproj` (same `TargetFramework`, central-managed package versions via `Directory.Packages.props`). The src project references `ScadaLink.Commons` and Akka packages (`Akka`, `Akka.Cluster.Tools`). The test project mirrors `tests/ScadaLink.NotificationService.Tests/` (xUnit, NSubstitute, `Akka.TestKit.Xunit2`) and references the new src project. Add both `<Project>` entries to `ScadaLink.slnx`.
|
||||
|
||||
**Step 2 — add a placeholder test** so the test project is non-empty:
|
||||
```csharp
|
||||
public class ProjectSmokeTest { [Fact] public void ProjectCompiles() => Assert.True(true); }
|
||||
```
|
||||
|
||||
**Step 3 — verify:** `dotnet build ScadaLink.slnx` succeeds; `dotnet test tests/ScadaLink.NotificationOutbox.Tests/ScadaLink.NotificationOutbox.Tests.csproj` → PASS.
|
||||
|
||||
**Step 4 — commit** (`feat(notification-outbox): scaffold ScadaLink.NotificationOutbox project`).
|
||||
|
||||
---
|
||||
|
||||
### Task 10: `NotificationOutboxOptions`
|
||||
|
||||
**Files:**
|
||||
- Create: `src/ScadaLink.NotificationOutbox/NotificationOutboxOptions.cs`
|
||||
- Test: `tests/ScadaLink.NotificationOutbox.Tests/NotificationOutboxOptionsTests.cs`
|
||||
|
||||
**Step 1 — failing test.** Assert the defaults.
|
||||
|
||||
**Step 2 — run red.**
|
||||
|
||||
**Step 3 — implement.**
|
||||
```csharp
|
||||
public class NotificationOutboxOptions
|
||||
{
|
||||
public TimeSpan DispatchInterval { get; set; } = TimeSpan.FromSeconds(10);
|
||||
public int DispatchBatchSize { get; set; } = 100;
|
||||
public TimeSpan StuckAgeThreshold { get; set; } = TimeSpan.FromMinutes(10);
|
||||
public TimeSpan TerminalRetention { get; set; } = TimeSpan.FromDays(365);
|
||||
public TimeSpan PurgeInterval { get; set; } = TimeSpan.FromDays(1);
|
||||
public TimeSpan DeliveredKpiWindow { get; set; } = TimeSpan.FromMinutes(1);
|
||||
}
|
||||
```
|
||||
|
||||
**Step 4 — run green. Step 5 — commit** (`feat(notification-outbox): add NotificationOutboxOptions`).
|
||||
|
||||
---
|
||||
|
||||
### Task 11: `INotificationDeliveryAdapter` + `DeliveryOutcome`
|
||||
|
||||
**Files:**
|
||||
- Create: `src/ScadaLink.NotificationOutbox/Delivery/INotificationDeliveryAdapter.cs`
|
||||
- Create: `src/ScadaLink.NotificationOutbox/Delivery/DeliveryOutcome.cs`
|
||||
- Test: `tests/ScadaLink.NotificationOutbox.Tests/Delivery/DeliveryOutcomeTests.cs`
|
||||
|
||||
**Step 1 — failing test.** Assert `DeliveryOutcome` factory methods produce the right classification.
|
||||
|
||||
**Step 2 — run red.**
|
||||
|
||||
**Step 3 — implement.** Mirror the External System Gateway error-classification pattern:
|
||||
```csharp
|
||||
public enum DeliveryResult { Success, TransientFailure, PermanentFailure }
|
||||
|
||||
public record DeliveryOutcome(DeliveryResult Result, string? ResolvedTargets, string? Error)
|
||||
{
|
||||
public static DeliveryOutcome Success(string resolvedTargets) => new(DeliveryResult.Success, resolvedTargets, null);
|
||||
public static DeliveryOutcome Transient(string error) => new(DeliveryResult.TransientFailure, null, error);
|
||||
public static DeliveryOutcome Permanent(string error) => new(DeliveryResult.PermanentFailure, null, error);
|
||||
}
|
||||
|
||||
public interface INotificationDeliveryAdapter
|
||||
{
|
||||
NotificationType Type { get; }
|
||||
Task<DeliveryOutcome> DeliverAsync(Notification notification, CancellationToken ct = default);
|
||||
}
|
||||
```
|
||||
|
||||
**Step 4 — run green. Step 5 — commit** (`feat(notification-outbox): add delivery adapter abstraction`).
|
||||
|
||||
---
|
||||
|
||||
### Task 12: `EmailNotificationDeliveryAdapter`
|
||||
|
||||
**Files:**
|
||||
- Create: `src/ScadaLink.NotificationOutbox/Delivery/EmailNotificationDeliveryAdapter.cs`
|
||||
- Modify: `src/ScadaLink.NotificationOutbox/ScadaLink.NotificationOutbox.csproj` (reference `ScadaLink.NotificationService` for `ISmtpClientWrapper`)
|
||||
- Test: `tests/ScadaLink.NotificationOutbox.Tests/Delivery/EmailNotificationDeliveryAdapterTests.cs`
|
||||
|
||||
**Step 1 — failing tests.** Using an NSubstitute mock of `INotificationRepository` (supplying the notification list, its recipients, and the SMTP configuration the adapter resolves at delivery time) and a substituted `ISmtpClientWrapper`:
|
||||
- list resolved + send succeeds → `DeliveryResult.Success`, `ResolvedTargets` lists the recipient addresses.
|
||||
- list not found / no recipients → `PermanentFailure`.
|
||||
- SMTP throws `SmtpPermanentException` → `PermanentFailure`.
|
||||
- SMTP throws a transient error (socket/timeout) → `TransientFailure`.
|
||||
|
||||
**Step 2 — run red.**
|
||||
|
||||
**Step 3 — implement.** The adapter resolves the list + recipients + SMTP config from `INotificationRepository` (the existing notification-list repo — recipients are resolved centrally at delivery time), composes and sends via the existing `ISmtpClientWrapper` (`Func<ISmtpClientWrapper>` injected, same as `NotificationService`), classifies errors identically to `NotificationDeliveryService`. Reuse the SMTP composition logic from `src/ScadaLink.NotificationService/NotificationDeliveryService.cs` (BCC delivery, plain text, address validation, the `SmtpPermanentException` → permanent mapping). On success return `DeliveryOutcome.Success(<comma-joined recipient addresses>)`. `Type => NotificationType.Email`.
|
||||
|
||||
**Step 4 — run green. Step 5 — commit** (`feat(notification-outbox): add Email delivery adapter`).
|
||||
|
||||
---
|
||||
|
||||
### Task 13: `NotificationOutboxActor` — ingest
|
||||
|
||||
**Files:**
|
||||
- Create: `src/ScadaLink.NotificationOutbox/NotificationOutboxActor.cs`
|
||||
- Create: `src/ScadaLink.NotificationOutbox/Messages/InternalMessages.cs` (actor-internal tick messages)
|
||||
- Test: `tests/ScadaLink.NotificationOutbox.Tests/NotificationOutboxActorIngestTests.cs`
|
||||
|
||||
**Step 1 — failing tests** (TestKit). The actor takes `IServiceProvider`, `NotificationOutboxOptions`, `ILogger`. Use a mocked `INotificationOutboxRepository` registered in the test `ServiceProvider`:
|
||||
- Send `NotificationSubmit` → actor calls `InsertIfNotExistsAsync` with a `Notification` whose fields map from the message, `Status = Pending`, `CreatedAt` set; replies `NotificationSubmitAck(NotificationId, Accepted: true, null)` to `Sender`.
|
||||
- Send the same `NotificationSubmit` twice → second `InsertIfNotExistsAsync` returns false; actor still replies `Accepted: true` (idempotent — the row already exists, ack so the site clears its buffer).
|
||||
- Repository throws → actor replies `Accepted: false` with the error (site will retry the forward).
|
||||
|
||||
**Step 2 — run red.**
|
||||
|
||||
**Step 3 — implement.** `ReceiveActor`. On `NotificationSubmit`: build a `Notification`, `CreateScope()` to resolve `INotificationOutboxRepository`, and call `InsertIfNotExistsAsync`. Capture `Sender` in a local before starting the async call (it is not valid inside the continuation) and `PipeTo` the result so the `NotificationSubmitAck` reply reaches the original sender. Keep dispatch (Task 14) out of this task — ingest only.
|
||||
|
||||
**Step 4 — run green. Step 5 — commit** (`feat(notification-outbox): add NotificationOutboxActor ingest`).
|
||||
|
||||
---
|
||||
|
||||
### Task 14: `NotificationOutboxActor` — dispatcher loop
|
||||
|
||||
**Files:**
|
||||
- Modify: `src/ScadaLink.NotificationOutbox/NotificationOutboxActor.cs`
|
||||
- Test: `tests/ScadaLink.NotificationOutbox.Tests/NotificationOutboxActorDispatchTests.cs`
|
||||
|
||||
**Step 1 — failing tests** (TestKit, with a registered set of `INotificationDeliveryAdapter` and a mocked repo):
|
||||
- On a `DispatchTick`, the actor calls `GetDueAsync`, and for each row invokes the adapter for its `Type`.
|
||||
- adapter `Success` → row updated to `Delivered`, `DeliveredAt`/`ResolvedTargets`/`LastAttemptAt` set, `UpdateAsync` called.
|
||||
- adapter `TransientFailure` → `Retrying`, `RetryCount` incremented, `NextAttemptAt = now + retry interval`, `LastError` set.
|
||||
- adapter `TransientFailure` when `RetryCount` already at the SMTP-config max → `Parked`.
|
||||
- adapter `PermanentFailure` → `Parked` immediately, `LastError` set.
|
||||
- no adapter for the row's `Type` → `Parked` with an explanatory error.
|
||||
|
||||
**Step 2 — run red.**
|
||||
|
||||
**Step 3 — implement.** `IWithTimers`; in `PreStart` start a periodic `DispatchTick` every `options.DispatchInterval`. On `DispatchTick`: scope-resolve the repo, `GetDueAsync(now, options.DispatchBatchSize)`, and for each notification resolve the adapter from the injected `IReadOnlyDictionary<NotificationType, INotificationDeliveryAdapter>` (the same type registered in Task 17), `await DeliverAsync`, apply the status transition, `UpdateAsync`. Retry count/interval come from the central SMTP config (`SmtpConfiguration.MaxRetries` / `RetryDelay` via `INotificationRepository`). Run delivery on a non-blocking path (the actor `PipeTo`s the async work; do not block the actor thread). Guard against overlapping ticks (ignore a new tick while one is in flight).
|
||||
|
||||
**Step 4 — run green. Step 5 — commit** (`feat(notification-outbox): add dispatcher loop to NotificationOutboxActor`).
|
||||
|
||||
---
|
||||
|
||||
### Task 15: `NotificationOutboxActor` — query, retry, discard, KPIs
|
||||
|
||||
**Files:**
|
||||
- Modify: `src/ScadaLink.NotificationOutbox/NotificationOutboxActor.cs`
|
||||
- Test: `tests/ScadaLink.NotificationOutbox.Tests/NotificationOutboxActorQueryTests.cs`
|
||||
|
||||
**Step 1 — failing tests** (TestKit):
|
||||
- `NotificationOutboxQueryRequest` → actor calls `QueryAsync`, replies `NotificationOutboxQueryResponse` with mapped `NotificationSummary` rows; `IsStuck` true when `Status` is `Pending`/`Retrying` and `CreatedAt` older than `options.StuckAgeThreshold`.
|
||||
- `NotificationStatusQuery` → replies `NotificationStatusResponse` (`Found:false` when the id is unknown).
|
||||
- `RetryNotificationRequest` on a `Parked` row → row reset to `Pending`, `RetryCount` 0, `NextAttemptAt` cleared; replies success. On a non-`Parked` row → `Success:false`.
|
||||
- `DiscardNotificationRequest` on a `Parked` row → `Status = Discarded`; replies success.
|
||||
- `NotificationKpiRequest` → replies `NotificationKpiResponse` from `ComputeKpisAsync` (stuck cutoff = now − `StuckAgeThreshold`; delivered window = now − `DeliveredKpiWindow`).
|
||||
|
||||
**Step 2 — run red.**
|
||||
|
||||
**Step 3 — implement** the additional `Receive<>` handlers, each scope-resolving the repo and `PipeTo`-ing the reply.
|
||||
|
||||
**Step 4 — run green. Step 5 — commit** (`feat(notification-outbox): add query, retry, discard, and KPI handlers`).
|
||||
|
||||
---
|
||||
|
||||
### Task 16: Daily purge job
|
||||
|
||||
**Files:**
|
||||
- Modify: `src/ScadaLink.NotificationOutbox/NotificationOutboxActor.cs`
|
||||
- Test: `tests/ScadaLink.NotificationOutbox.Tests/NotificationOutboxActorPurgeTests.cs`
|
||||
|
||||
**Step 1 — failing test.** On a `PurgeTick`, the actor calls `DeleteTerminalOlderThanAsync(now − options.TerminalRetention)`.
|
||||
|
||||
**Step 2 — run red.**
|
||||
|
||||
**Step 3 — implement.** In `PreStart` start a second periodic timer `PurgeTick` every `options.PurgeInterval`. Handler scope-resolves the repo and calls `DeleteTerminalOlderThanAsync`; log the deleted count.
|
||||
|
||||
**Step 4 — run green. Step 5 — commit** (`feat(notification-outbox): add daily terminal-row purge`).
|
||||
|
||||
---
|
||||
|
||||
### Task 17: `AddNotificationOutbox` DI extension
|
||||
|
||||
**Files:**
|
||||
- Create: `src/ScadaLink.NotificationOutbox/ServiceCollectionExtensions.cs`
|
||||
- Test: `tests/ScadaLink.NotificationOutbox.Tests/ServiceRegistrationTests.cs`
|
||||
|
||||
**Step 1 — failing test.** Build a `ServiceCollection`, call `AddNotificationOutbox`, and assert `NotificationOutboxOptions`, the `EmailNotificationDeliveryAdapter`, and the adapter dictionary resolve.
|
||||
|
||||
**Step 2 — run red.**
|
||||
|
||||
**Step 3 — implement.** `public const string OptionsSection = "ScadaLink:NotificationOutbox";` plus `AddNotificationOutbox(this IServiceCollection)` registering `AddOptions<NotificationOutboxOptions>().BindConfiguration(OptionsSection)`, the SMTP client `Func<ISmtpClientWrapper>` (reuse `NotificationService`'s registration or register here), `EmailNotificationDeliveryAdapter`, and a registration that exposes `IReadOnlyDictionary<NotificationType, INotificationDeliveryAdapter>` built from all registered adapters.
|
||||
|
||||
**Step 4 — run green. Step 5 — commit** (`feat(notification-outbox): add AddNotificationOutbox DI registration`).
|
||||
|
||||
---
|
||||
|
||||
## Phase D — Site retarget + central wiring
|
||||
|
||||
### Task 18: Retarget the site S&F notification handler to forward to central
|
||||
|
||||
**Files:**
|
||||
- Modify: `src/ScadaLink.StoreAndForward/StoreAndForwardService.cs` and/or the site registration that wires the `Notification` category delivery handler
|
||||
- Modify: `src/ScadaLink.Host/SiteServiceRegistration.cs` (where the notification handler is registered)
|
||||
- Test: `tests/ScadaLink.StoreAndForward.Tests/` (a test that the registered notification handler forwards to the communication actor and treats an ack as success)
|
||||
|
||||
**Step 1 — investigate + failing test.** Currently the `Notification` category handler calls `NotificationDeliveryService.DeliverBufferedAsync`. The new handler must instead send a `NotificationSubmit` to central via the site's communication actor (`ClusterClient.Send("/user/central-communication", submit)`) and treat a `NotificationSubmitAck(Accepted:true)` as delivered (`true`), a non-ack/timeout as transient (throw), so S&F retries the forward. Write a test with a `TestProbe` standing in for the central client: handler invoked → probe receives `NotificationSubmit`; reply `NotificationSubmitAck(Accepted:true)` → handler result `true`; timeout → handler throws (transient).
|
||||
|
||||
**Step 2 — run red.**
|
||||
|
||||
**Step 3 — implement.** Add a `NotificationForwarder` (small class or the handler lambda) that holds the site communication actor ref and does `Ask<NotificationSubmitAck>` with the host-configured forward-retry timeout. Register it as the `StoreAndForwardCategory.Notification` delivery handler in `SiteServiceRegistration`, replacing the `NotificationDeliveryService` handler. The S&F engine already buffers/retries on a thrown (transient) result — no S&F core change needed.
|
||||
|
||||
**Step 4 — run green. Step 5 — commit** (`feat(notification-outbox): forward site S&F notifications to central`).
|
||||
|
||||
---
|
||||
|
||||
### Task 19: `Notify.Send` async + `Notify.Status` (SiteRuntime)
|
||||
|
||||
**Files:**
|
||||
- Modify: `src/ScadaLink.SiteRuntime/Scripts/ScriptRuntimeContext.cs` (`NotifyHelper`, `NotifyTarget`)
|
||||
- Test: `tests/ScadaLink.SiteRuntime.Tests/` (Notify API tests)
|
||||
|
||||
**Step 1 — failing tests.**
|
||||
- `Notify.To("list").Send("subj","body")` generates a GUID `NotificationId`, enqueues a `StoreAndForwardCategory.Notification` message into `StoreAndForwardService` (target `"central"`, payload = serialized `NotificationSubmit`), and returns the `NotificationId` string immediately.
|
||||
- `Notify.Status(id)` issues a `NotificationStatusQuery` to central and returns the mapped status record; while the notification is still in the site S&F buffer (central has no row / query says `Found:false` but the S&F buffer still holds the id) it reports `Forwarding`.
|
||||
|
||||
**Step 2 — run red.**
|
||||
|
||||
**Step 3 — implement.** Change `NotifyTarget.Send` to return `Task<string>` (the `NotificationId`): create the GUID, build a `NotificationSubmit` (with `SourceSiteId`, `SourceInstanceId = _instanceName`, `SiteEnqueuedAt = UtcNow`), `EnqueueAsync(Notification, "central", payloadJson)`. Add `NotifyHelper.Status(string notificationId)` returning a status record: query central via the site communication actor; if central returns `Found:false` and the id is still buffered in S&F, return status `Forwarding`. Keep the script-facing surface minimal (`Send`, `Status`).
|
||||
|
||||
**Step 3 note:** the `Notify` API is consumed by compiled scripts — confirm the script trust model / compilation still accepts the changed signature; update any script-API surface tests.
|
||||
|
||||
**Step 4 — run green. Step 5 — commit** (`feat(notification-outbox): async Notify.Send with status handle`).
|
||||
|
||||
---
|
||||
|
||||
### Task 20: Central ingest routing
|
||||
|
||||
**Files:**
|
||||
- Modify: `src/ScadaLink.Communication/Actors/CentralCommunicationActor.cs`
|
||||
- Test: `tests/ScadaLink.Communication.Tests/CentralCommunicationActorTests.cs`
|
||||
|
||||
**Step 1 — failing test.** When `CentralCommunicationActor` receives a `NotificationSubmit` (sent site→central via ClusterClient to `/user/central-communication`), it forwards it to the notification-outbox singleton proxy and the ack flows back to the original `Sender`. Use a `TestProbe` for the outbox proxy.
|
||||
|
||||
**Step 2 — run red.**
|
||||
|
||||
**Step 3 — implement.** `CentralCommunicationActor` takes an optional outbox-proxy `IActorRef` (passed at construction by the Host, Task 21). `Receive<NotificationSubmit>(m => _outboxProxy.Forward(m))` — `Forward` preserves the original sender so the `NotificationSubmitAck` returns to the site's ClusterClient.
|
||||
|
||||
**Step 4 — run green. Step 5 — commit** (`feat(notification-outbox): route NotificationSubmit to the outbox actor`).
|
||||
|
||||
---
|
||||
|
||||
### Task 21: Host registration + appsettings
|
||||
|
||||
**Files:**
|
||||
- Modify: `src/ScadaLink.Host/Actors/AkkaHostedService.cs` (`RegisterCentralActors`)
|
||||
- Modify: `src/ScadaLink.Host/Program.cs` (call `AddNotificationOutbox`; `Configure<NotificationOutboxOptions>`)
|
||||
- Modify: `src/ScadaLink.Host/appsettings.Central.json` (`ScadaLink:NotificationOutbox` section)
|
||||
- Modify: `src/ScadaLink.Host/appsettings.Site.json` (site→central notification forward-retry interval, if not already covered by S&F config)
|
||||
- Test: `tests/ScadaLink.Host.Tests/` if present, else verify via build + the integration test in Task 25
|
||||
|
||||
**Step 1 — implement.** In `RegisterCentralActors`: create the `NotificationOutboxActor` as a **cluster singleton** (`ClusterSingletonManager.Props` + `ClusterSingletonProxy.Props`, singleton name `"notification-outbox"`; no explicit role is needed because `RegisterCentralActors` runs only on central nodes), passing `IServiceProvider`, `NotificationOutboxOptions`, the adapter dictionary, and a logger. Pass the singleton **proxy** ref into `CentralCommunicationActor`'s `Props.Create`. In `Program.cs` central path, call `builder.Services.AddNotificationOutbox()` and `services.Configure<NotificationOutboxOptions>(...GetSection(ServiceCollectionExtensions.OptionsSection))`. Add the `ScadaLink:NotificationOutbox` block to `appsettings.Central.json` with the Task 10 defaults.
|
||||
|
||||
**Step 2 — verify:** `dotnet build ScadaLink.slnx` succeeds.
|
||||
|
||||
**Step 3 — commit** (`feat(notification-outbox): register NotificationOutbox singleton in Host`).
|
||||
|
||||
---
|
||||
|
||||
## Phase E — Central UI
|
||||
|
||||
### Task 22: `CommunicationService` outbox methods
|
||||
|
||||
**Files:**
|
||||
- Modify: `src/ScadaLink.Communication/CommunicationService.cs`
|
||||
- Test: `tests/ScadaLink.Communication.Tests/CommunicationServiceTests.cs` (or the existing service test file)
|
||||
|
||||
**Step 1 — failing tests.** New methods `QueryNotificationOutboxAsync`, `RetryNotificationAsync`, `DiscardNotificationAsync`, `GetNotificationKpisAsync` each `Ask` the central outbox proxy and return the typed response. (These are central-side and do not go through `SiteEnvelope` — they talk to the local outbox proxy directly.) Test with a `TestProbe` for the proxy.
|
||||
|
||||
**Step 2 — run red.**
|
||||
|
||||
**Step 3 — implement.** Add an outbox-proxy `IActorRef` to `CommunicationService` (set by the Host like `SetCommunicationActor`). Each method `Ask<TResponse>(request, _options.QueryTimeout)`.
|
||||
|
||||
**Step 4 — run green. Step 5 — commit** (`feat(notification-outbox): add CommunicationService outbox methods`).
|
||||
|
||||
---
|
||||
|
||||
### Task 23: Notification Outbox Blazor page + nav entry
|
||||
|
||||
**Files:**
|
||||
- Create: `src/ScadaLink.CentralUI/Components/Pages/Monitoring/NotificationOutbox.razor`
|
||||
- Modify: `src/ScadaLink.CentralUI/Components/Layout/NavMenu.razor`
|
||||
- Test: `tests/ScadaLink.CentralUI.Tests/Pages/NotificationOutboxPageTests.cs` (bUnit)
|
||||
|
||||
**Step 1 — failing test** (bUnit). Render the page with a substituted `CommunicationService` returning a fixed KPI response and a page of `NotificationSummary` rows; assert the KPI tiles show the values and the table renders the rows; assert clicking Retry on a `Parked` row calls `RetryNotificationAsync`.
|
||||
|
||||
**Step 2 — run red.**
|
||||
|
||||
**Step 3 — implement.** Model the page on `Components/Pages/Monitoring/ParkedMessages.razor`: `@page "/monitoring/notification-outbox"`, `@attribute [Authorize(Policy = AuthorizationPolicies.RequireDeployment)]`. KPI tile row (Bootstrap `card` tiles like `Health.razor`) bound to `GetNotificationKpisAsync`; a filter card (status, type, source site, list, time range, stuck-only toggle, subject keyword); a table of `NotificationSummary` with stuck rows badged; Retry/Discard buttons on `Parked` rows using `IDialogService.ConfirmAsync` + `ToastNotification`. Add a `NavLink` to `NavMenu.razor` under the Deployment-role Monitoring section (`href="/monitoring/notification-outbox"`).
|
||||
|
||||
**Step 4 — run green. Step 5 — commit** (`feat(notification-outbox): add Notification Outbox UI page`).
|
||||
|
||||
---
|
||||
|
||||
### Task 24: Health dashboard outbox KPI tiles
|
||||
|
||||
**Files:**
|
||||
- Modify: `src/ScadaLink.CentralUI/Components/Pages/Monitoring/Health.razor`
|
||||
- Test: `tests/ScadaLink.CentralUI.Tests/Pages/HealthPageTests.cs` (extend if present)
|
||||
|
||||
**Step 1 — failing test** (bUnit). With a substituted `CommunicationService.GetNotificationKpisAsync`, the Health page renders three headline outbox tiles: queue depth, stuck count, parked count.
|
||||
|
||||
**Step 2 — run red.**
|
||||
|
||||
**Step 3 — implement.** Add a "Notification Outbox" tile row to `Health.razor`, fetched on init / on the existing 10s polling timer, styled like the existing overview cards.
|
||||
|
||||
**Step 4 — run green. Step 5 — commit** (`feat(notification-outbox): add outbox KPI tiles to Health dashboard`).
|
||||
|
||||
---
|
||||
|
||||
## Phase F — Integration & verification
|
||||
|
||||
### Task 25: End-to-end integration test
|
||||
|
||||
**Files:**
|
||||
- Create: `tests/ScadaLink.IntegrationTests/NotificationOutboxFlowTests.cs`
|
||||
|
||||
**Step 1 — failing test.** Following the patterns in `tests/ScadaLink.IntegrationTests/`, exercise the flow with an in-memory/SQLite-backed `ScadaLinkDbContext` and a real `NotificationOutboxActor`: submit a `NotificationSubmit` → assert a `Notifications` row exists (`Pending`) → trigger a `DispatchTick` with a stub adapter that returns `Success` → assert the row is `Delivered`. Add a second case: stub adapter returns `PermanentFailure` → row `Parked`; then a `RetryNotificationRequest` → row back to `Pending`.
|
||||
|
||||
**Step 2 — run red. Step 3 — make it pass** (it should, if Phases A–D are correct; fix any wiring gaps found). **Step 4 — commit** (`test(notification-outbox): end-to-end outbox flow integration test`).
|
||||
|
||||
---
|
||||
|
||||
### Task 26: Full build + suite verification
|
||||
|
||||
**Files:** none (verification only).
|
||||
|
||||
**Step 1:** `dotnet build ScadaLink.slnx` → must succeed with no errors.
|
||||
|
||||
**Step 2:** `dotnet test ScadaLink.slnx` → the whole suite must pass. Investigate and fix any regressions (notably in `ScadaLink.NotificationService.Tests`, `ScadaLink.StoreAndForward.Tests`, `ScadaLink.SiteRuntime.Tests`, `ScadaLink.Communication.Tests` — the docs/design changed the notification path and existing tests may assert old behavior; update them to the new design).
|
||||
|
||||
**Step 3:** If the docker cluster is used for smoke testing, note that `bash docker/deploy.sh` rebuilds the image — out of scope for this plan unless the user asks.
|
||||
|
||||
**Step 4 — commit** any test fixes (`test(notification-outbox): update existing tests for the central-delivery model`).
|
||||
|
||||
---
|
||||
|
||||
## Follow-ups (post-merge, not blocking)
|
||||
|
||||
- **Remove the now-dead site-side `AddNotificationService()` (from Task 19 review).** After Task 19, the site script runtime no longer resolves `INotificationDeliveryService` (it enqueues into the Store-and-Forward engine instead). `src/ScadaLink.Host/SiteServiceRegistration.cs` still calls `AddNotificationService()`. Task 21 (Host registration) should drop it from the site path — `NotificationService` is now central-only.
|
||||
- **Re-align the Central UI script sandbox `Notify` API (from Task 19 review).** `SandboxNotifyTarget.Send` in `src/ScadaLink.CentralUI/ScriptAnalysis/` still returns `Task<NotificationResult>` and has no `Status` method, whereas the production `NotifyTarget.Send` now returns `Task<string>` and the production API also exposes `Notify.Status`. A script that test-runs cleanly in the sandbox would not compile against the real runtime. The sandbox `Notify` surface should be rewritten to match production so the test-run feature stays faithful.
|
||||
- **Populate `SourceScript` on outbound notifications (from Task 19 review).** `NotifyTarget.Send` currently passes `SourceScript: null` — the executing script name is not threaded down to the `NotifyHelper`. The payload field and the forwarder already carry it end to end; only the enqueue side needs the wiring.
|
||||
- **Share the SMTP helpers (from Task 12 review).** `EmailNotificationDeliveryAdapter` reimplements `ClassifySmtpError`/`SmtpErrorClass`, `ValidateAddresses`, and a `ScrubCredentials` helper because the originals are `internal` to `ScadaLink.NotificationService`. To avoid divergence (especially in the security-relevant credential redaction and the SMTP 4xx/5xx classification policy), promote `CredentialRedactor` to `public`, extract a `public static SmtpErrorClassifier`, and make `ValidateAddresses` shared — then have the adapter call them and delete the duplicates. The project reference already exists, so this is low-cost.
|
||||
|
||||
## Done
|
||||
|
||||
The Notification Outbox feature is implemented end to end: site scripts enqueue notifications that store-and-forward to central, the `NotificationOutboxActor` singleton ingests them into the `Notifications` table and delivers them via the Email adapter with retry/parking, operators see KPIs and manage notifications from the Central UI, and the full test suite passes. Teams and other delivery adapters can be added later by implementing `INotificationDeliveryAdapter` and registering it — no other change required.
|
||||
@@ -0,0 +1,32 @@
|
||||
{
|
||||
"planPath": "docs/plans/2026-05-19-notification-outbox-implementation.md",
|
||||
"tasks": [
|
||||
{"id": 18, "subject": "Task 1: Notification enums", "status": "pending"},
|
||||
{"id": 19, "subject": "Task 2: Notification entity POCO", "status": "pending", "blockedBy": [18]},
|
||||
{"id": 20, "subject": "Task 3: Type field on NotificationList", "status": "pending", "blockedBy": [19]},
|
||||
{"id": 21, "subject": "Task 4: Notification EF configuration + DbSet", "status": "pending", "blockedBy": [20]},
|
||||
{"id": 22, "subject": "Task 5: NotificationOutbox repository", "status": "pending", "blockedBy": [21]},
|
||||
{"id": 23, "subject": "Task 6: EF migration AddNotificationsTable", "status": "pending", "blockedBy": [22]},
|
||||
{"id": 24, "subject": "Task 7: Site/central notification message contracts", "status": "pending", "blockedBy": [23]},
|
||||
{"id": 25, "subject": "Task 8: Outbox query/action contracts", "status": "pending", "blockedBy": [24]},
|
||||
{"id": 26, "subject": "Task 9: Scaffold ScadaLink.NotificationOutbox project", "status": "pending", "blockedBy": [25]},
|
||||
{"id": 27, "subject": "Task 10: NotificationOutboxOptions", "status": "pending", "blockedBy": [26]},
|
||||
{"id": 28, "subject": "Task 11: Delivery adapter abstraction", "status": "pending", "blockedBy": [27]},
|
||||
{"id": 29, "subject": "Task 12: Email delivery adapter", "status": "pending", "blockedBy": [28]},
|
||||
{"id": 30, "subject": "Task 13: NotificationOutboxActor ingest", "status": "pending", "blockedBy": [29]},
|
||||
{"id": 31, "subject": "Task 14: Dispatcher loop", "status": "pending", "blockedBy": [30]},
|
||||
{"id": 32, "subject": "Task 15: Query, retry, discard, KPI handlers", "status": "pending", "blockedBy": [31]},
|
||||
{"id": 33, "subject": "Task 16: Daily purge job", "status": "pending", "blockedBy": [32]},
|
||||
{"id": 34, "subject": "Task 17: AddNotificationOutbox DI extension", "status": "pending", "blockedBy": [33]},
|
||||
{"id": 35, "subject": "Task 18: Retarget site S&F notification handler to central", "status": "pending", "blockedBy": [34]},
|
||||
{"id": 36, "subject": "Task 19: Async Notify.Send + Notify.Status", "status": "pending", "blockedBy": [35]},
|
||||
{"id": 37, "subject": "Task 20: Central ingest routing", "status": "pending", "blockedBy": [36]},
|
||||
{"id": 38, "subject": "Task 21: Host registration + appsettings", "status": "pending", "blockedBy": [37]},
|
||||
{"id": 39, "subject": "Task 22: CommunicationService outbox methods", "status": "pending", "blockedBy": [38]},
|
||||
{"id": 40, "subject": "Task 23: Notification Outbox Blazor page", "status": "pending", "blockedBy": [39]},
|
||||
{"id": 41, "subject": "Task 24: Health dashboard outbox KPI tiles", "status": "pending", "blockedBy": [40]},
|
||||
{"id": 42, "subject": "Task 25: End-to-end integration test", "status": "pending", "blockedBy": [41]},
|
||||
{"id": 43, "subject": "Task 26: Full build + suite verification", "status": "pending", "blockedBy": [42]}
|
||||
],
|
||||
"lastUpdated": "2026-05-19"
|
||||
}
|
||||
@@ -0,0 +1,143 @@
|
||||
# Notifications Nav Group — Design
|
||||
|
||||
**Date:** 2026-05-19
|
||||
|
||||
**Goal:** Consolidate all notification-related Central UI pages into a dedicated
|
||||
**Notifications** left-menu section, split the combined Outbox page into a report
|
||||
and a KPIs page, give Notification Lists a proper home, and add a per-source-site
|
||||
KPI breakdown.
|
||||
|
||||
## Background
|
||||
|
||||
Notification-related UI is currently scattered:
|
||||
|
||||
| Page | Route | Nav section | Policy |
|
||||
|---|---|---|---|
|
||||
| SMTP Configuration | `/admin/smtp` | Admin | RequireAdmin |
|
||||
| Notification Outbox (KPI tiles **+** filterable table) | `/monitoring/notification-outbox` | Monitoring | RequireDeployment |
|
||||
| Notification Lists | `/design/notification-lists/...` (form only) | none — table embedded in the External Systems page | RequireDesign |
|
||||
|
||||
The Outbox page mixes KPI tiles and the filterable `Notifications`-table report on
|
||||
one page. Notification Lists has no list page of its own — its table is bolted
|
||||
onto `ExternalSystems.razor`. KPI infrastructure
|
||||
(`NotificationKpiRequest`/`Response`, `INotificationOutboxRepository.ComputeKpisAsync`)
|
||||
is global-only, despite CLAUDE.md stating KPIs are "global + per-source-site".
|
||||
|
||||
## Architecture
|
||||
|
||||
A new **Notifications** left-menu section consolidates these pages. Routes move
|
||||
to a consistent `/notifications/*` prefix. The combined Outbox page is split into
|
||||
two. Notification Lists gets a dedicated page. A bounded backend addition supplies
|
||||
per-source-site KPIs. No actor topology, persistence, or message-evolution rules
|
||||
change beyond the additive KPI contracts.
|
||||
|
||||
## 1. Nav menu
|
||||
|
||||
New `Notifications` section in `NavMenu.razor`, placed **between Deployment and
|
||||
Monitoring**. Final section order: Dashboard, Admin, Design, Deployment,
|
||||
Notifications, Monitoring, Audit Log.
|
||||
|
||||
| Menu item | Route | Policy |
|
||||
|---|---|---|
|
||||
| SMTP Configuration | `/notifications/smtp` | RequireAdmin |
|
||||
| Notification Lists | `/notifications/lists` | RequireDesign |
|
||||
| Notification Report | `/notifications/report` | RequireDeployment |
|
||||
| Notification KPIs | `/notifications/kpis` | RequireDeployment |
|
||||
|
||||
Each item is wrapped in its own per-item `AuthorizeView` policy (same pattern the
|
||||
Monitoring section already uses for its mixed-role items). The section header is a
|
||||
plain `div` — every authenticated user holds at least one of Admin/Design/Deployment,
|
||||
so the header always has ≥1 visible child and cannot be orphaned.
|
||||
|
||||
SMTP Configuration is **removed** from the Admin section; Notification Outbox is
|
||||
**removed** from the Monitoring section.
|
||||
|
||||
## 2. SMTP Configuration
|
||||
|
||||
Move `Components/Pages/Admin/SmtpConfiguration.razor` →
|
||||
`Components/Pages/Notifications/SmtpConfiguration.razor`. Route `/admin/smtp` →
|
||||
`/notifications/smtp`. Page content, `RequireAdmin` policy, and the
|
||||
`SmtpConfiguration` namespace alias are unchanged.
|
||||
|
||||
## 3. Notification Lists (new page)
|
||||
|
||||
New `Components/Pages/Notifications/NotificationLists.razor`
|
||||
(`/notifications/lists`, RequireDesign): a `DataTable` of notification lists with
|
||||
Add and per-row Edit actions, plus an empty state — extracted verbatim from the
|
||||
notification-lists block currently in `ExternalSystems.razor`.
|
||||
|
||||
- `NotificationListForm.razor` routes move:
|
||||
`/design/notification-lists/create` → `/notifications/lists/create`,
|
||||
`/design/notification-lists/{Id:int}/edit` → `/notifications/lists/{Id:int}/edit`.
|
||||
Its "Back" navigation targets `/notifications/lists`.
|
||||
- The notification-lists section is **removed** from `ExternalSystems.razor`,
|
||||
leaving that page purely external systems. The three `/design/notification-lists/...`
|
||||
navigation links in `ExternalSystems.razor` are removed with it.
|
||||
|
||||
## 4. Notification Report
|
||||
|
||||
New `Components/Pages/Notifications/NotificationReport.razor`
|
||||
(`/notifications/report`, RequireDeployment), split from the existing
|
||||
`Monitoring/NotificationOutbox.razor`. Retains the full filter bar, the paginated
|
||||
`Notifications`-table query (`NotificationOutboxQueryRequest`), and the per-row
|
||||
Retry/Discard actions. The **KPI tile row is removed** from this page.
|
||||
|
||||
`Components/Pages/Monitoring/NotificationOutbox.razor` and its Monitoring nav entry
|
||||
are **deleted**.
|
||||
|
||||
## 5. Notification KPIs
|
||||
|
||||
New `Components/Pages/Notifications/NotificationKpis.razor`
|
||||
(`/notifications/kpis`, RequireDeployment) with a manual Refresh button. Two parts:
|
||||
|
||||
1. **Global tiles** — the existing 5: Queue Depth, Stuck, Parked, Delivered Last
|
||||
Interval, Oldest Pending Age.
|
||||
2. **Per-source-site breakdown table** — one row per site with the same five
|
||||
metrics, so operators can see which site is backing up.
|
||||
|
||||
### Backend addition for per-site KPIs
|
||||
|
||||
Bounded, additive, follows the existing global-KPI pattern:
|
||||
|
||||
- `INotificationOutboxRepository.ComputePerSiteKpisAsync(...)` → returns a
|
||||
per-site collection (a new `SiteNotificationKpiSnapshot` record carrying the
|
||||
source site id plus the five metrics). Implemented in
|
||||
`NotificationOutboxRepository`.
|
||||
- New message pair in `Messages/Notification/NotificationOutboxQueries.cs`:
|
||||
`PerSiteNotificationKpiRequest` / `PerSiteNotificationKpiResponse` (additive —
|
||||
honors message-evolution rules).
|
||||
- A handler in `NotificationOutboxActor` for the new request, mirroring the
|
||||
existing `NotificationKpiRequest` handler.
|
||||
- A `CommunicationService.GetPerSiteNotificationKpisAsync(...)` method mirroring
|
||||
`GetNotificationKpisAsync`.
|
||||
|
||||
Per CLAUDE.md, KPIs remain point-in-time computed from the `Notifications` table —
|
||||
no time-series store, no historical charts (YAGNI).
|
||||
|
||||
## 6. Health dashboard
|
||||
|
||||
`Monitoring/Health.razor` keeps its KPI tile row unchanged. A "View details →"
|
||||
link is added from that tile row to `/notifications/kpis`.
|
||||
|
||||
## Error handling
|
||||
|
||||
Unchanged from the current Outbox page: KPI/query faults surface as an inline
|
||||
warning alert (`Success == false` → `ErrorMessage`); the site-name lookup degrades
|
||||
gracefully to raw site ids. Per-site KPI faults are reported the same way.
|
||||
|
||||
## Testing
|
||||
|
||||
- bUnit component tests for `NotificationLists`, `NotificationReport`,
|
||||
`NotificationKpis`, and the moved `SmtpConfiguration` page.
|
||||
- A `NavMenu` test asserting the Notifications section renders and that per-item
|
||||
visibility honors Admin/Design/Deployment roles.
|
||||
- Repository tests for `ComputePerSiteKpisAsync`.
|
||||
- Actor test for the `PerSiteNotificationKpiRequest` handler.
|
||||
- `CommunicationService` test for `GetPerSiteNotificationKpisAsync`.
|
||||
|
||||
## Out of scope
|
||||
|
||||
- Historical/trend KPI charts (no time-series store).
|
||||
- Any change to notification delivery, store-and-forward, or the `Notifications`
|
||||
table schema.
|
||||
- Renaming the Notification Outbox **component** (#21) — only the UI page names change.
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,19 @@
|
||||
{
|
||||
"planPath": "docs/plans/2026-05-19-notifications-nav-group.md",
|
||||
"tasks": [
|
||||
{"id": 53, "subject": "Task 1: Per-site KPI domain type + repo contract", "status": "completed"},
|
||||
{"id": 54, "subject": "Task 2: ComputePerSiteKpisAsync repository impl", "status": "completed", "blockedBy": [53]},
|
||||
{"id": 55, "subject": "Task 3: Per-site KPI message contracts", "status": "completed", "blockedBy": [53]},
|
||||
{"id": 56, "subject": "Task 4: Actor per-site KPI handler", "status": "completed", "blockedBy": [54, 55]},
|
||||
{"id": 57, "subject": "Task 5: CommunicationService per-site KPI accessor", "status": "completed", "blockedBy": [56]},
|
||||
{"id": 58, "subject": "Task 6: Move SMTP page to /notifications/smtp", "status": "completed"},
|
||||
{"id": 59, "subject": "Task 7: New Notification Lists page", "status": "completed"},
|
||||
{"id": 60, "subject": "Task 8: Move list form route; drop External Systems tab", "status": "completed", "blockedBy": [59]},
|
||||
{"id": 61, "subject": "Task 9: New Notification Report page; retire Outbox page", "status": "completed"},
|
||||
{"id": 62, "subject": "Task 10: New Notification KPIs page", "status": "completed", "blockedBy": [57]},
|
||||
{"id": 63, "subject": "Task 11: NavMenu Notifications section", "status": "completed", "blockedBy": [58, 59, 61, 62]},
|
||||
{"id": 64, "subject": "Task 12: Health dashboard KPI page link", "status": "completed", "blockedBy": [62]},
|
||||
{"id": 65, "subject": "Task 13: Full build + suite verification", "status": "completed", "blockedBy": [53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64]}
|
||||
],
|
||||
"lastUpdated": "2026-05-19"
|
||||
}
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,324 @@
|
||||
# Audit Log #23 — M1 Foundation Implementation Plan
|
||||
|
||||
> **For Claude:** REQUIRED SUB-SKILL: Use superpowers-extended-cc:subagent-driven-development to implement this plan task-by-task (bundled cadence per `feedback_subagent_cadence`).
|
||||
|
||||
**Goal:** Land the `AuditLog` table (monthly-partitioned) plus DB roles in MS SQL, and add the Commons types + EF repo + new `ScadaLink.AuditLog` project skeleton that every later milestone depends on. After M1 the database is ready, the new project is wired into the solution, and `dotnet build && dotnet test` are both green.
|
||||
|
||||
**Architecture:** New `AuditEvent` record + audit enums + writer interfaces in Commons. New EF entity configuration + EF Core migration creating `AuditLog` table aligned to `ps_AuditLog_Month` partition scheme on `OccurredAtUtc`, plus `scadalink_audit_writer` and `scadalink_audit_purger` SQL roles. New `IAuditLogRepository` with append-only surface (no Update, no row-delete). New `src/ScadaLink.AuditLog/` project skeleton + `AuditLogOptions`.
|
||||
|
||||
**Tech Stack:** .NET 10 / EF Core 10.0.7 / Microsoft.Data.SqlClient 6.0.2 / xUnit 2.9.3 / running `infra/mssql` container for integration tests.
|
||||
|
||||
**Brainstorm decisions (locked):**
|
||||
- **MSSQL test harness:** integration tests hit the existing `infra/mssql` container (require `cd infra && docker compose up -d`).
|
||||
- **AuditEvent shape:** one record with nullable `IngestedAtUtc` (set centrally) and nullable `ForwardState` (set site-locally).
|
||||
- **Filegroup:** PRIMARY, hard-coded.
|
||||
- **Indexes:** five named explicitly via `.HasDatabaseName("IX_AuditLog_…")`.
|
||||
|
||||
**Pre-existing reality:**
|
||||
- `Entities/Audit/AuditLogEntry.cs` (config-audit, 9 cols) **coexists with** new `AuditEvent` — no rename, no removal.
|
||||
- `IAuditService` (config-audit) is distinct from new `IAuditWriter` / `ICentralAuditWriter`.
|
||||
- `tests/ScadaLink.IntegrationTests/` uses EF in-memory — NOT usable for partition/role tests.
|
||||
- Roadmap M1-T10 (project skeleton) must run before M1-T9 (options class). **Swapped in this plan.**
|
||||
|
||||
---
|
||||
|
||||
## Bundles (cadence-aligned)
|
||||
|
||||
Tasks 1–11 from the roadmap are grouped into 6 bundles. Each bundle = one implementer dispatch + one combined spec+quality reviewer. The final cross-bundle reviewer runs over the whole branch.
|
||||
|
||||
- **Bundle A — Commons types** (roadmap T1+T2+T3+T4): enums, AuditEvent record + ForwardState enum, IAuditWriter / ICentralAuditWriter, telemetry message DTOs.
|
||||
- **Bundle B — EF entity mapping** (T5): DbSet + IEntityTypeConfiguration<AuditEvent> + indexes.
|
||||
- **Bundle C — Migration with partitioning + DB roles** (T6+T7 merged — one migration file).
|
||||
- **Bundle D — Repository** (T8): IAuditLogRepository + EF implementation + DI registration.
|
||||
- **Bundle E — AuditLog project skeleton + options** (T10 then T9): new `src/ScadaLink.AuditLog/` project + `AuditLogOptions`.
|
||||
- **Bundle F — Docs paper trail** (T11): controller-direct edit; no subagent needed for a 1–3 line update.
|
||||
|
||||
---
|
||||
|
||||
## Bundle A — Commons types
|
||||
|
||||
### Task 1: Add audit enums to Commons
|
||||
|
||||
**Files:**
|
||||
- Create: `src/ScadaLink.Commons/Types/Enums/AuditChannel.cs`
|
||||
- Create: `src/ScadaLink.Commons/Types/Enums/AuditKind.cs`
|
||||
- Create: `src/ScadaLink.Commons/Types/Enums/AuditStatus.cs`
|
||||
- Create: `src/ScadaLink.Commons/Types/Enums/AuditForwardState.cs`
|
||||
- Create: `tests/ScadaLink.Commons.Tests/Types/Enums/AuditEnumTests.cs`
|
||||
|
||||
**AuditChannel members** (4): `ApiOutbound`, `DbOutbound`, `Notification`, `ApiInbound`.
|
||||
|
||||
**AuditKind members** (10, per alog.md §4): `ApiCall`, `ApiCallCached`, `DbWrite`, `DbWriteCached`, `NotifySend`, `NotifyDeliver`, `InboundRequest`, `InboundAuthFailure`, `CachedSubmit`, `CachedResolve`.
|
||||
|
||||
**AuditStatus members** (8, per alog.md §4): `Submitted`, `Forwarded`, `Attempted`, `Delivered`, `Failed`, `Parked`, `Discarded`, `Skipped`.
|
||||
|
||||
**AuditForwardState members** (3): `Pending`, `Forwarded`, `Reconciled`.
|
||||
|
||||
**Steps:**
|
||||
1. Failing tests assert each enum's exact member set via `Enum.GetValues(typeof(T)).Cast<T>().Select(x => x.ToString())` against a string-array literal.
|
||||
2. Run: fail (enums don't exist).
|
||||
3. Implement the four enums (no `[Flags]`).
|
||||
4. Run: pass.
|
||||
5. Commit: `feat(commons): add Audit{Channel,Kind,Status,ForwardState} enums for #23`.
|
||||
|
||||
### Task 2: Add AuditEvent record
|
||||
|
||||
**Files:**
|
||||
- Create: `src/ScadaLink.Commons/Entities/Audit/AuditEvent.cs` — `public sealed record AuditEvent` with the 20 central columns per alog.md §4 (including nullable `DateTime? IngestedAtUtc`), plus the site-local nullable `AuditForwardState? ForwardState` — 21 properties in total.
|
||||
- Create: `tests/ScadaLink.Commons.Tests/Entities/Audit/AuditEventTests.cs`.
|
||||
|
||||
Properties (in alog.md §4 order):
|
||||
`Guid EventId`, `DateTime OccurredAtUtc`, `DateTime? IngestedAtUtc`, `AuditChannel Channel`, `AuditKind Kind`, `Guid? CorrelationId`, `string? SourceSiteId`, `string? SourceInstanceId`, `string? SourceScript`, `string? Actor`, `string? Target`, `AuditStatus Status`, `int? HttpStatus`, `int? DurationMs`, `string? ErrorMessage`, `string? ErrorDetail`, `string? RequestSummary`, `string? ResponseSummary`, `bool PayloadTruncated`, `string? Extra`, `AuditForwardState? ForwardState`.
|
||||
|
||||
**Steps:**
|
||||
1. Failing test constructs an `AuditEvent`, asserts each property reads back as set, asserts `with` expression produces a new instance with one field changed.
|
||||
2. Run: fail.
|
||||
3. Implement record with all properties as `init`-only.
|
||||
4. Run: pass.
|
||||
5. Commit: `feat(commons): add AuditEvent record (#23)`.
|
||||
|
||||
### Task 3: Add IAuditWriter and ICentralAuditWriter
|
||||
|
||||
**Files:**
|
||||
- Create: `src/ScadaLink.Commons/Interfaces/Services/IAuditWriter.cs`
|
||||
- Create: `src/ScadaLink.Commons/Interfaces/Services/ICentralAuditWriter.cs`
|
||||
- Create: `tests/ScadaLink.Commons.Tests/Interfaces/Services/AuditWriterContractTests.cs`
|
||||
|
||||
Both interfaces expose `Task WriteAsync(AuditEvent evt, CancellationToken ct = default)`. XML doc comments name Audit Log #23 as the owner; `IAuditWriter` is the abstraction the boundary code calls, `ICentralAuditWriter` is the central-only flavor (used by direct-write paths in M2+).
|
||||
|
||||
**Steps:**
|
||||
1. Failing reflection test: `typeof(IAuditWriter).GetMethod("WriteAsync")` returns a method whose parameters are `(AuditEvent, CancellationToken)` and return type is `Task`. Same for `ICentralAuditWriter`.
|
||||
2. Run: fail.
|
||||
3. Implement both interfaces with XML docs.
|
||||
4. Run: pass.
|
||||
5. Commit: `feat(commons): add IAuditWriter and ICentralAuditWriter (#23)`.
|
||||
|
||||
### Task 4: Add audit telemetry + pull message DTOs
|
||||
|
||||
**Files:**
|
||||
- Create: `src/ScadaLink.Commons/Messages/Integration/AuditTelemetryEnvelope.cs` — `public sealed record AuditTelemetryEnvelope(Guid EnvelopeId, string SourceSiteId, IReadOnlyList<AuditEvent> Events)`.
|
||||
- Create: `src/ScadaLink.Commons/Messages/Integration/PullAuditEventsRequest.cs` — `public sealed record PullAuditEventsRequest(string SourceSiteId, DateTime SinceUtc, int BatchSize)`.
|
||||
- Create: `src/ScadaLink.Commons/Messages/Integration/PullAuditEventsResponse.cs` — `public sealed record PullAuditEventsResponse(IReadOnlyList<AuditEvent> Events, bool MoreAvailable)`.
|
||||
- Create: `tests/ScadaLink.Commons.Tests/Messages/Integration/AuditTelemetryMessagesTests.cs`.
|
||||
|
||||
**Steps:**
|
||||
1. Failing test constructs envelope with 3 events and asserts immutability and enumerability.
|
||||
2. Failing test constructs `PullAuditEventsRequest` + `PullAuditEventsResponse` with `MoreAvailable=true`.
|
||||
3. Run: fail.
|
||||
4. Implement records.
|
||||
5. Run: pass.
|
||||
6. Commit: `feat(commons): add audit telemetry + pull message DTOs (#23)`.
|
||||
|
||||
**Bundle A acceptance:** Commons project compiles. Four enum tests, AuditEvent test, two interface contract tests, two telemetry-message tests all green. No existing tests regress.
|
||||
|
||||
---
|
||||
|
||||
## Bundle B — EF entity mapping
|
||||
|
||||
### Task 5: Extend ScadaLinkDbContext + add IEntityTypeConfiguration<AuditEvent>
|
||||
|
||||
**Files:**
|
||||
- Modify: `src/ScadaLink.ConfigurationDatabase/ScadaLinkDbContext.cs` — add `public DbSet<AuditEvent> AuditLogs => Set<AuditEvent>();` in the existing `// Audit` section, **directly after** the existing `AuditLogEntries` DbSet. Do not remove or modify `AuditLogEntries`.
|
||||
- Create: `src/ScadaLink.ConfigurationDatabase/Configurations/AuditLogEntityTypeConfiguration.cs` — `IEntityTypeConfiguration<AuditEvent>` mapping to table `AuditLog`, columns per alog.md §4 (with max lengths), PK on `EventId`, enum columns stored as `varchar(32)` via `HasConversion<string>().HasMaxLength(32)`. **No partition function declared here** — that goes in the migration's raw SQL.
|
||||
|
||||
Five indexes with explicit names:
|
||||
- `IX_AuditLog_OccurredAtUtc` on (`OccurredAtUtc` desc)
|
||||
- `IX_AuditLog_Site_Occurred` on (`SourceSiteId`, `OccurredAtUtc` desc)
|
||||
- `IX_AuditLog_CorrelationId` on (`CorrelationId`) where `CorrelationId IS NOT NULL`
|
||||
- `IX_AuditLog_Channel_Status_Occurred` on (`Channel`, `Status`, `OccurredAtUtc` desc)
|
||||
- `IX_AuditLog_Target_Occurred` on (`Target`, `OccurredAtUtc` desc) where `Target IS NOT NULL`
|
||||
|
||||
- Modify: `OnModelCreating` — apply via `modelBuilder.ApplyConfiguration(new AuditLogEntityTypeConfiguration())`.
|
||||
- Create: `tests/ScadaLink.ConfigurationDatabase.Tests/Configurations/AuditLogEntityTypeConfigurationTests.cs` — use a `ModelBuilder` directly (no DbContext required) and assert:
|
||||
- mapped table name is `AuditLog`,
|
||||
- PK is `EventId`,
|
||||
- exactly 21 properties are mapped (the 20 spec columns, which include `IngestedAtUtc`, plus `ForwardState`),
|
||||
- the five indexes exist with the documented names.
|
||||
|
||||
**Steps:**
|
||||
1. Failing test: model asserts on table name + PK + property count.
|
||||
2. Implement config + apply in `OnModelCreating`; add the DbSet.
|
||||
3. Failing test: model asserts five named indexes.
|
||||
4. Add `HasIndex(...).HasDatabaseName(...)` for each.
|
||||
5. Run: pass.
|
||||
6. Commit: `feat(configdb): map AuditEvent to AuditLog table with PK and five named indexes (#23)`.
|
||||
|
||||
**Bundle B acceptance:** ConfigurationDatabase project compiles. Mapping test passes. No existing ConfigurationDatabase.Tests regress.
|
||||
|
||||
---
|
||||
|
||||
## Bundle C — Migration with partitioning + DB roles
|
||||
|
||||
### Task 6+7 (merged): Create migration with partition function/scheme/table + DB roles
|
||||
|
||||
**Files:**
|
||||
- Generate: `src/ScadaLink.ConfigurationDatabase/Migrations/<yyyyMMddHHmmss>_AddAuditLogTable.cs` via:
|
||||
```
|
||||
dotnet ef migrations add AddAuditLogTable --project src/ScadaLink.ConfigurationDatabase \
|
||||
--startup-project src/ScadaLink.Host --output-dir Migrations
|
||||
```
|
||||
- Customize the migration's `Up()`:
|
||||
1. Raw SQL: create partition function `pf_AuditLog_Month` (RANGE RIGHT FOR VALUES with month-boundaries from `2026-01-01` through `2027-12-01` UTC), and partition scheme `ps_AuditLog_Month` ALL TO ([PRIMARY]).
|
||||
2. Drop EF's auto-generated `CREATE TABLE` and replace with raw SQL that creates `AuditLog` ON `ps_AuditLog_Month(OccurredAtUtc)`. (Or: let EF generate the table, then `ALTER TABLE … ADD CONSTRAINT … PK … ON ps_AuditLog_Month(OccurredAtUtc)` — whichever EF 10 supports cleanly.)
|
||||
3. Create the five named indexes via `migrationBuilder.CreateIndex(...)`, partition-aligned on `ps_AuditLog_Month(OccurredAtUtc)` where appropriate.
|
||||
4. Raw SQL roles, idempotent (`IF NOT EXISTS … CREATE ROLE`):
|
||||
- `scadalink_audit_writer`: GRANT INSERT ON AuditLog; GRANT SELECT ON AuditLog. (No UPDATE, no DELETE.)
|
||||
- `scadalink_audit_purger`: GRANT ALTER ON SCHEMA::dbo; GRANT SELECT ON AuditLog. (Intended to enable partition maintenance: `ALTER PARTITION FUNCTION … SPLIT/MERGE RANGE` and `ALTER TABLE … SWITCH PARTITION`.)
|
||||
- `Down()` drops indexes, table, scheme, function, then both roles.
|
||||
- Create: `tests/ScadaLink.ConfigurationDatabase.Tests/Migrations/AddAuditLogTableMigrationTests.cs` — uses a fixture connecting to the running `infra/mssql` container via the connection string in `infra/mssql/.env` (or skips with `Skip.If` when the env var `SCADALINK_MSSQL_TEST_CONN` is unset, so CI without the container still passes).
|
||||
|
||||
Integration test assertions:
|
||||
- `sys.partition_functions` contains `pf_AuditLog_Month`.
|
||||
- `sys.partition_schemes` contains `ps_AuditLog_Month`.
|
||||
- `INFORMATION_SCHEMA.TABLES` contains `AuditLog`, and its storage (`sys.indexes.data_space_id` joined to `sys.partition_schemes`) is the `ps_AuditLog_Month` partition scheme.
|
||||
- `sys.indexes` contains the five expected named indexes.
|
||||
- `sys.database_principals` contains both roles.
|
||||
- Smoke test: log in as a user mapped to `scadalink_audit_writer`, attempt `UPDATE AuditLog …`, expect `SqlException` with permission error.
|
||||
|
||||
**Steps:**
|
||||
1. Generate the migration; let EF auto-fill the body.
|
||||
2. Failing integration test: assert partition function exists.
|
||||
3. Edit migration to add the partition function + scheme + table alignment.
|
||||
4. Re-run: pass.
|
||||
5. Failing integration test: assert five indexes exist.
|
||||
6. Add named indexes to migration.
|
||||
7. Failing integration test: assert both roles exist with documented grants.
|
||||
8. Add roles to migration.
|
||||
9. Failing integration test: smoke `UPDATE AuditLog` as writer expects permission error.
|
||||
10. Verify role grants exclude UPDATE.
|
||||
11. Run: pass.
|
||||
12. Commit: `feat(configdb): add AuditLog migration with monthly partitioning and DB roles (#23)`.
|
||||
|
||||
**Notes for the implementer:**
|
||||
- Use `Microsoft.Data.SqlClient` directly in the test fixture (not EF) to issue raw SQL for grant assertions.
|
||||
- `Skip.If(string.IsNullOrEmpty(Environment.GetEnvironmentVariable("SCADALINK_MSSQL_TEST_CONN")), "MSSQL not available")` — keeps tests CI-safe.
|
||||
- Test database name: `ScadaLinkAuditMigrationTest_<guid>` (created per fixture, dropped on dispose).
|
||||
|
||||
**Bundle C acceptance:** Migration applied to a fresh test DB on the `infra/mssql` container creates the partition function/scheme/table/indexes/roles. Smoke test confirms UPDATE is denied for the writer role. All migration tests pass when `SCADALINK_MSSQL_TEST_CONN` is set; skip cleanly when unset.
|
||||
|
||||
---
|
||||
|
||||
## Bundle D — Repository
|
||||
|
||||
### Task 8: IAuditLogRepository + EF implementation + DI
|
||||
|
||||
**Files:**
|
||||
- Create: `src/ScadaLink.Commons/Interfaces/Repositories/IAuditLogRepository.cs` — three methods:
|
||||
- `Task InsertIfNotExistsAsync(AuditEvent evt, CancellationToken ct = default);`
|
||||
- `Task<IReadOnlyList<AuditEvent>> QueryAsync(AuditLogQueryFilter filter, AuditLogPaging paging, CancellationToken ct = default);`
|
||||
- `Task SwitchOutPartitionAsync(DateTime monthBoundary, CancellationToken ct = default);`
|
||||
|
||||
Plus two small DTOs in the same file (or co-located `Types/Audit/`):
|
||||
- `AuditLogQueryFilter` record: nullable `AuditChannel?`, `AuditKind?`, `AuditStatus?`, `string? SourceSiteId`, `string? Target`, `string? Actor`, `Guid? CorrelationId`, `DateTime? FromUtc`, `DateTime? ToUtc`.
|
||||
- `AuditLogPaging` record: `int PageSize`, `Guid? AfterEventId`, `DateTime? AfterOccurredAtUtc` (keyset).
|
||||
|
||||
- Create: `src/ScadaLink.ConfigurationDatabase/Repositories/AuditLogRepository.cs` — implements all three methods:
|
||||
- `InsertIfNotExistsAsync` uses raw SQL `IF NOT EXISTS (SELECT 1 FROM AuditLog WHERE EventId = @id) INSERT INTO AuditLog …` via `DbContext.Database.ExecuteSqlInterpolatedAsync` (bypasses change tracker).
|
||||
- `QueryAsync` builds an `IQueryable<AuditEvent>`, applies filters, projects, paged by keyset on `(OccurredAtUtc desc, EventId desc)`.
|
||||
- `SwitchOutPartitionAsync` builds a unique staging table name, runs `CREATE TABLE … <staging>` with identical schema and ON `[PRIMARY]`, runs `ALTER TABLE AuditLog SWITCH PARTITION <n> TO <staging>`, then `DROP TABLE <staging>`. All inside a single transaction. Computes partition number from `monthBoundary` via `$partition.pf_AuditLog_Month(@boundary)`.
|
||||
|
||||
- Modify: `src/ScadaLink.ConfigurationDatabase/ServiceCollectionExtensions.cs` — add `services.AddScoped<IAuditLogRepository, AuditLogRepository>();` after `INotificationOutboxRepository` line.
|
||||
|
||||
- Create: `tests/ScadaLink.ConfigurationDatabase.Tests/Repositories/AuditLogRepositoryTests.cs` — uses the same MSSQL fixture from Bundle C (skipped when env var unset) since `InsertIfNotExistsAsync` uses raw SQL that won't run on EF in-memory.
|
||||
|
||||
Tests:
|
||||
- Insert for fresh `EventId` writes one row.
|
||||
- Calling `InsertIfNotExistsAsync` again with the same `EventId` is a no-op (no exception, row count unchanged).
|
||||
- `QueryAsync` returns rows in `(OccurredAtUtc desc, EventId desc)` order honoring all filter predicates.
|
||||
- `QueryAsync` with non-null `AfterEventId`/`AfterOccurredAtUtc` keysets correctly to the next page.
|
||||
- `SwitchOutPartitionAsync` for an old boundary removes the rows belonging to that partition from the live table.
|
||||
|
||||
**Steps:**
|
||||
1. Failing test: insert + duplicate insert.
|
||||
2. Implement using raw SQL.
|
||||
3. Failing test: query order + filters.
|
||||
4. Implement.
|
||||
5. Failing test: keyset paging.
|
||||
6. Implement.
|
||||
7. Failing test: switch-out partition.
|
||||
8. Implement.
|
||||
9. Run all: pass.
|
||||
10. Commit: `feat(configdb): IAuditLogRepository + EF implementation, append-only with partition-switch purge (#23)`.
|
||||
|
||||
**Bundle D acceptance:** Repository tests green. DI smoke test from existing ConfigurationDatabase.Tests still passes.
|
||||
|
||||
---
|
||||
|
||||
## Bundle E — AuditLog project skeleton + options
|
||||
|
||||
### Task 10 (first): Scaffold `src/ScadaLink.AuditLog/` project
|
||||
|
||||
**Files:**
|
||||
- Create: `src/ScadaLink.AuditLog/ScadaLink.AuditLog.csproj` — TargetFramework `net10.0` (matches solution), references `ScadaLink.Commons` + `ScadaLink.ConfigurationDatabase`.
|
||||
- Create: `src/ScadaLink.AuditLog/ServiceCollectionExtensions.cs` — `public static class ServiceCollectionExtensions { public static IServiceCollection AddAuditLog(this IServiceCollection services, IConfiguration config) { … } }` registering `AuditLogOptions` (from Task 9) and forwarding to `services.TryAddScoped<IAuditLogRepository, AuditLogRepository>()` (already registered by ConfigurationDatabase, so `TryAddScoped` is a no-op that documents the dependency without adding a duplicate registration).
|
||||
- Create: `tests/ScadaLink.AuditLog.Tests/ScadaLink.AuditLog.Tests.csproj` with one smoke test.
|
||||
- Create: `tests/ScadaLink.AuditLog.Tests/AddAuditLogTests.cs` — smoke test: `services.AddAuditLog(config); var p = services.BuildServiceProvider(); Assert.NotNull(p.GetService<IOptions<AuditLogOptions>>());`.
|
||||
- Modify: `ScadaLink.slnx` — add both projects.
|
||||
|
||||
**Steps:**
|
||||
1. `dotnet new classlib -n ScadaLink.AuditLog -o src/ScadaLink.AuditLog --framework net10.0` (then delete the default `Class1.cs`).
|
||||
2. `dotnet new xunit -n ScadaLink.AuditLog.Tests -o tests/ScadaLink.AuditLog.Tests --framework net10.0`.
|
||||
3. Add `<ProjectReference>` to Commons + ConfigurationDatabase in the src csproj; add reference to ScadaLink.AuditLog in the test csproj.
|
||||
4. Add both projects to `ScadaLink.slnx` (inside the existing `/src/` and `/tests/` folders).
|
||||
5. Add `<PackageReference Include="Microsoft.Extensions.Configuration.Binder" />` to the src csproj (already in `Directory.Packages.props`).
|
||||
6. Create stub `ServiceCollectionExtensions.AddAuditLog` (just registers options; writer impl comes in M2).
|
||||
7. Commit: `feat(auditlog): scaffold ScadaLink.AuditLog project (#23)`.
|
||||
|
||||
### Task 9: AuditLogOptions
|
||||
|
||||
**Files:**
|
||||
- Create: `src/ScadaLink.AuditLog/Configuration/AuditLogOptions.cs` — class with:
|
||||
- `int DefaultCapBytes` (default 8192)
|
||||
- `int ErrorCapBytes` (default 65536)
|
||||
- `List<string> HeaderRedactList` (default: `[ "Authorization", "X-Api-Key", "Cookie", "Set-Cookie" ]`)
|
||||
- `List<string> GlobalBodyRedactors` (default: empty)
|
||||
- `Dictionary<string, PerTargetRedactionOverride> PerTargetOverrides` (default empty)
|
||||
- `int RetentionDays` (default 365; range [30, 3650])
|
||||
- Create: `src/ScadaLink.AuditLog/Configuration/PerTargetRedactionOverride.cs` — minimal: `int? CapBytes`, `List<string>? AdditionalBodyRedactors`.
|
||||
- Create: `src/ScadaLink.AuditLog/Configuration/AuditLogOptionsValidator.cs` — `IValidateOptions<AuditLogOptions>` checking `DefaultCapBytes > 0`, `ErrorCapBytes >= DefaultCapBytes`, `RetentionDays` in `[30, 3650]`.
|
||||
- Modify: `src/ScadaLink.AuditLog/ServiceCollectionExtensions.cs` — update `AddAuditLog` to call `services.AddOptions<AuditLogOptions>().Bind(config.GetSection("AuditLog")).ValidateOnStart(); services.AddSingleton<IValidateOptions<AuditLogOptions>, AuditLogOptionsValidator>();`.
|
||||
- Add: `tests/ScadaLink.AuditLog.Tests/Configuration/AuditLogOptionsTests.cs`:
|
||||
- Bind valid section → values present.
|
||||
- Bind invalid `RetentionDays = 0` → validator rejects.
|
||||
- Bind invalid `ErrorCapBytes < DefaultCapBytes` → validator rejects.
|
||||
|
||||
**Steps:**
|
||||
1. Failing test: valid bind round-trip.
|
||||
2. Implement options class.
|
||||
3. Failing test: invalid `RetentionDays`.
|
||||
4. Implement validator.
|
||||
5. Failing test: invalid `ErrorCapBytes`.
|
||||
6. Validator covers it.
|
||||
7. Run: pass.
|
||||
8. Commit: `feat(auditlog): add AuditLogOptions + validator (#23)`.
|
||||
|
||||
**Bundle E acceptance:** New `src/ScadaLink.AuditLog/` project builds. Solution still builds. Smoke + options tests green. `ScadaLink.slnx` includes both new entries.
|
||||
|
||||
---
|
||||
|
||||
## Bundle F — Docs paper trail (controller-direct)
|
||||
|
||||
### Task 11: Register AuditLog project in Component-Host.md and confirm README
|
||||
|
||||
**Files:**
|
||||
- Modify: `docs/requirements/Component-Host.md` — list `ScadaLink.AuditLog` in the central role's registration set.
|
||||
- Modify: `README.md` — confirm row #23 link reflects the new project (no functional change unless missing).
|
||||
|
||||
This is a 1–3 line edit. Per the cadence memory, controller does it directly without a subagent.
|
||||
|
||||
**Commit:** `docs(audit): register ScadaLink.AuditLog project in Host role (#23)`.
|
||||
|
||||
---
|
||||
|
||||
## Final cross-bundle review
|
||||
|
||||
After all bundles ship:
|
||||
|
||||
- Dispatch a final code-reviewer subagent over the whole M1 branch.
|
||||
- Acceptance gate (from goal prompt step E):
|
||||
- `dotnet test ScadaLink.slnx` green (full solution).
|
||||
- All M1 roadmap acceptance criteria met; each cited by name to the proving test.
|
||||
- If green, merge to main `--no-ff` with summary message (step F).
|
||||
- Update M2–M8 sections of the roadmap with realities learned (step G), commit.
|
||||
- Status paragraph (step H).
|
||||
- Proceed to M2 (step I).
|
||||
@@ -0,0 +1,24 @@
|
||||
{
|
||||
"planPath": "docs/plans/2026-05-20-auditlog-m1-foundation.md",
|
||||
"tasks": [
|
||||
{"id": "A1", "subject": "Bundle A T1: Add audit enums (Channel, Kind, Status, ForwardState)", "status": "pending"},
|
||||
{"id": "A2", "subject": "Bundle A T2: Add AuditEvent record", "status": "pending", "blockedBy": ["A1"]},
|
||||
{"id": "A3", "subject": "Bundle A T3: Add IAuditWriter + ICentralAuditWriter", "status": "pending", "blockedBy": ["A2"]},
|
||||
{"id": "A4", "subject": "Bundle A T4: Add audit telemetry + pull message DTOs", "status": "pending", "blockedBy": ["A2"]},
|
||||
{"id": "A-rev", "subject": "Bundle A combined spec+quality review", "status": "pending", "blockedBy": ["A1", "A2", "A3", "A4"]},
|
||||
{"id": "B5", "subject": "Bundle B T5: ScadaLinkDbContext.AuditLogs + IEntityTypeConfiguration<AuditEvent> with five named indexes", "status": "pending", "blockedBy": ["A-rev"]},
|
||||
{"id": "B-rev", "subject": "Bundle B review", "status": "pending", "blockedBy": ["B5"]},
|
||||
{"id": "C67", "subject": "Bundle C T6+T7: AddAuditLogTable migration (partition fn/scheme/table/indexes) + DB roles, with infra/mssql integration tests", "status": "pending", "blockedBy": ["B-rev"]},
|
||||
{"id": "C-rev", "subject": "Bundle C review", "status": "pending", "blockedBy": ["C67"]},
|
||||
{"id": "D8", "subject": "Bundle D T8: IAuditLogRepository + EF implementation + DI registration", "status": "pending", "blockedBy": ["C-rev"]},
|
||||
{"id": "D-rev", "subject": "Bundle D review", "status": "pending", "blockedBy": ["D8"]},
|
||||
{"id": "E10", "subject": "Bundle E T10: Scaffold src/ScadaLink.AuditLog/ project + slnx entries", "status": "pending", "blockedBy": ["D-rev"]},
|
||||
{"id": "E9", "subject": "Bundle E T9: AuditLogOptions + validator", "status": "pending", "blockedBy": ["E10"]},
|
||||
{"id": "E-rev", "subject": "Bundle E review", "status": "pending", "blockedBy": ["E10", "E9"]},
|
||||
{"id": "F11", "subject": "Bundle F T11 (controller-direct): Register ScadaLink.AuditLog in Component-Host.md + README confirm", "status": "pending", "blockedBy": ["E-rev"]},
|
||||
{"id": "FINAL-rev", "subject": "Final cross-bundle review over the whole M1 branch", "status": "pending", "blockedBy": ["F11"]},
|
||||
{"id": "MERGE", "subject": "Verify gate: full solution dotnet test green, then merge --no-ff to main", "status": "pending", "blockedBy": ["FINAL-rev"]},
|
||||
{"id": "ROADMAP", "subject": "Update downstream M2-M8 sections of roadmap with realities learned in M1", "status": "pending", "blockedBy": ["MERGE"]}
|
||||
],
|
||||
"lastUpdated": "2026-05-20T00:00:00Z"
|
||||
}
|
||||
@@ -0,0 +1,408 @@
|
||||
# Audit Log #23 — M2 Site Pipeline (sync-only) Implementation Plan
|
||||
|
||||
> **For Claude:** REQUIRED SUB-SKILL: Use superpowers-extended-cc:subagent-driven-development (bundled cadence per `feedback_subagent_cadence`).
|
||||
|
||||
**Goal:** First end-to-end audit emission. A script-initiated `ExternalSystem.Call()` produces exactly one `ApiOutbound`/`ApiCall` row in the central `AuditLog` table via site SQLite hot-path + gRPC push telemetry + central ingest actor. Audit-write failures NEVER abort the script.
|
||||
|
||||
**Architecture (decisions locked):**
|
||||
- Provenance: **Wrap CallAsync in ScriptRuntimeContext** — IExternalSystemClient.CallAsync signature unchanged; ScriptRuntimeContext.ExternalSystem.Call captures instance/script/site and emits the AuditEvent via IAuditWriter.
|
||||
- Direction: **Push primary** — SiteAuditTelemetryActor batches Pending rows and pushes via a new `IngestAuditEvents` unary gRPC RPC on `sitestream.proto`. Pull (reconciliation) deferred to M6.
|
||||
- E2E: **Component-level test** via TestKit + MSSQL fixture; stubbed gRPC client forwards directly to the central ingest actor. No expansion of `ScadaLinkWebApplicationFactory`.
|
||||
- Site writer: **Mirror SiteEventLogger** — `Channel<PendingAuditEvent>` + background writer Task for sub-ms enqueue durability.
|
||||
|
||||
**M1 realities baked in:**
|
||||
- Enum vocabulary: `AuditKind.ApiCall` for sync API call; `AuditStatus.Delivered` for success, `AuditStatus.Failed` for HTTP non-2xx (permanent OR transient → both Failed for a sync call; cached path differs in M3). The "Status=Success/TransientFailure/PermanentFailure" wording in the roadmap is stale and must be replaced with the new vocabulary.
|
||||
- `AuditLogRepository.InsertIfNotExistsAsync` race window — M2 is the first concurrent writer; harden it before AuditLogIngestActor lands.
|
||||
- Keyset tiebreaker test gap from Bundle D — add a same-OccurredAt test in M2.
|
||||
- `MsSqlMigrationFixture` is reusable as-is; promote it to a `[CollectionDefinition]`-shared fixture only if multiple test classes need it (defer until actually needed).
|
||||
- `Xunit.SkippableFact` + `Skip.IfNot(_fixture.Available, _fixture.SkipReason)` for any MSSQL-dependent tests.
|
||||
- `ScadaLink.AuditLog/Site/` and `ScadaLink.AuditLog/Central/` and `ScadaLink.AuditLog/Telemetry/` subfolders. DI extension `AddAuditLog` is the registration point.
|
||||
|
||||
**Tech stack additions:**
|
||||
- `Microsoft.Data.Sqlite 10.0.7` (pinned).
|
||||
- `Akka.TestKit.Xunit2 1.5.62` (pinned).
|
||||
- `Grpc.Tools` already configured in `ScadaLink.Communication.csproj`.
|
||||
|
||||
---
|
||||
|
||||
## Bundles
|
||||
|
||||
- **Bundle A — Repo race-fix + tiebreaker test** (M1 realities catch-up).
|
||||
- **Bundle B — Site SQLite writer + fallback** (M2-T1, T2, T3, T4).
|
||||
- **Bundle C — gRPC proto + mapper** (M2-T5, T6).
|
||||
- **Bundle D — Telemetry actor + ingest actor + gRPC handler** (M2-T7, T8).
|
||||
- **Bundle E — Host wiring** (M2-T9).
|
||||
- **Bundle F — ESG emission via ScriptRuntimeContext wrapper** (M2-T10).
|
||||
- **Bundle G — Health metric SiteAuditWriteFailures** (M2-T11).
|
||||
- **Bundle H — Component-level integration test** (M2-T12).
|
||||
|
||||
Final cross-bundle reviewer pass, then merge + roadmap update.
|
||||
|
||||
---
|
||||
|
||||
## Bundle A — Repo race-fix + keyset tiebreaker test
|
||||
|
||||
### Task A1: Harden `InsertIfNotExistsAsync` against duplicate-key race
|
||||
|
||||
**Files:**
|
||||
- Modify: `src/ScadaLink.ConfigurationDatabase/Repositories/AuditLogRepository.cs:30-60` — wrap the `ExecuteSqlInterpolatedAsync` call in a `try/catch Microsoft.Data.SqlClient.SqlException` that swallows error numbers 2601 and 2627 (unique-index violation on `UX_AuditLog_EventId`) and logs at Debug. Other SqlExceptions rethrow.
|
||||
- Modify: `tests/ScadaLink.ConfigurationDatabase.Tests/Repositories/AuditLogRepositoryTests.cs` — add:
|
||||
- `InsertIfNotExistsAsync_ConcurrentDuplicateInserts_ProduceExactlyOneRow` — fire 50 parallel `InsertIfNotExistsAsync` calls with the same `EventId`, assert row count = 1 and no exception escapes.
|
||||
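- `QueryAsync_Keyset_SameOccurredAtUtc_TiebreaksOnEventId` — the M1 Bundle D reviewer's deferred recommendation. Insert 4 rows with identical OccurredAtUtc but distinct EventIds; page through them with PageSize=2; assert no overlap, correct count, and that the second page's first row's EventId is strictly less than the first page's last row's EventId. Note: if `EventId` is stored as a SQL Server `uniqueidentifier`, the server orders GUIDs differently from .NET `Guid.CompareTo`, so do the "strictly less than" comparison under SQL Server ordering (e.g. via `System.Data.SqlTypes.SqlGuid`) or choose EventIds whose order is the same under both.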
|
||||
|
||||
**Steps:**
|
||||
1. Write failing concurrency test.
|
||||
2. Run: expect the test to fail, either with an escaping SqlException 2601/2627 or with the row-count assertion failing (row count ≠ 1).
|
||||
3. Add try/catch in the repo.
|
||||
4. Run: pass.
|
||||
5. Write failing keyset-tiebreaker test.
|
||||
6. Run: depending on EF Core 10's Guid.CompareTo translation, this may already pass — confirm.
|
||||
7. If passing, the test locks in the behavior; commit anyway.
|
||||
8. Commit: `fix(configdb): InsertIfNotExistsAsync swallows duplicate-key races + add keyset tiebreaker test (#23)`.
|
||||
|
||||
**Bundle A acceptance:** All ConfigurationDatabase.Tests still green; 2 new tests pass.
|
||||
|
||||
---
|
||||
|
||||
## Bundle B — Site SQLite writer + fallback (mirror SiteEventLogger pattern)
|
||||
|
||||
### Task B1: `SqliteAuditWriter` — schema + connection bootstrap
|
||||
|
||||
**Files:**
|
||||
- Create: `src/ScadaLink.AuditLog/Site/SqliteAuditWriter.cs` — implements `IAuditWriter` per Bundle A's signature (single `Task WriteAsync(AuditEvent evt, CancellationToken ct = default)`). Constructor takes `IOptions<SqliteAuditWriterOptions>` + `ILogger`. Single `SqliteConnection` opened at construction (`Data Source={path};Cache=Shared`). Sync `_writeLock` Monitor-pattern (mirrors `SiteEventLogger.cs:32`). Inline `InitializeSchema()` runs `PRAGMA auto_vacuum = INCREMENTAL` + `CREATE TABLE IF NOT EXISTS AuditLog (...)`.
|
||||
- Create: `src/ScadaLink.AuditLog/Site/SqliteAuditWriterOptions.cs` — `string DatabasePath = "auditlog.db"`, `int ChannelCapacity = 4096` (bounded; drop-oldest applies only to the Task B3 ring-buffer overflow, while the writer's pending channel is bounded purely as a safety net), `int BatchSize = 256`, `int FlushIntervalMs = 50`.
|
||||
- Create: `tests/ScadaLink.AuditLog.Tests/Site/SqliteAuditWriterSchemaTests.cs`.
|
||||
|
||||
**Schema (19 event columns + ForwardState = 20 columns total — IngestedAtUtc is central-only):**
|
||||
|
||||
```sql
|
||||
CREATE TABLE IF NOT EXISTS AuditLog (
|
||||
EventId TEXT NOT NULL,
|
||||
OccurredAtUtc TEXT NOT NULL,
|
||||
Channel TEXT NOT NULL,
|
||||
Kind TEXT NOT NULL,
|
||||
CorrelationId TEXT NULL,
|
||||
SourceSiteId TEXT NULL,
|
||||
SourceInstanceId TEXT NULL,
|
||||
SourceScript TEXT NULL,
|
||||
Actor TEXT NULL,
|
||||
Target TEXT NULL,
|
||||
Status TEXT NOT NULL,
|
||||
HttpStatus INTEGER NULL,
|
||||
DurationMs INTEGER NULL,
|
||||
ErrorMessage TEXT NULL,
|
||||
ErrorDetail TEXT NULL,
|
||||
RequestSummary TEXT NULL,
|
||||
ResponseSummary TEXT NULL,
|
||||
PayloadTruncated INTEGER NOT NULL,
|
||||
Extra TEXT NULL,
|
||||
ForwardState TEXT NOT NULL,
|
||||
PRIMARY KEY (EventId)
|
||||
);
|
||||
CREATE INDEX IF NOT EXISTS IX_SiteAuditLog_ForwardState_Occurred
|
||||
ON AuditLog (ForwardState, OccurredAtUtc);
|
||||
```
|
||||
|
||||
**Tests:**
|
||||
1. `Opens_Creates_AuditLog_Table_With_All_Columns_And_PK`
|
||||
2. `Opens_Creates_IX_ForwardState_Occurred_Index`
|
||||
3. `PRAGMA_auto_vacuum_Is_INCREMENTAL`
|
||||
|
||||
**Steps:**
|
||||
1. Failing test asserts table + PK + 20 columns + index via `PRAGMA table_info(AuditLog)` + `PRAGMA index_list(AuditLog)`.
|
||||
2. Implement constructor + InitializeSchema with inline SQL.
|
||||
3. Run: pass.
|
||||
4. Commit: `feat(auditlog): SqliteAuditWriter schema bootstrap (#23)`.
|
||||
|
||||
### Task B2: `SqliteAuditWriter` — Channel<T> + background writer for hot-path
|
||||
|
||||
**Files:**
|
||||
- Modify: `src/ScadaLink.AuditLog/Site/SqliteAuditWriter.cs` — add `Channel<PendingAuditEvent> _writeQueue` (bounded, `BoundedChannelFullMode.Wait`, default capacity 4096) and a background `Task ProcessWriteQueueAsync()` launched in the constructor. `WriteAsync` enqueues the event and returns the `Task` of the pending entry's `TaskCompletionSource`. The loop reads up to `BatchSize` events, opens a transaction, INSERTs all of them, commits, then completes the TCS for each.
|
||||
- Pattern mirrors `src/ScadaLink.SiteEventLogging/SiteEventLogger.cs:135-173`.
|
||||
- Test: `tests/ScadaLink.AuditLog.Tests/Site/SqliteAuditWriterWriteTests.cs`.
|
||||
|
||||
**Tests:**
|
||||
1. `WriteAsync_FreshEvent_PersistsWithForwardStatePending` — write one event, query SQLite, assert row has `ForwardState='Pending'`.
|
||||
2. `WriteAsync_Concurrent_1000Calls_All_Persist_NoExceptions` — fire 1000 parallel WriteAsync, assert row count = 1000 and zero exceptions surface.
|
||||
3. `WriteAsync_LatencyP99_LessThan_5ms_For_Enqueue` — time only the enqueue step (a stopwatch around `Channel.Writer.WriteAsync` / `TryWrite`) and assert it returns near-instantly (p99 < 5 ms); separately await the returned Task and assert it completes within a reasonable time once the background writer has committed the batch.
|
||||
4. `WriteAsync_DuplicateEventId_FirstWriteWins_NoException` — insert same EventId twice, assert one row only and no exception (the PRIMARY KEY violation is caught/swallowed in the writer loop).
|
||||
|
||||
**Steps:**
|
||||
1. Failing tests for 1, 2, 4.
|
||||
2. Implement Channel + background loop + transactional batch INSERT.
|
||||
3. Run: pass.
|
||||
4. Commit: `feat(auditlog): SqliteAuditWriter Channel-based hot-path write (#23)`.
|
||||
|
||||
### Task B3: `RingBufferFallback`
|
||||
|
||||
**Files:**
|
||||
- Create: `src/ScadaLink.AuditLog/Site/RingBufferFallback.cs` — `Channel<AuditEvent>` bounded at 1024 with `BoundedChannelFullMode.DropOldest`. Exposes `bool TryEnqueue(AuditEvent)`, `IAsyncEnumerable<AuditEvent> DrainAsync(CancellationToken)`, and an event `RingBufferOverflowed` (callback for the health counter).
|
||||
- Test: `tests/ScadaLink.AuditLog.Tests/Site/RingBufferFallbackTests.cs`.
|
||||
|
||||
**Tests:**
|
||||
1. `Enqueue_1025_Into_1024Cap_Ring_DropsOldest_AndRaisesOverflow` — invoke 1025 enqueues, assert the `RingBufferOverflowed` event is raised exactly once, and the surviving 1024 events are the latest 1024.
|
||||
2. `DrainAsync_Yields_FIFO_Then_Completes_When_Empty`.
|
||||
|
||||
**Steps:**
|
||||
1. Failing tests.
|
||||
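2. Implement using `Channel.CreateBounded<AuditEvent>(new BoundedChannelOptions(1024) { FullMode = BoundedChannelFullMode.DropOldest }, itemDropped: ...)`. With `DropOldest`, `TryWrite` always succeeds and drops silently, so the `itemDropped` callback overload is what lets the ring raise `RingBufferOverflowed`.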
|
||||
3. Run: pass.
|
||||
4. Commit: `feat(auditlog): RingBufferFallback with drop-oldest overflow (#23)`.
|
||||
|
||||
### Task B4: `FallbackAuditWriter` — compose primary + ring
|
||||
|
||||
**Files:**
|
||||
- Create: `src/ScadaLink.AuditLog/Site/FallbackAuditWriter.cs` — implements `IAuditWriter`. Constructor takes the primary `SqliteAuditWriter` + `RingBufferFallback` + `IAuditWriteFailureCounter` (lightweight DI'd interface, Bundle G implements it as `SiteAuditWriteFailures` counter on health metrics). On primary success: returns. On primary throw: increments counter, enqueues into ring (DropOldest), returns success. On the NEXT successful primary call (success after a failure window), drains the ring back through the primary.
|
||||
- Test: `tests/ScadaLink.AuditLog.Tests/Site/FallbackAuditWriterTests.cs`.
|
||||
|
||||
**Tests:**
|
||||
1. `WriteAsync_PrimaryThrows_EventLandsInRing_CallReturnsSuccess`.
|
||||
2. `WriteAsync_PrimaryRecovers_RingDrains_InFIFOOrder_OnNextWrite`.
|
||||
3. `WriteAsync_PrimaryAlwaysSucceeds_Ring_StaysEmpty`.
|
||||
|
||||
**Steps:**
|
||||
1. Failing tests.
|
||||
2. Implement; mock the primary with a `Func<AuditEvent, Task>` flip-switch failure.
|
||||
3. Run: pass.
|
||||
4. Commit: `feat(auditlog): FallbackAuditWriter compose SQLite + ring (#23)`.
|
||||
|
||||
**Bundle B acceptance:** 4 tasks merged. `ScadaLink.AuditLog.Tests` adds ~12+ tests. No regressions.
|
||||
|
||||
---
|
||||
|
||||
## Bundle C — gRPC proto + mapper
|
||||
|
||||
### Task C1: Extend `sitestream.proto` with `IngestAuditEvents`
|
||||
|
||||
**Files:**
|
||||
- Modify: `src/ScadaLink.Communication/Protos/sitestream.proto` — add the messages and unary RPC. Use `google.protobuf.Timestamp` for `OccurredAtUtc`; encode enums as `string` (matches the EF mapping).
|
||||
|
||||
Proposed addition:
|
||||
```proto
|
||||
message AuditEventDto {
|
||||
string event_id = 1;
|
||||
google.protobuf.Timestamp occurred_at_utc = 2;
|
||||
string channel = 3;
|
||||
string kind = 4;
|
||||
string correlation_id = 5; // empty string when null
|
||||
string source_site_id = 6;
|
||||
string source_instance_id = 7;
|
||||
string source_script = 8;
|
||||
string actor = 9;
|
||||
string target = 10;
|
||||
string status = 11;
|
||||
google.protobuf.Int32Value http_status = 12;
|
||||
google.protobuf.Int32Value duration_ms = 13;
|
||||
string error_message = 14;
|
||||
string error_detail = 15;
|
||||
string request_summary = 16;
|
||||
string response_summary = 17;
|
||||
bool payload_truncated = 18;
|
||||
string extra = 19;
|
||||
}
|
||||
message AuditEventBatch { repeated AuditEventDto events = 1; }
|
||||
message IngestAck { repeated string accepted_event_ids = 1; }
|
||||
|
||||
service SiteStreamService {
|
||||
// existing rpcs...
|
||||
rpc IngestAuditEvents(AuditEventBatch) returns (IngestAck);
|
||||
}
|
||||
```
|
||||
|
||||
(Use `google.protobuf.Int32Value` to encode nullable ints; empty string semantics for nullable text fields.)
|
||||
|
||||
- Test: `tests/ScadaLink.Communication.Tests/Protos/AuditEventProtoTests.cs`.
|
||||
|
||||
**Steps:**
|
||||
1. Edit proto + rebuild (`dotnet build src/ScadaLink.Communication/`).
|
||||
2. Failing test round-trips an `AuditEventDto` through `ToByteArray()` and `Parser.ParseFrom()`; asserts all populated fields survive.
|
||||
3. Run: pass.
|
||||
4. Commit: `feat(comms): IngestAuditEvents RPC + AuditEventDto proto (#23)`.
|
||||
|
||||
### Task C2: `AuditEvent` ↔ `AuditEventDto` mapper
|
||||
|
||||
**Files:**
|
||||
- Create: `src/ScadaLink.AuditLog/Telemetry/AuditEventMapper.cs` — static `ToDto(AuditEvent)` and `FromDto(AuditEventDto)`. Handles nullable→empty-string, Timestamp↔DateTime UTC, enum↔string. ForwardState NOT carried in the proto (site-local only; central never sees it).
|
||||
- Test: `tests/ScadaLink.AuditLog.Tests/Telemetry/AuditEventMapperTests.cs`.
|
||||
|
||||
**Tests:**
|
||||
1. `Roundtrip_FullyPopulated_PreservesAllFields`.
|
||||
2. `Roundtrip_AllNullableFieldsNull_ProducesEmptyDtoFields`.
|
||||
3. `FromDto_EmptyOptionalString_BecomesNullProperty`.
|
||||
4. `ToDto_Sets_OccurredAtUtc_As_UtcTimestamp` — Round-trip with `DateTimeKind.Utc` preserved.
|
||||
|
||||
**Steps:**
|
||||
1. Failing tests.
|
||||
2. Implement.
|
||||
3. Run: pass.
|
||||
4. Commit: `feat(auditlog): AuditEvent ↔ proto mapper (#23)`.
|
||||
|
||||
**Bundle C acceptance:** Communication.Tests + AuditLog.Tests still green; proto rebuilds cleanly.
|
||||
|
||||
---
|
||||
|
||||
## Bundle D — SiteAuditTelemetryActor + AuditLogIngestActor + gRPC handler
|
||||
|
||||
### Task D1: `SiteAuditTelemetryActor` — drain loop
|
||||
|
||||
**Files:**
|
||||
- Create: `src/ScadaLink.AuditLog/Site/Telemetry/SiteAuditTelemetryActor.cs` — `ReceiveActor`. On `Drain`: queries `SqliteAuditWriter.ReadPendingAsync(BatchSize)`, calls the gRPC client's `IngestAuditEventsAsync(batch)`, and on ack flips the returned EventIds to `Forwarded` via `SqliteAuditWriter.MarkForwardedAsync(eventIds)`. Re-schedules the `Drain` self-tick: 5s if ≥1 row was drained, 30s otherwise. On gRPC error: re-schedule in 5s; rows stay Pending.
|
||||
- Modify: `src/ScadaLink.AuditLog/Site/SqliteAuditWriter.cs` — add `ReadPendingAsync(int limit, CancellationToken)` returning `IReadOnlyList<AuditEvent>` (with ForwardState=Pending), and `MarkForwardedAsync(IReadOnlyList<Guid> eventIds, CancellationToken)`.
|
||||
- Create: `src/ScadaLink.AuditLog/Site/Telemetry/SiteAuditTelemetryOptions.cs` — `BatchSize=256`, `BusyIntervalSeconds=5`, `IdleIntervalSeconds=30`.
|
||||
- Test: `tests/ScadaLink.AuditLog.Tests/Site/Telemetry/SiteAuditTelemetryActorTests.cs` using `TestKit` + NSubstitute-mocked gRPC client.
|
||||
|
||||
**Tests:**
|
||||
1. `Drain_With_50PendingRows_Sends_OneBatch_Of_50`.
|
||||
2. `Drain_Ack_Flips_Rows_To_Forwarded`.
|
||||
3. `Drain_GrpcThrows_Rows_StayPending_NextTick_Retries`.
|
||||
4. `Drain_Cadence_5s_AfterNonZero_30s_AfterZero` (via `TestScheduler`).
|
||||
|
||||
**Steps:**
|
||||
1. Failing tests.
|
||||
2. Implement.
|
||||
3. Run: pass.
|
||||
4. Commit: `feat(auditlog): SiteAuditTelemetryActor drain loop (#23)`.
|
||||
|
||||
### Task D2: `AuditLogIngestActor` + gRPC server handler
|
||||
|
||||
**Files:**
|
||||
- Create: `src/ScadaLink.AuditLog/Central/AuditLogIngestActor.cs` — `ReceiveActor` accepting `IngestAuditEventsCommand(IReadOnlyList<AuditEvent> events, IActorRef replyTo)`. For each event, calls `IAuditLogRepository.InsertIfNotExistsAsync` (which now swallows duplicates per Bundle A). Sets `IngestedAtUtc = DateTime.UtcNow` before insert (this is the central-side timestamp). Replies with `IngestAck(acceptedEventIds)` — by spec "accepted" includes already-existed rows (idempotent semantics).
|
||||
- Create: `src/ScadaLink.AuditLog/Central/IngestAuditEventsCommand.cs` (Akka message).
|
||||
- Create: `src/ScadaLink.AuditLog/Central/IngestAck.cs` (Akka reply).
|
||||
- Modify: `src/ScadaLink.Communication/SiteStreamGrpc/SiteStreamGrpcServer.cs` — implement `public override async Task<IngestAck> IngestAuditEvents(AuditEventBatch request, ServerCallContext context)` — Ask the central `AuditLogIngestActor` proxy with the deserialized batch, await reply, return.
|
||||
- Modify: `src/ScadaLink.Communication/SiteStreamGrpc/SiteStreamGrpcServer.cs` — add a setter `SetAuditIngestActor(IActorRef)` mirroring how `SetNotificationOutbox` is wired (per recon: Notification Outbox proxy is handed in via `commService?.SetNotificationOutbox(outboxProxy)`).
|
||||
- Test: `tests/ScadaLink.AuditLog.Tests/Central/AuditLogIngestActorTests.cs`.
|
||||
- Test: `tests/ScadaLink.Communication.Tests/SiteStreamIngestAuditEventsTests.cs`.
|
||||
|
||||
**Tests:**
|
||||
1. `Receive_BatchOf5_Calls_Repo_5Times_Acks_All`.
|
||||
2. `Receive_BatchWith_AlreadyExistingEvent_AcksAll_NoDoubleInsert` (idempotent).
|
||||
3. `Receive_RepoThrowsTransient_Replies_AckExcludingFailedEventIds_LogsError` (partial-failure semantics — what gets acked is what was persisted).
|
||||
4. `Receive_Sets_IngestedAtUtc_Before_Insert`.
|
||||
5. `gRPC_Handler_Routes_To_Actor_Returns_Reply`.
|
||||
|
||||
**Steps:**
|
||||
1. Failing tests.
|
||||
2. Implement actor + gRPC handler.
|
||||
3. Run: pass.
|
||||
4. Commit: `feat(auditlog): AuditLogIngestActor + gRPC handler (#23)`.
|
||||
|
||||
**Bundle D acceptance:** New actor + gRPC handler tests all green.
|
||||
|
||||
---
|
||||
|
||||
## Bundle E — Host wiring (central singleton + site actor + dispatcher)
|
||||
|
||||
### Task E1: Register `AuditLogIngestActor` + `SiteAuditTelemetryActor` + dispatcher
|
||||
|
||||
**Files:**
|
||||
- Modify: `src/ScadaLink.Host/Actors/AkkaHostedService.cs` — mirror the Notification Outbox pattern (recon report's exact lines 272-295):
|
||||
- Central role: `AuditLogIngestActor` as `ClusterSingletonManager` (singleton name `"audit-log-ingest"`) + `ClusterSingletonProxy` (`"audit-log-ingest-proxy"`). Hand the proxy to `SiteStreamGrpcServer.SetAuditIngestActor(proxy)`.
|
||||
- Site role: `SiteAuditTelemetryActor` as a per-site actor (`actorSystem.ActorOf(Props.Create(...))`), bound to the dedicated dispatcher (below).
|
||||
- Modify: HOCON in `src/ScadaLink.Host/Configuration/` (the existing akka config file) — add:
|
||||
```
|
||||
audit-telemetry-dispatcher {
|
||||
type = ForkJoinDispatcher
|
||||
throughput = 100
|
||||
dedicated-thread-pool { thread-count = 2 }
|
||||
}
|
||||
```
|
||||
Apply `.WithDispatcher("audit-telemetry-dispatcher")` to `SiteAuditTelemetryActor`'s Props.
|
||||
- Modify: `src/ScadaLink.AuditLog/ServiceCollectionExtensions.cs:AddAuditLog` — register the SqliteAuditWriter+RingBufferFallback+FallbackAuditWriter chain and the actor factories.
|
||||
- Test: `tests/ScadaLink.Host.Tests/AkkaHostedServiceAuditWiringTests.cs`.
|
||||
|
||||
**Tests:**
|
||||
1. `Central_Host_Starts_With_AuditLogIngest_Singleton_Healthy`.
|
||||
2. `Site_Host_Starts_With_SiteAuditTelemetry_Bound_To_DedicatedDispatcher`.
|
||||
3. `AuditWriter_Resolves_From_DI_To_FallbackAuditWriter`.
|
||||
|
||||
**Steps:**
|
||||
1. Failing tests against current host (which doesn't wire audit).
|
||||
2. Implement wiring.
|
||||
3. Run: pass.
|
||||
4. Commit: `feat(host): register Audit Log #23 singletons with dedicated dispatcher`.
|
||||
|
||||
**Bundle E acceptance:** Host.Tests still green; 3 new tests pass.
|
||||
|
||||
---
|
||||
|
||||
## Bundle F — ESG audit emission via ScriptRuntimeContext wrapper
|
||||
|
||||
### Task F1: Wrap `ExternalSystem.Call` in `ScriptRuntimeContext` to emit audit
|
||||
|
||||
**Files:**
|
||||
- Modify: `src/ScadaLink.SiteRuntime/Scripts/ScriptRuntimeContext.cs` — find the existing `ExternalSystem.Call` method (or add one if scripts call through a dynamic API surface). Inside, after `_externalSystemClient.CallAsync(...)` returns OR throws, build the `AuditEvent` (channel=`ApiOutbound`, kind=`ApiCall`, status=`Delivered` for success, `Failed` for HTTP non-2xx or exception, populate `Target=$"{systemName}.{methodName}"`, `SourceSiteId={siteId}`, `SourceInstanceId={instanceName}`, `SourceScript={sourceScript}`, `DurationMs={stopwatch}`, `HttpStatus`, `ErrorMessage`). Call `_auditWriter.WriteAsync(evt)` inside a try/catch that swallows + logs at Warning + increments `SiteAuditWriteFailures` (via the same counter Bundle G defines). Re-throw the original ExternalSystem exception (if any) so the script sees its original error path unchanged.
|
||||
- Modify: `src/ScadaLink.SiteRuntime/Scripts/ScriptRuntimeContext.cs` constructor — inject `IAuditWriter`.
|
||||
- Modify: `src/ScadaLink.SiteRuntime/Actors/ScriptExecutionActor.cs` — resolve and pass `IAuditWriter` into the ScriptRuntimeContext.
|
||||
- Test: `tests/ScadaLink.SiteRuntime.Tests/Scripts/ExternalSystemCallAuditEmissionTests.cs`.
|
||||
|
||||
**Tests:**
|
||||
1. `Call_Success_EmitsOneEvent_Channel_ApiOutbound_Kind_ApiCall_Status_Delivered`.
|
||||
2. `Call_HTTP500_EmitsEvent_Status_Failed_HttpStatus_500_ErrorMessage_Set`.
|
||||
3. `Call_HTTP400_EmitsEvent_Status_Failed_HttpStatus_400`.
|
||||
4. `Call_ClientThrows_NetworkError_EmitsEvent_Status_Failed_ErrorMessage_SetFromException`.
|
||||
5. `AuditWriter_Throws_Script_Call_Returns_Original_Result_Unchanged_Audit_Failure_Counter_Incremented`.
|
||||
6. `Provenance_Populated_FromContext` — SourceInstanceId, SourceScript, SourceSiteId all match the ScriptRuntimeContext's values.
|
||||
|
||||
**Steps:**
|
||||
1. Failing tests.
|
||||
2. Implement wrapper + provenance threading.
|
||||
3. Run: pass.
|
||||
4. Commit: `feat(siteruntime): ExternalSystem.Call emits Audit Log #23 event on every sync call`.
|
||||
|
||||
**Bundle F acceptance:** SiteRuntime.Tests still green; 6 new tests.
|
||||
|
||||
---
|
||||
|
||||
## Bundle G — Health metric `SiteAuditWriteFailures`
|
||||
|
||||
### Task G1: Counter + DI surface
|
||||
|
||||
**Files:**
|
||||
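- Create: `src/ScadaLink.AuditLog/Site/IAuditWriteFailureCounter.cs` — `void Increment();`. Bundle B's `FallbackAuditWriter` already takes this interface, so if Task B4 had to add the file in order to compile, confirm its shape here rather than re-creating it.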
|
||||
- Modify: `src/ScadaLink.HealthMonitoring/SiteHealthCollector.cs` — add `int _siteAuditWriteFailures` field + `IncrementSiteAuditWriteFailures()` method using `Interlocked.Increment`. Expose via a snapshot read.
|
||||
- Modify: `src/ScadaLink.HealthMonitoring/SiteHealthState.cs` — add `SiteAuditWriteFailures` property to the report payload.
|
||||
- Implementation: a small adapter class `HealthMetricsAuditWriteFailureCounter : IAuditWriteFailureCounter` registered in DI that bridges to `ISiteHealthCollector.IncrementSiteAuditWriteFailures()`.
|
||||
- Test: `tests/ScadaLink.HealthMonitoring.Tests/SiteAuditWriteFailuresMetricTests.cs`.
|
||||
|
||||
**Tests:**
|
||||
1. `Increment_Three_Times_Counter_Reports_3`.
|
||||
2. `Report_Payload_Includes_SiteAuditWriteFailures`.
|
||||
|
||||
**Steps:**
|
||||
1. Failing tests.
|
||||
2. Implement counter + adapter + DI registration.
|
||||
3. Run: pass.
|
||||
4. Commit: `feat(health): SiteAuditWriteFailures counter (#23)`.
|
||||
|
||||
**Bundle G acceptance:** HealthMonitoring.Tests still green; 2 new tests.
|
||||
|
||||
---
|
||||
|
||||
## Bundle H — Component-level integration test
|
||||
|
||||
### Task H1: End-to-end via TestKit + MSSQL fixture
|
||||
|
||||
**Files:**
|
||||
- Create: `tests/ScadaLink.AuditLog.Tests/Integration/SyncCallEmissionEndToEndTests.cs` — uses `MsSqlMigrationFixture` (the M1 reusable fixture; depend on `Xunit.SkippableFact`):
|
||||
- Brings up `SqliteAuditWriter` against `:memory:`.
|
||||
- Brings up `SiteAuditTelemetryActor` via TestKit.
|
||||
- Brings up `AuditLogIngestActor` via TestKit, configured with the MSSQL `IAuditLogRepository` from M1.
|
||||
- Stubs the gRPC client by overriding the actor's gRPC dependency with a direct `IActorRef`-backed mock that forwards `IngestAuditEvents` directly to the central actor.
|
||||
- Writes one `AuditEvent` via the FallbackAuditWriter.
|
||||
- Drives a `Drain` tick on the telemetry actor.
|
||||
- Asserts the row appears in the MS SQL `AuditLog` table within 5 seconds via `IAuditLogRepository.QueryAsync`.
|
||||
|
||||
**Steps:**
|
||||
1. Failing test (telemetry not yet wired).
|
||||
2. Wire the components together via the test harness.
|
||||
3. Run: pass.
|
||||
4. Commit: `test(auditlog): end-to-end sync-call emission via TestKit + MSSQL fixture (#23)`.
|
||||
|
||||
**Bundle H acceptance:** New test passes when MSSQL container is up; skips cleanly when down.
|
||||
|
||||
---
|
||||
|
||||
## Final cross-bundle review
|
||||
|
||||
After Bundles A–H, dispatch a final reviewer agent with the same template as M1's. Acceptance gate: full `dotnet test ScadaLink.slnx` green. Then merge `--no-ff` with summary; update M3–M8 with M2 realities; status paragraph; proceed to M3.
|
||||
@@ -0,0 +1,212 @@
|
||||
# Audit Log #23 — M3 Cached Operations + Dual-Write Implementation Plan
|
||||
|
||||
> **For Claude:** REQUIRED SUB-SKILL: Use superpowers-extended-cc:subagent-driven-development (bundled cadence per `feedback_subagent_cadence`).
|
||||
|
||||
**Goal:** Cached external calls (`ExternalSystem.CachedCall`) and cached DB writes (`Database.CachedWrite`) each produce 4+ audit rows per operation (`CachedSubmit` → `ApiCallCached`/`DbWriteCached` × N attempts with statuses `Forwarded` then `Attempted` then `Delivered`/`Failed` → `CachedResolve` terminal) AND a `SiteCalls` row at central. Combined telemetry: site emits one packet per lifecycle event carrying both the AuditEvent and the SiteCalls upsert; central writes both in one MS SQL transaction. Audit-write failure never aborts the script.
|
||||
|
||||
**Recommended-defaults applied:**
|
||||
- Telemetry proto: **new top-level RPC `IngestCachedTelemetry(CachedTelemetryBatch) returns (IngestAck)`** (sitestream.proto), separate from the M2 `IngestAuditEvents` to keep payload shapes distinct.
|
||||
- Forwarder: **separate `CachedCallTelemetryForwarder`** actor (or static dispatcher hooking into the existing `SiteAuditTelemetryActor`'s SQLite queue) — write the audit row + tracking row in one SQLite transaction, then let the existing telemetry actor drain both via the new RPC. Reuse the M2 Channel/SQLite hot-path infrastructure; do NOT introduce a parallel writer.
|
||||
- Provenance: mirror M2's `ScriptRuntimeContext` wrapper pattern — ScriptRuntimeContext's cached-call helpers capture instance/script/site and feed the combined packet.
|
||||
- IntegrationTests E2E: same component-level pattern as M2 Bundle H (`DirectActorSiteStreamAuditClient`), but extracted into `tests/ScadaLink.AuditLog.Tests/Integration/Infrastructure/` for reuse.
|
||||
|
||||
**M2 realities baked in (from roadmap lines 446–459):**
|
||||
- Use M1 vocabulary: `AuditKind.CachedSubmit` (enqueue), `AuditKind.ApiCallCached` / `AuditKind.DbWriteCached` (each attempt + post-forward), `AuditKind.CachedResolve` (terminal). `AuditStatus.Submitted` → `Forwarded` → `Attempted` × N → `Delivered`/`Failed`/`Parked`/`Discarded`. NO `CachedEnqueued`/`CachedAttempt`/`CachedTerminal` strings appear in code (those are pre-M1 spec wording the roadmap text still mentions; honor the enum vocabulary).
|
||||
- NoOpSiteStreamAuditClient still in production until M6; E2E tests use the M2 Bundle H pattern.
|
||||
- AuditEventMapper duplication note from M2: M3 should move the mapper into Commons (or document the gRPC inline duplication) since M3 adds a SECOND gRPC handler with the same DTO→entity translation work.
|
||||
- CachedCallTelemetry message is created from scratch (additive per Commons REQ-COM-5a) — NOT renamed to CachedOperationTelemetry.
|
||||
|
||||
---
|
||||
|
||||
## Bundles
|
||||
|
||||
- **Bundle A — Commons types + tracking store** (T1, T2, T3, T4): TrackedOperationId, OperationTrackingStore, Tracking.Status API, CachedCallTelemetry message.
|
||||
- **Bundle B — SiteCalls table EF + migration + repo** (T5, T6, T7).
|
||||
- **Bundle C — SiteCallAudit project + actor** (T8).
|
||||
- **Bundle D — Proto + central dual-write transaction** (T9, T10).
|
||||
- **Bundle E — ESG / DB-gateway / S&F emissions** (T11, T12, T13, T14).
|
||||
- **Bundle F — Host registration** (T15).
|
||||
- **Bundle G — Integration tests** (T16, T17, T18).
|
||||
|
||||
Final cross-bundle reviewer + merge to main.
|
||||
|
||||
---
|
||||
|
||||
## Bundle A — Commons types + tracking store
|
||||
|
||||
### Task A1: TrackedOperationId strong-typed ID
|
||||
File: `src/ScadaLink.Commons/Types/TrackedOperationId.cs` — `public readonly record struct TrackedOperationId(Guid Value)`. Static `New()` and `Parse(string)`; `ToString()` returns `Value.ToString("D")`. Conversion from Guid goes through an explicit static `From(Guid)` factory — no `implicit operator` is defined (a record struct could declare one, but the conversion is deliberately kept explicit). Tests in `tests/ScadaLink.Commons.Tests/Types/TrackedOperationIdTests.cs`. Commit: `feat(commons): TrackedOperationId strong type (#23 M3)`.
|
||||
|
||||
### Task A2: OperationTrackingStore (site-local SQLite)
|
||||
File: `src/ScadaLink.Commons/Interfaces/IOperationTrackingStore.cs` — `RecordEnqueueAsync`, `RecordAttemptAsync`, `RecordTerminalAsync`, `GetStatusAsync(TrackedOperationId)`, `PurgeTerminalAsync(olderThanUtc)`.
|
||||
File: `src/ScadaLink.SiteRuntime/Tracking/OperationTrackingStore.cs` — SQLite-backed, mirror SqliteAuditWriter pattern: Channel<T> + background writer Task + write-lock. Schema:
|
||||
```sql
|
||||
CREATE TABLE IF NOT EXISTS OperationTracking (
|
||||
TrackedOperationId TEXT NOT NULL PRIMARY KEY,
|
||||
Kind TEXT NOT NULL,
|
||||
TargetSummary TEXT NULL,
|
||||
Status TEXT NOT NULL,
|
||||
RetryCount INTEGER NOT NULL DEFAULT 0,
|
||||
LastError TEXT NULL,
|
||||
HttpStatus INTEGER NULL,
|
||||
CreatedAtUtc TEXT NOT NULL,
|
||||
UpdatedAtUtc TEXT NOT NULL,
|
||||
TerminalAtUtc TEXT NULL,
|
||||
SourceInstanceId TEXT NULL,
|
||||
SourceScript TEXT NULL);
|
||||
CREATE INDEX IF NOT EXISTS IX_OperationTracking_Status_Updated ON OperationTracking(Status, UpdatedAtUtc);
|
||||
```
|
||||
Tests: schema, insert+update sequence, terminal purge (only terminal rows older than threshold). Commit: `feat(siteruntime): OperationTrackingStore site-local SQLite (#23 M3)`.
|
||||
|
||||
### Task A3: Tracking.Status script API
|
||||
File: `src/ScadaLink.SiteRuntime/Scripts/ScriptRuntimeContext.cs` — add a `Tracking` accessor exposing `Status(TrackedOperationId)` reading via `IOperationTrackingStore.GetStatusAsync`. Returns a `TrackingStatusSnapshot` record (Commons/Types) with `Status`, `RetryCount`, `LastError`, `CreatedAtUtc`, `UpdatedAtUtc`, `TerminalAtUtc`. Returns null for unknown IDs.
|
||||
Tests: known, unknown, terminal IDs. Commit: `feat(siteruntime): Tracking.Status script API (#23 M3)`.
|
||||
|
||||
### Task A4: CachedCallTelemetry Commons message
|
||||
File: `src/ScadaLink.Commons/Messages/Integration/CachedCallTelemetry.cs` — `public sealed record CachedCallTelemetry(TrackedOperationId TrackedOperationId, AuditEvent Audit, SiteCallOperational Operational)` plus `SiteCallOperational` record (TrackedOperationId, Channel, Target, SourceSite, Status, RetryCount, LastError, HttpStatus, CreatedAtUtc, UpdatedAtUtc, TerminalAtUtc?).
|
||||
Tests: round-trip; lifecycle-specific construction (Submit/Attempted/Resolve). Commit: `feat(commons): CachedCallTelemetry combined operational+audit packet (#23 M3)`.
|
||||
|
||||
---
|
||||
|
||||
## Bundle B — SiteCalls EF + migration + repo
|
||||
|
||||
### Task B1: SiteCall entity + EF mapping
|
||||
File: `src/ScadaLink.Commons/Entities/Audit/SiteCall.cs` — `public sealed record SiteCall` with fields per `SiteCallOperational` plus `IngestedAtUtc`.
|
||||
File: `src/ScadaLink.ConfigurationDatabase/Configurations/SiteCallEntityTypeConfiguration.cs` — table `SiteCalls`, PK on `TrackedOperationId`, indexes `IX_SiteCalls_Source_Created` on (SourceSite, CreatedAtUtc), `IX_SiteCalls_Status_Updated` on (Status, UpdatedAtUtc).
|
||||
Modify: `ScadaLinkDbContext.cs` — `public DbSet<SiteCall> SiteCalls => Set<SiteCall>();`.
|
||||
Tests as M1 pattern. Commit: `feat(configdb): map SiteCall to SiteCalls table (#23 M3)`.
|
||||
|
||||
### Task B2: AddSiteCallsTable migration
|
||||
Generate via `dotnet ef migrations add AddSiteCallsTable --project src/ScadaLink.ConfigurationDatabase --startup-project src/ScadaLink.Host`. No partitioning (operational state, not audit). Use MsSqlMigrationFixture for integration test. Commit: `feat(configdb): add SiteCalls migration (#23 M3)`.
|
||||
|
||||
### Task B3: ISiteCallAuditRepository + EF impl
|
||||
File: `src/ScadaLink.Commons/Interfaces/Repositories/ISiteCallAuditRepository.cs` — `UpsertAsync(SiteCall)` with **monotonic status progression** (later status wins; earlier status is no-op), `GetAsync(TrackedOperationId)`, `QueryAsync(filter, paging)`, `PurgeTerminalAsync(olderThanUtc)`.
|
||||
File: `src/ScadaLink.ConfigurationDatabase/Repositories/SiteCallAuditRepository.cs` — implement via `MERGE` or `INSERT ... WHERE NOT EXISTS` + `UPDATE WHERE TerminalAtUtc IS NULL AND <monotonic order check>`. Tests use MsSqlMigrationFixture. Commit: `feat(configdb): ISiteCallAuditRepository + EF impl (#23 M3)`.
|
||||
|
||||
---
|
||||
|
||||
## Bundle C — SiteCallAudit project + actor
|
||||
|
||||
### Task C1: ScadaLink.SiteCallAudit project + actor
|
||||
Create: `src/ScadaLink.SiteCallAudit/ScadaLink.SiteCallAudit.csproj` (mirrors ScadaLink.AuditLog csproj style — net10.0, references Commons + ConfigurationDatabase).
|
||||
Create: `src/ScadaLink.SiteCallAudit/SiteCallAuditActor.cs` — central singleton actor handling `UpsertSiteCallCommand(SiteCall siteCall)` by calling `ISiteCallAuditRepository.UpsertAsync` (scope-per-message via IServiceProvider, mirror AuditLogIngestActor). Idempotent via repo's monotonic upsert.
|
||||
Create: `src/ScadaLink.SiteCallAudit/ServiceCollectionExtensions.cs` — `AddSiteCallAudit()` registering actor props factory.
|
||||
Create: `tests/ScadaLink.SiteCallAudit.Tests/` project.
|
||||
Modify: `ScadaLink.slnx` — add src + tests entries.
|
||||
Commit: `feat(scaudit): SiteCallAuditActor minimum surface (#22, #23 M3)`.
|
||||
|
||||
---
|
||||
|
||||
## Bundle D — Proto + central dual-write transaction
|
||||
|
||||
### Task D1: Extend sitestream.proto with IngestCachedTelemetry RPC
|
||||
Follow the documented protobuf regen procedure from M2 Bundle C (temporarily uncomment ItemGroup, build, copy back, recomment). Add:
|
||||
```proto
|
||||
message SiteCallOperationalDto {
|
||||
string tracked_operation_id = 1;
|
||||
string channel = 2;
|
||||
string target = 3;
|
||||
string source_site = 4;
|
||||
string status = 5;
|
||||
int32 retry_count = 6;
|
||||
string last_error = 7;
|
||||
google.protobuf.Int32Value http_status = 8;
|
||||
google.protobuf.Timestamp created_at_utc = 9;
|
||||
google.protobuf.Timestamp updated_at_utc = 10;
|
||||
google.protobuf.Timestamp terminal_at_utc = 11; // null when active
|
||||
}
|
||||
message CachedTelemetryPacket {
|
||||
AuditEventDto audit_event = 1;
|
||||
SiteCallOperationalDto operational = 2;
|
||||
}
|
||||
message CachedTelemetryBatch { repeated CachedTelemetryPacket packets = 1; }
|
||||
|
||||
service SiteStreamService {
|
||||
rpc IngestCachedTelemetry(CachedTelemetryBatch) returns (IngestAck);
|
||||
}
|
||||
```
|
||||
Test round-trips. Commit: `feat(comms): IngestCachedTelemetry RPC + CachedTelemetryPacket proto (#23 M3)`.
|
||||
|
||||
### Task D2: Dual-write transaction in AuditLogIngestActor
|
||||
File: `src/ScadaLink.AuditLog/Central/AuditLogIngestActor.cs` (extend) — add `IngestCachedTelemetryCommand` handler. Inside one `DbContext.Database.BeginTransactionAsync()`:
|
||||
1. Call `IAuditLogRepository.InsertIfNotExistsAsync(auditEvent)` (idempotent already from M2 Bundle A).
|
||||
2. Call `ISiteCallAuditRepository.UpsertAsync(siteCallOperational)` (monotonic).
|
||||
3. Commit on both-success; rollback on either-throw (the central singleton SUPERVISES — actor doesn't crash).
|
||||
4. Reply `IngestAck(acceptedIds)`.
|
||||
|
||||
Modify: `src/ScadaLink.Communication/SiteStreamGrpc/SiteStreamGrpcServer.cs` — implement `IngestCachedTelemetry` gRPC handler routing to actor. Same inline FromDto pattern as M2 (move to mapper if time permits per M2 reviewer recommendation).
|
||||
|
||||
Add: `src/ScadaLink.Commons/Messages/Audit/IngestCachedTelemetryCommand.cs` and `IngestCachedTelemetryReply.cs` (Akka messages).
|
||||
|
||||
Tests:
|
||||
- Single packet → 1 AuditLog + 1 SiteCalls row.
|
||||
- Duplicate `EventId` + same status → AuditLog no-op, SiteCalls no-op (monotonic), no error.
|
||||
- Duplicate `EventId` + ADVANCED status → AuditLog no-op, SiteCalls updates.
|
||||
- SiteCalls upsert throws → AuditLog rolled back (no orphan).
|
||||
- AuditLog throws (non-duplicate) → SiteCalls rolled back.
|
||||
|
||||
Commit: `feat(auditlog): combined telemetry dual-write transaction (#23 M3)`.
|
||||
|
||||
---
|
||||
|
||||
## Bundle E — ESG / DB / S&F lifecycle emissions
|
||||
|
||||
### Task E1: ScriptRuntimeContext.ExternalSystem.CachedCall wrapper
|
||||
Mirror M2 Bundle F's `Call` wrapper. Differences:
|
||||
- Emit on enqueue: AuditEvent(Kind=CachedSubmit, Status=Submitted) + SiteCallOperational(Status=Submitted, RetryCount=0).
|
||||
- Calls `_externalSystemClient.CachedCallAsync` (first confirm what the existing S&F API surface actually looks like by reading ExternalSystemClient).
|
||||
- Returns a `TrackedOperationId` immediately (the tracking handle a script can later pass to `Tracking.Status`).
|
||||
- Hands the operation to the existing StoreAndForward retry loop.
|
||||
|
||||
For the per-attempt + terminal emissions, hook into the S&F dispatch loop (Tasks E2/E3).
|
||||
|
||||
### Task E2: S&F retry-loop emission
|
||||
Find the S&F retry-attempt callback site in `src/ScadaLink.StoreAndForward/`. On each attempt (success/transient/permanent):
|
||||
- Build AuditEvent(Kind=ApiCallCached or DbWriteCached, Status=Attempted).
|
||||
- Build SiteCallOperational(Status=Attempted, RetryCount=N, LastError, HttpStatus).
|
||||
- Hand to `CachedCallTelemetryForwarder` which writes both to SQLite (AuditLog + OperationTracking tables, in one SQLite transaction) and lets SiteAuditTelemetryActor's drain loop push them.
|
||||
|
||||
### Task E3: S&F terminal-state emission
|
||||
On final state transition (Delivered / Failed / Parked / Discarded):
|
||||
- Build AuditEvent(Kind=CachedResolve, Status={final state}).
|
||||
- Build SiteCallOperational(Status={final state}, TerminalAtUtc=DateTime.UtcNow).
|
||||
- Forward.
|
||||
|
||||
### Task E4: Database.CachedWrite mirror
|
||||
Same three-event pattern but Channel=DbOutbound, Kind=DbWriteCached for attempts, Kind=CachedSubmit for enqueue, Kind=CachedResolve for terminal.
|
||||
|
||||
Tests in ExternalSystemGateway.Tests + StoreAndForward.Tests.
|
||||
|
||||
Commit (bundle-level): one commit per task, descriptive messages following M2 style.
|
||||
|
||||
---
|
||||
|
||||
## Bundle F — Host registration
|
||||
|
||||
### Task F1: Register SiteCallAuditActor central singleton
|
||||
File: `src/ScadaLink.Host/Actors/AkkaHostedService.cs` — register `SiteCallAuditActor` central singleton + proxy alongside `AuditLogIngestActor`. Hand the proxy to `SiteStreamGrpcServer.SetSiteCallAuditActor(proxy)` (mirroring `SetAuditIngestActor`).
|
||||
File: `src/ScadaLink.Host/Program.cs` — call `.AddSiteCallAudit()` on the central role's services.
|
||||
Tests in `tests/ScadaLink.Host.Tests/AkkaHostedServiceAuditWiringTests.cs` (extend).
|
||||
Commit: `feat(host): register SiteCallAuditActor central singleton (#22, #23 M3)`.
|
||||
|
||||
---
|
||||
|
||||
## Bundle G — Integration tests
|
||||
|
||||
### Task G1: Extract DirectActorSiteStreamAuditClient to shared infrastructure
|
||||
Move from `tests/ScadaLink.AuditLog.Tests/Integration/SyncCallEmissionEndToEndTests.cs` private inner class into `tests/ScadaLink.AuditLog.Tests/Integration/Infrastructure/DirectActorSiteStreamAuditClient.cs`. Extend to also implement the new `IngestCachedTelemetryAsync` method (mirror pattern).
|
||||
|
||||
### Task G2: Cached call E2E test
|
||||
File: `tests/ScadaLink.AuditLog.Tests/Integration/CachedCallCombinedTelemetryTests.cs` (use AuditLog.Tests, not IntegrationTests, because the existing IntegrationTests harness disables Akka per M2 reality). Test: cached call that fails twice then succeeds produces 5 AuditLog rows (1 Submit + 1 Forwarded + 2 Attempted + 1 Resolve) + 1 SiteCalls row (Status=Delivered) + Tracking.Status reports Delivered.
|
||||
|
||||
### Task G3: Cached DB write E2E test
|
||||
File: `tests/ScadaLink.AuditLog.Tests/Integration/CachedWriteCombinedTelemetryTests.cs`. Mirror G2 for DB.
|
||||
|
||||
### Task G4: Idempotency test
|
||||
File: `tests/ScadaLink.AuditLog.Tests/Integration/CombinedTelemetryIdempotencyTests.cs`. Send the same packet twice; assert exactly 1 AuditLog row + 1 SiteCalls row.
|
||||
|
||||
---
|
||||
|
||||
## Final cross-bundle review + merge
|
||||
|
||||
Same template as M1/M2.
|
||||
@@ -0,0 +1,24 @@
|
||||
# Audit Log #23 — M4 Remaining Boundary Emission Implementation Plan
|
||||
|
||||
> **For Claude:** REQUIRED SUB-SKILL: Use superpowers-extended-cc:subagent-driven-development (bundled cadence).
|
||||
|
||||
**Goal:** Close every remaining script-trust-boundary emission gap: sync DB writes/reads via Database.Connection().Execute*/ExecuteReader, Notification Outbox central dispatcher attempts + terminal, site-side Notify.Send submission, and Inbound API middleware. Audit-write failure NEVER aborts the user-facing action across all five new code paths.
|
||||
|
||||
**Vocabulary (M3 reality-locked):**
|
||||
- `AuditKind.DbWrite` (Channel=DbOutbound) for both Execute and ExecuteReader; `Extra` carries `{"op":"write"|"read","rowsAffected":N|"rowsReturned":N}`.
|
||||
- `AuditKind.NotifyDeliver` for each Notification Outbox attempt; `AuditStatus.Attempted` on attempts, `AuditStatus.Delivered|Failed|Parked|Discarded` on terminal.
|
||||
- `AuditKind.NotifySend` for site-emit at Notify.Send; `AuditStatus.Submitted`.
|
||||
- `AuditKind.InboundRequest` for happy-path inbound; `AuditStatus.Delivered`. `AuditKind.InboundAuthFailure` for 401; `AuditStatus.Failed`.
|
||||
- `AuditStatus.Failed` replaces "PermanentFailure" / "TransientFailure" terminal wording throughout.
|
||||
|
||||
**Bundles:**
|
||||
- Bundle A — DB sync emissions (T1, T2)
|
||||
- Bundle B — NotificationOutbox central emissions (T3, T4, T5)
|
||||
- Bundle C — Site Notify.Send emission (T6)
|
||||
- Bundle D — Inbound API audit middleware (T7, T8)
|
||||
- Bundle E — Integration tests (T9, T10, T11, T12)
|
||||
- Final cross-bundle review + merge
|
||||
|
||||
Each task follows the M2 Bundle F / M3 Bundle E emission pattern: capture timing, build AuditEvent with provenance, write via try/catch that swallows + logs, never propagate audit failure to the user-facing action. Mirror M2's ScriptRuntimeContext wrapper pattern where the emission is script-context-aware.
|
||||
|
||||
Integration tests go in `tests/ScadaLink.AuditLog.Tests/Integration/` (component-level per M2 Bundle H + M3 Bundle G — the existing IntegrationTests factory disables Akka).
|
||||
@@ -0,0 +1,20 @@
|
||||
# Audit Log #23 — M5 Payload + Redaction Implementation Plan
|
||||
|
||||
> **For Claude:** REQUIRED SUB-SKILL: Use superpowers-extended-cc:subagent-driven-development (bundled cadence).
|
||||
|
||||
**Goal:** Filter pipeline (IAuditPayloadFilter) runs between event construction and writer call. Truncates payloads to 8 KB (64 KB for error rows); applies HTTP header redactors (default list from M1-T9 AuditLogOptions); applies body regex redactors (global + per-target); applies SQL parameter redactors (per-connection opt-in); over-redacts on regex error and increments AuditRedactionFailure metric. Hot-reloadable config via IOptionsMonitor.
|
||||
|
||||
**Vocabulary (M1 reality):** Error-row cap (64 KB) triggers when `Status NOT IN (AuditStatus.Delivered, AuditStatus.Submitted, AuditStatus.Forwarded)` — i.e., on `Failed/Parked/Discarded/Attempted/Skipped`. The roadmap's M5-T2 step references (Status=TransientFailure/PermanentFailure) are stale pre-M1 wording. Translation: `TransientFailure` = `Attempted` with HttpStatus 5xx OR `Failed`; `PermanentFailure` = `Failed`.
|
||||
|
||||
**M4 realities baked in:** AuditingDb decorators, NotificationOutboxActor, AuditWriteMiddleware, and the site emission paths all need the filter plugged in. Filter is invoked in:
|
||||
- FallbackAuditWriter.WriteAsync (site chain) — before SqliteAuditWriter.WriteAsync.
|
||||
- CentralAuditWriter.WriteAsync (central direct-write) — before IAuditLogRepository.InsertIfNotExistsAsync.
|
||||
- AuditLogIngestActor handlers — before InsertIfNotExistsAsync/UpsertAsync.
|
||||
|
||||
**Bundles:**
|
||||
- Bundle A — Filter contract + truncation (T1, T2).
|
||||
- Bundle B — Header + body + SQL param redaction (T3, T4, T5).
|
||||
- Bundle C — Wire into emission paths + health metric (T6, T7).
|
||||
- Bundle D — Configuration binding + perf + safety-net edge cases (T8, T9, T10).
|
||||
|
||||
Final cross-bundle review + merge.
|
||||
@@ -0,0 +1,19 @@
|
||||
# Audit Log #23 — M6 Reconciliation + Purge + Partition Maintenance + Health Metrics
|
||||
|
||||
> **For Claude:** subagent-driven-development with bundled cadence.
|
||||
|
||||
**Goal:** Self-healing telemetry (5-min reconciliation pull), monthly partition rollover, daily partition-switch purge with drop-and-rebuild around UX_AuditLog_EventId, all five health metrics live (SiteAuditBacklog, SiteAuditWriteFailures, SiteAuditTelemetryStalled, CentralAuditWriteFailures, AuditRedactionFailure).
|
||||
|
||||
**M5 realities baked in:** AuditRedactionFailure counter is site-only — M6-T9 surfaces it centrally. SwitchOutPartitionAsync ships as NotSupportedException stub from M1; M6-T4 replaces it with the DROP INDEX → SWITCH PARTITION → DROP staging → CREATE UNIQUE NONCLUSTERED INDEX dance. Partition function pre-seeded Jan 2026 – Dec 2027; M6-T5 SPLITs new boundaries forward.
|
||||
|
||||
**Bundles:**
|
||||
- Bundle A — Proto + site handler (T1, T2)
|
||||
- Bundle B — Reconciliation actor (T3)
|
||||
- Bundle C — Purge actor + drop-and-rebuild repository fix (T4)
|
||||
- Bundle D — Partition maintenance hosted service (T5)
|
||||
- Bundle E — Health metrics (T6, T7, T8, T9)
|
||||
- Bundle F — Integration tests (T10, T11, T12)
|
||||
|
||||
Final cross-bundle review + merge.
|
||||
|
||||
**Note**: M2 noted NoOpSiteStreamAuditClient stays in production until "M6 wires the real client". M6-T1+T2 add the PULL RPC; the actual production PUSH client (real implementation of ISiteStreamAuditClient.IngestAuditEventsAsync + IngestCachedTelemetryAsync) is the bigger lift. M6 will add the real client IF feasible within scope OR defer to a follow-up. Decision: try in Bundle A (alongside the proto extension); if scope blows up, the NoOp stays.
|
||||
@@ -0,0 +1,31 @@
|
||||
# Audit Log #23 — M7 Central UI Implementation Plan
|
||||
|
||||
> **For Claude:** subagent-driven-development with bundled cadence.
|
||||
|
||||
**Goal:** User-visible Audit Log page in the Central UI: filter bar, results grid with keyset paging, drilldown drawer with JSON pretty-print + cURL + redaction badges, drill-ins from Notifications/Site Calls/External Systems/Inbound API Keys/Sites/Instances, 3 KPI tiles on Health dashboard, server-side streaming CSV export, OperationalAudit+AuditExport permission gating, Playwright E2E.
|
||||
|
||||
**UI memory constraints (locked):**
|
||||
- Blazor Server + Bootstrap CSS only. NO third-party UI libraries (no Blazorise, MudBlazor, Radzen, Prism.js, Highlight.js, etc.).
|
||||
- Custom Blazor components for tables/grids/forms.
|
||||
- Clean corporate aesthetic.
|
||||
- Form layout: vertical stacking, read-only fields first, subsections stacked, buttons at bottom.
|
||||
- Use the frontend-design skill IF dispatched UI-design subagents need pattern guidance.
|
||||
|
||||
**M6 realities baked in:**
|
||||
- `IAuditCentralHealthSnapshot` aggregates CentralAuditWriteFailures + AuditRedactionFailure + per-site stalled. Health tiles read this.
|
||||
- `SiteHealthReport.SiteAuditBacklog` ready for per-site display.
|
||||
- `IAuditLogRepository.QueryAsync` keyset-paged; data source for the grid.
|
||||
- Pre-existing `Components/Pages/Monitoring/AuditLog.razor` (IAuditService config-change viewer) must be renamed → `Components/Pages/Audit/ConfigurationAuditLog.razor` with route `/audit/configuration`. Old route returns 404 (no redirect — internal tool, no external bookmarks).
|
||||
- Need to add `OperationalAudit` + `AuditExport` permission strings.
|
||||
|
||||
**SQL highlighting decision:** no third-party highlighter. Render `<pre><code>` block with `language-sql` class and let any future CSS theme it; semantic markup is preserved without JS dependency.
|
||||
|
||||
**Bundles:**
|
||||
- Bundle A — Page scaffold + nav + ConfigurationAuditLog rename (T1, T9)
|
||||
- Bundle B — Filter bar + results grid (T2, T3)
|
||||
- Bundle C — Drilldown drawer (T4, T5, T6, T7, T8)
|
||||
- Bundle D — Drill-ins from other pages (T10, T11, T12)
|
||||
- Bundle E — Health dashboard KPI tiles (T13)
|
||||
- Bundle F — CSV export (T14)
|
||||
- Bundle G — Permissions (T15)
|
||||
- Bundle H — Playwright E2E (T16)
|
||||
@@ -0,0 +1,21 @@
|
||||
# Audit Log #23 — M8 CLI Implementation Plan
|
||||
|
||||
> **For Claude:** subagent-driven-development with bundled cadence. FINAL milestone.
|
||||
|
||||
**Goal:** Operator CLI surface — `scadalink audit query | export | verify-chain` — plus the ManagementService HTTP endpoints they call, output formatters, and renaming the pre-existing `audit-log` config-change command to `audit-config` with a deprecation alias.
|
||||
|
||||
**M7 realities baked in:**
|
||||
- `OperationalAudit` + `AuditExport` are role-claim policies (M7 Bundle G). The Management endpoints reuse them.
|
||||
- `IAuditLogRepository.QueryAsync` (keyset paging) + `GetKpiSnapshotAsync` exist.
|
||||
- `AuditLogQueryFilter` is single-value per dimension — the CLI's `--channel` etc. flags collapse to single values like the UI chips do (documented limitation).
|
||||
- `verify-chain` is a v1 no-op stub (hash-chain deferred to v1.x per alog.md locked decisions). Do NOT implement hash chains.
|
||||
- ManagementService surface: confirm controllers vs minimal API by reading the project (M7 found CentralUI uses minimal API; ManagementService may differ).
|
||||
|
||||
**CLI conventions:** System.CommandLine; JSON default + `--format table` opt-in. The CLI connects via the HTTP Management API (per CLAUDE.md). Mirror `src/ScadaLink.CLI/Commands/AuditLogCommands.cs` for the System.CommandLine pattern.
|
||||
|
||||
**Bundles:**
|
||||
- Bundle A — CLI `audit` command group: scaffold + query + export + verify-chain (T1, T2, T3, T4).
|
||||
- Bundle B — ManagementService /api/audit/{query,export} endpoints (T5).
|
||||
- Bundle C — Output formatters + audit-config rename + README (T6, T7, T8).
|
||||
|
||||
Final cross-bundle review + merge + roadmap closeout.
|
||||
@@ -0,0 +1,787 @@
|
||||
# Centralized Audit Log Implementation Plan
|
||||
|
||||
> **For Claude:** REQUIRED SUB-SKILL: Use superpowers-extended-cc:executing-plans to implement this plan task-by-task.
|
||||
>
|
||||
> **Repo nature:** Design-documentation only. No code, no tests. Each task is a documentation change. "Verify" = re-read the diff + grep for stale cross-references. Commit after each task.
|
||||
|
||||
**Goal:** Document the new **#23 Audit Log** component and propagate its cross-references across every affected component design, the README, HighLevelReqs, and CLAUDE.md — exactly as specified in `alog.md` (committed `fec0bb1`).
|
||||
|
||||
**Architecture:** Layered, append-only `AuditLog` table at central, alongside existing `Notifications` (#21) and `SiteCalls` (#22) operational stores. Site SQLite writes on the hot path; gRPC telemetry forwards to central; site purge requires `ForwardState ∈ {Forwarded, Reconciled}`. Cached calls send a single telemetry packet that drives both the immutable `AuditLog` insert and the operational `SiteCalls` upsert. Central-originated events (Inbound API, Notification dispatch attempts) write directly. Monthly partitioning at central, 365-day default retention.
|
||||
|
||||
**Tech Stack:** Markdown only. No code in v1 of this plan.
|
||||
|
||||
**Spec:** `/Users/dohertj2/Desktop/scadalink-design/alog.md` (see commit `fec0bb1`). All task content below cites sections of that file.
|
||||
|
||||
---
|
||||
|
||||
## Task 0: Prepare branch
|
||||
|
||||
**Files:**
|
||||
- None — git operation only.
|
||||
|
||||
**Step 1: Confirm working tree state**
|
||||
|
||||
Run: `git status --short`
|
||||
Expected: three unstaged `infra/` modifications (unrelated; leave them alone), nothing else.
|
||||
|
||||
**Step 2: Create feature branch off `main`**
|
||||
|
||||
Run: `git switch -c feature/audit-log-docs main`
|
||||
Expected: switched to a new branch.
|
||||
|
||||
**Step 3: Verify branch**
|
||||
|
||||
Run: `git rev-parse --abbrev-ref HEAD`
|
||||
Expected: `feature/audit-log-docs`.
|
||||
|
||||
**No commit at this task — just branch prep.**
|
||||
|
||||
---
|
||||
|
||||
## Task 1: Author `Component-AuditLog.md`
|
||||
|
||||
**Files:**
|
||||
- Create: `docs/requirements/Component-AuditLog.md`
|
||||
|
||||
**Step 1: Read context**
|
||||
|
||||
Read `alog.md` §1–§16. Read the structural style of `docs/requirements/Component-SiteCallAudit.md` and `docs/requirements/Component-NotificationOutbox.md` — mirror their section ordering (Purpose / Location / Responsibilities / Tables / Lifecycle / Ingest & Idempotency / Reconciliation / Retention & Purge / KPIs / Configuration / Dependencies / Interactions).
|
||||
|
||||
**Step 2: Write the skeleton**
|
||||
|
||||
Create the file with these top-level headings (verbatim, in order):
|
||||
|
||||
```
|
||||
# Component: Audit Log
|
||||
|
||||
## Purpose
|
||||
## Location
|
||||
## Responsibilities
|
||||
## Scope — the script trust boundary
|
||||
## The `AuditLog` Table (central)
|
||||
## The Site-Local `AuditLog` (SQLite)
|
||||
## Ingestion Paths
|
||||
## Cached Operations — Combined Telemetry
|
||||
## Payload Capture Policy
|
||||
## Failure Handling & Idempotency
|
||||
## Retention & Purge
|
||||
## Security & Tamper-Evidence
|
||||
## KPIs
|
||||
## Configuration
|
||||
## Dependencies
|
||||
## Interactions
|
||||
```
|
||||
|
||||
**Step 3: Fill `Purpose`**
|
||||
|
||||
Two-paragraph version of `alog.md` §1. Lead sentence: "Provides a single, append-only, forensic + operational record of every integration action initiated by, or terminating in, a script — across outbound API, outbound DB, notifications, and inbound API." Second paragraph: not a dispatcher, observes Notification Outbox (#21) and Site Call Audit (#22), adds coverage where they are silent.
|
||||
|
||||
**Step 4: Fill `Location`**
|
||||
|
||||
Central cluster + site cluster. Central: `AuditLog` table in MS SQL plus three singleton actors on the active central node — `AuditLogIngestActor` (telemetry receiver), `SiteAuditReconciliationActor`, `AuditLogPurgeActor`. Sites: `AuditLog` SQLite database file alongside the S&F buffer plus `SiteAuditTelemetryActor` singleton on the active site node. Registered as component #23 in the Host role configuration.
|
||||
|
||||
**Step 5: Fill `Responsibilities`**
|
||||
|
||||
Bullet list mirroring `alog.md` §1–§3 commitments. Six bullets:
|
||||
- Accept site-local hot-path audit writes from script-trust-boundary call paths.
|
||||
- Forward site audit rows to central via gRPC telemetry with at-least-once + idempotency on `EventId`.
|
||||
- Run periodic reconciliation pulls per site to self-heal missed telemetry.
|
||||
- Accept central-originated audit writes (Inbound API, Notification dispatch attempts).
|
||||
- Compute point-in-time KPIs (global + per-site) from the central `AuditLog` table.
|
||||
- Purge expired rows by monthly partition switch.
|
||||
|
||||
**Step 6: Fill `Scope — the script trust boundary`**
|
||||
|
||||
Reproduce the table from `alog.md` §2 verbatim (the six rows). Add the "Out of scope" bullet list. Add the DB-reads note.
|
||||
|
||||
**Step 7: Fill `The AuditLog Table (central)`**
|
||||
|
||||
Reproduce the column table from `alog.md` §4. Then the index list. Then the `Kind`-per-channel table (with the inbound API simplification — only `Completed`).
|
||||
|
||||
**Step 8: Fill `The Site-Local AuditLog (SQLite)`**
|
||||
|
||||
State same schema as central minus `IngestedAtUtc`, plus `ForwardState` (`Pending | Forwarded | Reconciled`). Reproduce the **hard purge invariant** from `alog.md` §4 verbatim:
|
||||
|
||||
> A row is eligible for purge only when both `OccurredAtUtc < retention threshold` AND `ForwardState IN ('Forwarded', 'Reconciled')`. Pending rows are never purged.
|
||||
|
||||
Mention the `SiteAuditBacklog` health metric.
|
||||
|
||||
**Step 9: Fill `Ingestion Paths`**
|
||||
|
||||
Four subsections mirroring `alog.md` §6.1, §6.2, §6.3, §6.4. Keep concise — full pseudo-code lives in `alog.md`; the component doc captures the contract.
|
||||
|
||||
**Step 10: Fill `Cached Operations — Combined Telemetry`**
|
||||
|
||||
Capture `alog.md` §6.5 — site is source of truth, one telemetry packet carries both the audit row and the SiteCalls operational update; central ingest performs both writes in a single transaction.
|
||||
|
||||
**Step 11: Fill `Payload Capture Policy`**
|
||||
|
||||
Compress `alog.md` §8 into 8–12 lines: defaults (8 KB / 64 KB on error), header redaction, body-redactor regex hook, SQL captures values by default with per-connection opt-out, never-captured list (API keys, LDAP creds, secrets), safety-net over-redacts on misconfiguration.
|
||||
|
||||
**Step 12: Fill `Failure Handling & Idempotency`**
|
||||
|
||||
Compress `alog.md` §9: EventId is the PK and dedup key; never-fail-the-action principle; ring buffer for transient SQLite write failures; reconciliation as fallback when telemetry actor wedges; central-direct-write failure handling.
|
||||
|
||||
**Step 13: Fill `Retention & Purge`**
|
||||
|
||||
Compress `alog.md` §12: 365-day default central retention; monthly partition switch; no row-level deletes at central; site 7-day default; site purge respects `ForwardState`.
|
||||
|
||||
**Step 14: Fill `Security & Tamper-Evidence`**
|
||||
|
||||
Compress `alog.md` §11: dedicated `scadalink_audit_writer` (INSERT+SELECT) and `scadalink_audit_purger` (partition-switch only) DB roles; CI grep guard against `UPDATE`/`DELETE` of `AuditLog`; Audit + OperationalAudit + AuditExport permissions; hash-chain tamper evidence deferred to v1.x.
|
||||
|
||||
**Step 15: Fill `KPIs`**
|
||||
|
||||
List the five KPIs from `alog.md` §14: Volume, Error rate, Backlog, Top inbound callers, Top outbound 5xx. Note that Notification Outbox and Site Call Audit KPIs are unaffected.
|
||||
|
||||
**Step 16: Fill `Configuration`**
|
||||
|
||||
Show the `AuditLog` `appsettings.json` shape from `alog.md` §8.4. Include `DefaultCapBytes`, `ErrorCapBytes`, `HeaderRedactList`, `GlobalBodyRedactors`, `PerTargetOverrides`, and `RetentionDays` (global only in v1).
|
||||
|
||||
**Step 17: Fill `Dependencies`**
|
||||
|
||||
Cross-references to:
|
||||
- **Commons (#16)** — `AuditEvent`, `IAuditWriter`, `ICentralAuditWriter`, `AuditChannel`, `AuditKind`, `AuditStatus` types and interfaces.
|
||||
- **Configuration Database (#17)** — `AuditLog` table schema, partition function/scheme, DB roles, retention options.
|
||||
- **Cluster Infrastructure (#13)** — singleton placement and supervision (`AuditLogIngestActor`, `SiteAuditTelemetryActor`, `SiteAuditReconciliationActor`, `AuditLogPurgeActor`).
|
||||
- **Communication (#5)** — gRPC telemetry message types added to the existing site-stream proto additively.
|
||||
- **Site Runtime (#3)** — script trust boundary touchpoints invoke `IAuditWriter`.
|
||||
- **Host (#15)** — registers the new component under the central + site roles.
|
||||
|
||||
**Step 18: Fill `Interactions`**
|
||||
|
||||
Edges to:
|
||||
- **External System Gateway (#7)** — emits `ApiOutbound.SyncCall` rows; for `CachedCall` emits combined telemetry (audit + operational).
|
||||
- **Site Runtime (#3) / Database layer** — emits `DbOutbound.SyncWrite`, `DbOutbound.SyncRead`, and cached variants similarly.
|
||||
- **Inbound API (#14)** — emits `ApiInbound.Completed` rows from request middleware.
|
||||
- **Notification Outbox (#21)** — site-emitted `Notification.Enqueued` flows via audit telemetry; central dispatcher writes `Notification.Attempt` and `Notification.Terminal` rows directly via `ICentralAuditWriter`.
|
||||
- **Site Call Audit (#22)** — shares the cached-call telemetry packet; central ingest of that packet performs both `AuditLog` insert and `SiteCalls` upsert in one transaction.
|
||||
- **Central UI (#9)** — new Audit nav group + Audit Log page; drill-in links from Notifications, Site Calls, External Systems, Inbound API key, Sites, Instances detail pages.
|
||||
- **Health Monitoring (#11)** — three new tiles (Volume, Error rate, Backlog) plus new metrics (`SiteAuditBacklog`, `SiteAuditWriteFailures`, `SiteAuditTelemetryStalled`, `CentralAuditWriteFailures`, `AuditRedactionFailure`).
|
||||
- **CLI (#19)** — `scadalink audit query|export|verify-chain` commands.
|
||||
|
||||
**Step 19: Verify**
|
||||
|
||||
Run: `grep -n "Component-AuditLog.md\|#23" docs/requirements/Component-AuditLog.md`
|
||||
Expected: at least one match — the document identifies itself as component #23 (e.g. in the `Location` section).
|
||||
|
||||
Run: `wc -l docs/requirements/Component-AuditLog.md`
|
||||
Expected: ~250–400 lines (sanity check; not exact).
|
||||
|
||||
**Step 20: Commit**
|
||||
|
||||
```bash
|
||||
git add docs/requirements/Component-AuditLog.md
|
||||
git commit -m "docs(audit): add Component-AuditLog (#23) design document"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Task 2: Update `Component-Commons.md`
|
||||
|
||||
**Files:**
|
||||
- Modify: `docs/requirements/Component-Commons.md`
|
||||
|
||||
**Step 1: Read existing structure**
|
||||
|
||||
Read the file to find the right sections — likely "Types", "Interfaces", "Messages", "Entities". Note which subsections audit-related additions belong in.
|
||||
|
||||
**Step 2: Add to `Types/`**
|
||||
|
||||
Under the Types section, add:
|
||||
|
||||
- `AuditChannel` enum: `ApiOutbound | DbOutbound | Notification | ApiInbound`.
|
||||
- `AuditKind` enum: union of channel-specific values from `alog.md` §4 table.
|
||||
- `AuditStatus` enum: `Success | TransientFailure | PermanentFailure | Enqueued | Retrying | Delivered | Parked | Discarded`.
|
||||
- `AuditEvent` POCO record carrying every column from `alog.md` §4 (central schema), plus a `ForwardState` for site SQLite.
|
||||
|
||||
**Step 3: Add to `Interfaces/`**
|
||||
|
||||
- `IAuditWriter` — site-local hot-path interface: `Task WriteAsync(AuditEvent evt, CancellationToken ct)`. Implementation lives in Audit Log (#23) component.
|
||||
- `ICentralAuditWriter` — central direct-write interface: `Task WriteAsync(AuditEvent evt, CancellationToken ct)` with insert-if-not-exists semantics on `EventId`.
|
||||
|
||||
**Step 4: Add to `Messages/`**
|
||||
|
||||
- `AuditTelemetryEnvelope` — gRPC message wrapping a batch of `AuditEvent` rows for telemetry forwarding.
|
||||
- `CachedCallTelemetry` — the existing SiteCalls telemetry message, additively extended in place to also carry `AuditEvent` content alongside the operational `SiteCalls` upsert fields. Do NOT rename; per `Component-Commons.md` REQ-COM-5a, message renames are breaking changes. Extend the existing entry's description.
|
||||
|
||||
**Step 5: Verify**
|
||||
|
||||
Run: `grep -n "AuditEvent\|IAuditWriter\|AuditChannel" docs/requirements/Component-Commons.md`
|
||||
Expected: all three identifiers appear in the right sections (`AuditEvent` and `AuditChannel` under Types, `IAuditWriter` under Interfaces).
|
||||
|
||||
**Step 6: Commit**
|
||||
|
||||
```bash
|
||||
git add docs/requirements/Component-Commons.md
|
||||
git commit -m "docs(audit): register AuditEvent, IAuditWriter, AuditTelemetry types in Commons"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Task 3: Update `Component-ConfigurationDatabase.md`
|
||||
|
||||
**Files:**
|
||||
- Modify: `docs/requirements/Component-ConfigurationDatabase.md`
|
||||
|
||||
**Step 1: Read existing structure**
|
||||
|
||||
Find the "Tables" and "Roles" / "Permissions" / "Migrations" sections.
|
||||
|
||||
**Step 2: Add `AuditLog` table description**
|
||||
|
||||
Under Tables, add a new subsection mirroring how `Notifications` and `SiteCalls` are documented. Include:
|
||||
- Full column list from `alog.md` §4 (central table).
|
||||
- Index list from `alog.md` §4.
|
||||
- Monthly partitioning: partition function `pf_AuditLog_Month`, scheme `ps_AuditLog_Month`, filegroup-per-month rollover.
|
||||
- PK on `EventId` for idempotency.
|
||||
|
||||
**Step 3: Add `AuditLog` DB roles**
|
||||
|
||||
Under Roles/Permissions, add `scadalink_audit_writer` (INSERT+SELECT only) and `scadalink_audit_purger` (partition-switch only). Note the CI grep guard against `UPDATE … AuditLog` / `DELETE … AuditLog`.
|
||||
|
||||
**Step 4: Add `AuditLog` migration note**
|
||||
|
||||
Under Migrations, note that the initial migration creates the partition function/scheme and the table aligned to the scheme; partition-maintenance job is owned by the Audit Log component, not the Configuration DB.
|
||||
|
||||
**Step 5: Add retention config note**
|
||||
|
||||
Mention `AuditLog:RetentionDays` (global only in v1) as an Audit Log options key consumed by the purge actor.
|
||||
|
||||
**Step 6: Verify cross-reference**
|
||||
|
||||
Run: `grep -n "AuditLog\|Audit Log" docs/requirements/Component-ConfigurationDatabase.md`
|
||||
Expected: new table appears in the Tables section, roles in Roles section.
|
||||
|
||||
**Step 7: Commit**
|
||||
|
||||
```bash
|
||||
git add docs/requirements/Component-ConfigurationDatabase.md
|
||||
git commit -m "docs(audit): add AuditLog table, partitioning, and DB roles to Config DB"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Task 4: Update `Component-ClusterInfrastructure.md`
|
||||
|
||||
**Files:**
|
||||
- Modify: `docs/requirements/Component-ClusterInfrastructure.md`
|
||||
|
||||
**Step 1: Read singleton-placement section**
|
||||
|
||||
Find where Notification Outbox / Site Call Audit singletons are documented (active-central placement model).
|
||||
|
||||
**Step 2: Register central singletons**
|
||||
|
||||
Add to the central-singleton list:
|
||||
- `AuditLogIngestActor` — receives gRPC telemetry batches, performs insert-if-not-exists on `EventId`; for cached telemetry, performs both `AuditLog` insert and `SiteCalls` upsert in one transaction.
|
||||
- `SiteAuditReconciliationActor` — periodic per-site pull, default every 5 minutes.
|
||||
- `AuditLogPurgeActor` — daily partition-switch purge.
|
||||
|
||||
**Step 3: Register site singletons**
|
||||
|
||||
Add to the site-singleton list:
|
||||
- `SiteAuditTelemetryActor` — drains the local `AuditLog` SQLite's `Pending` rows to central in batches; short interval (5s) when busy, longer (30s) when idle.
|
||||
|
||||
**Step 4: Note dedicated dispatcher**
|
||||
|
||||
Add a one-liner: `SiteAuditTelemetryActor` runs on a dedicated dispatcher so it doesn't compete with the script blocking-I/O dispatcher (per `alog.md` §6.2).
|
||||
|
||||
**Step 5: Verify**
|
||||
|
||||
Run: `grep -n "AuditLogIngestActor\|SiteAuditTelemetryActor\|AuditLogPurgeActor\|SiteAuditReconciliationActor" docs/requirements/Component-ClusterInfrastructure.md`
|
||||
Expected: all four singletons listed.
|
||||
|
||||
**Step 6: Commit**
|
||||
|
||||
```bash
|
||||
git add docs/requirements/Component-ClusterInfrastructure.md
|
||||
git commit -m "docs(audit): register AuditLog singletons in Cluster Infrastructure"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Task 5: Update `Component-SiteRuntime.md`
|
||||
|
||||
**Files:**
|
||||
- Modify: `docs/requirements/Component-SiteRuntime.md`
|
||||
|
||||
**Step 1: Find script-trust-boundary section**
|
||||
|
||||
Locate the section listing what scripts can/cannot do and how their boundary-crossing calls are mediated.
|
||||
|
||||
**Step 2: Note audit hook**
|
||||
|
||||
Add: "Every script-trust-boundary call (External System Gateway, Database layer, Notify) emits an `AuditEvent` to `IAuditWriter` (site-local SQLite append). Hot path; never fails the calling action; failures logged via the `SiteAuditWriteFailures` health metric (see Health Monitoring #11)."
|
||||
|
||||
**Step 3: Note site SQLite footprint**
|
||||
|
||||
Find the section discussing site storage (SQLite for deployed configs, S&F buffer, event log, operation tracking). Add the `AuditLog` SQLite database file as a peer, noting its 7-day purge that respects the `ForwardState` invariant; cross-reference to Component-AuditLog.md.
|
||||
|
||||
**Step 4: Verify**
|
||||
|
||||
Run: `grep -n "IAuditWriter\|AuditLog\|Audit Log" docs/requirements/Component-SiteRuntime.md`
|
||||
Expected: hook documented, SQLite file mentioned.
|
||||
|
||||
**Step 5: Commit**
|
||||
|
||||
```bash
|
||||
git add docs/requirements/Component-SiteRuntime.md
|
||||
git commit -m "docs(audit): note IAuditWriter hook and site SQLite in Site Runtime"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Task 6: Update `Component-ExternalSystemGateway.md`
|
||||
|
||||
**Files:**
|
||||
- Modify: `docs/requirements/Component-ExternalSystemGateway.md`
|
||||
|
||||
**Step 1: Find Call/CachedCall sections**
|
||||
|
||||
Locate the dual-call-modes documentation.
|
||||
|
||||
**Step 2: Note audit emission on sync calls**
|
||||
|
||||
Under `ExternalSystem.Call`, add: "Emits an `ApiOutbound.SyncCall` row to `IAuditWriter` at call completion (success or failure). Payload captured per the Audit Log policy (#23 §Payload Capture Policy). Audit-write failure never aborts the script."
|
||||
|
||||
**Step 3: Note audit emission on cached calls**
|
||||
|
||||
Under `ExternalSystem.CachedCall`, add: "Each lifecycle transition (`CachedEnqueued`, `CachedAttempt`, `CachedTerminal`) emits an audit row via the combined cached-operation telemetry packet — one packet carries both the audit row and the SiteCalls upsert (see Audit Log #23 §Cached Operations and Site Call Audit #22)."
|
||||
|
||||
**Step 4: Note audit emission on DB writes**
|
||||
|
||||
Under `Database.Connection()` (synchronous), add: "Script-initiated `Execute`/`ExecuteScalar` calls emit `DbOutbound.SyncWrite` rows; `ExecuteReader` emits `DbOutbound.SyncRead`. SQL parameter values are captured by default; per-connection redaction opt-in via the Audit Log configuration (#23 §Payload Capture Policy; `alog.md` §8.2)."
|
||||
|
||||
**Step 5: Note audit emission on cached DB writes**
|
||||
|
||||
Under `Database.CachedWrite`, add: same combined-telemetry pattern as cached external calls.
|
||||
|
||||
**Step 6: Verify**
|
||||
|
||||
Run: `grep -n "AuditLog\|Audit Log\|ApiOutbound\|DbOutbound\|IAuditWriter" docs/requirements/Component-ExternalSystemGateway.md`
|
||||
Expected: hooks documented in all four call-mode subsections.
|
||||
|
||||
**Step 7: Commit**
|
||||
|
||||
```bash
|
||||
git add docs/requirements/Component-ExternalSystemGateway.md
|
||||
git commit -m "docs(audit): emit AuditLog rows from External System Gateway call paths"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Task 7: Update `Component-SiteCallAudit.md`
|
||||
|
||||
**Files:**
|
||||
- Modify: `docs/requirements/Component-SiteCallAudit.md`
|
||||
|
||||
**Step 1: Find Ingest & Idempotency section**
|
||||
|
||||
Locate the "Ingest & Idempotency" section (around line 69 in current file).
|
||||
|
||||
**Step 2: Note combined telemetry**
|
||||
|
||||
Add a new paragraph: "From v1.x onward, the cached-operation telemetry packet additively carries the `AuditEvent` content alongside the existing operational fields. Central's `AuditLogIngestActor` (Audit Log #23) performs both the immutable `AuditLog` insert and the `SiteCalls` upsert in a single transaction. Idempotency keys remain `EventId` (for AuditLog) and `TrackedOperationId` (for SiteCalls)."
|
||||
|
||||
**Step 3: Cross-reference Audit Log**
|
||||
|
||||
Find the Dependencies / Interactions sections (typically near the end). Add an edge to **Audit Log (#23)** noting the shared telemetry packet and dual-write ingest.
|
||||
|
||||
**Step 4: Verify**
|
||||
|
||||
Run: `grep -n "Audit Log\|AuditLog\|AuditEvent\|#23" docs/requirements/Component-SiteCallAudit.md`
|
||||
Expected: combined-telemetry paragraph + Dependencies edge present.
|
||||
|
||||
**Step 5: Commit**
|
||||
|
||||
```bash
|
||||
git add docs/requirements/Component-SiteCallAudit.md
|
||||
git commit -m "docs(audit): note shared cached-operation telemetry with Audit Log"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Task 8: Update `Component-NotificationOutbox.md`
|
||||
|
||||
**Files:**
|
||||
- Modify: `docs/requirements/Component-NotificationOutbox.md`
|
||||
|
||||
**Step 1: Find dispatcher section**
|
||||
|
||||
Locate the section describing the central dispatcher's delivery attempt loop.
|
||||
|
||||
**Step 2: Note central direct-write of attempt/terminal**
|
||||
|
||||
Add: "Each delivery attempt writes a `Notification.Attempt` row to the `AuditLog` via `ICentralAuditWriter`; transition to a terminal status (`Delivered` / `Parked` / `Discarded`) writes a `Notification.Terminal` row. Audit writes are direct (no telemetry — the dispatcher runs at central). The site-emitted `Notification.Enqueued` row arrives via the standard audit telemetry channel."
|
||||
|
||||
**Step 3: Cross-reference Audit Log**
|
||||
|
||||
Add to Dependencies / Interactions: edge to **Audit Log (#23)** noting central direct-write of dispatch lifecycle events.
|
||||
|
||||
**Step 4: Note status independence**
|
||||
|
||||
Add a clarifying sentence: "The operational `Notifications` table remains the source of truth for the dispatcher and for Retry/Discard actions; the `AuditLog` rows are immutable shadows."
|
||||
|
||||
**Step 5: Verify**
|
||||
|
||||
Run: `grep -n "Audit Log\|ICentralAuditWriter\|Notification.Attempt\|#23" docs/requirements/Component-NotificationOutbox.md`
|
||||
Expected: dispatcher hook + Dependencies edge present.
|
||||
|
||||
**Step 6: Commit**
|
||||
|
||||
```bash
|
||||
git add docs/requirements/Component-NotificationOutbox.md
|
||||
git commit -m "docs(audit): central direct-write of notification dispatch events to AuditLog"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Task 9: Update `Component-InboundAPI.md`
|
||||
|
||||
**Files:**
|
||||
- Modify: `docs/requirements/Component-InboundAPI.md`
|
||||
|
||||
**Step 1: Find request-completion / logging section**
|
||||
|
||||
Locate the section describing how requests are processed and what gets logged today (today: failures only, per the brainstorm exploration).
|
||||
|
||||
**Step 2: Replace failures-only stance**
|
||||
|
||||
Edit the "failures-only logging" claim so it now reads: "Every request (success or failure) emits one `ApiInbound.Completed` row to `ICentralAuditWriter` from request middleware before the HTTP response is flushed. The row captures the API key *name* (never the key material), remote IP, user-agent, response status, duration, and truncated request/response bodies per the Audit Log capture policy (#23 §Payload Capture Policy)."
|
||||
|
||||
**Step 3: Cross-reference Audit Log**
|
||||
|
||||
Add Dependencies edge to **Audit Log (#23)**.
|
||||
|
||||
**Step 4: Note non-blocking semantics**
|
||||
|
||||
Add: "Middleware audit-write failures are logged and metricked (see Health Monitoring #11) but never affect the HTTP response."
|
||||
|
||||
**Step 5: Verify**
|
||||
|
||||
Run: `grep -n "Audit Log\|ApiInbound\|ICentralAuditWriter\|#23" docs/requirements/Component-InboundAPI.md`
|
||||
Expected: middleware hook + Dependencies edge present.
|
||||
|
||||
**Step 6: Commit**
|
||||
|
||||
```bash
|
||||
git add docs/requirements/Component-InboundAPI.md
|
||||
git commit -m "docs(audit): emit ApiInbound.Completed audit row per request"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Task 10: Update `Component-CentralUI.md`
|
||||
|
||||
**Files:**
|
||||
- Modify: `docs/requirements/Component-CentralUI.md`
|
||||
|
||||
**Step 1: Find navigation / page list**
|
||||
|
||||
Locate the section enumerating top-level nav groups and pages.
|
||||
|
||||
**Step 2: Add Audit nav group**
|
||||
|
||||
Add a new top-level group **Audit** with one page in v1:
|
||||
- **Audit Log** — global query/filter/drilldown over the central `AuditLog` table.
|
||||
|
||||
Document the filter bar and results grid columns from `alog.md` §10.1.
|
||||
|
||||
**Step 3: Add drill-in links**
|
||||
|
||||
In the existing Notifications, Site Calls, External Systems, Inbound API Keys, Sites, and Instances detail-page documentation, add a "View audit history" / "Recent activity" / "Audit feed" entry that opens the Audit Log page pre-filtered (per `alog.md` §10.2).
|
||||
|
||||
**Step 4: Add Health dashboard tiles**
|
||||
|
||||
In the Health dashboard documentation, add three tiles under a new "Audit" KPI group: Audit volume, Audit error rate, Audit backlog (per `alog.md` §10.3 / §14).
|
||||
|
||||
**Step 5: Note UI rules already covered**
|
||||
|
||||
No new framework choices — sticks to Blazor Server + Bootstrap + custom components per the existing project rules (per memory note `feedback_central_ui.md`).
|
||||
|
||||
**Step 6: Verify**
|
||||
|
||||
Run: `grep -n "Audit Log\|Audit nav\|Audit feed\|Audit volume\|#23" docs/requirements/Component-CentralUI.md`
|
||||
Expected: nav group, page, drill-ins, tiles all documented.
|
||||
|
||||
**Step 7: Commit**
|
||||
|
||||
```bash
|
||||
git add docs/requirements/Component-CentralUI.md
|
||||
git commit -m "docs(audit): add Audit nav group, Audit Log page, drill-ins, and KPI tiles to Central UI"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Task 11: Update `Component-HealthMonitoring.md`
|
||||
|
||||
**Files:**
|
||||
- Modify: `docs/requirements/Component-HealthMonitoring.md`
|
||||
|
||||
**Step 1: Find metrics list**
|
||||
|
||||
Locate where existing site + central metrics are enumerated.
|
||||
|
||||
**Step 2: Add new site metrics**
|
||||
|
||||
- `SiteAuditBacklog` — count of `Pending` rows in the site-local `AuditLog`, reported together with the oldest-pending age and on-disk bytes. Threshold drives a Health dashboard warning on the affected site tile.
|
||||
- `SiteAuditWriteFailures` — count of failed hot-path appends since last report.
|
||||
- `SiteAuditTelemetryStalled` — boolean flag set when reconciliation reports a non-draining backlog over two cycles.
|
||||
|
||||
**Step 3: Add new central metrics**
|
||||
|
||||
- `CentralAuditWriteFailures` — central direct-write failures (Inbound API middleware, Notification Outbox dispatcher).
|
||||
- `AuditRedactionFailure` — payload redactor errors (over-redacted, safety-net hit).
|
||||
|
||||
**Step 4: Add new tiles**
|
||||
|
||||
Three new dashboard tiles under an "Audit" group: Audit volume, Audit error rate, Audit backlog.
|
||||
|
||||
**Step 5: Cross-reference Audit Log**
|
||||
|
||||
Dependencies edge to **Audit Log (#23)**.
|
||||
|
||||
**Step 6: Verify**
|
||||
|
||||
Run: `grep -n "SiteAuditBacklog\|SiteAuditWriteFailures\|SiteAuditTelemetryStalled\|CentralAuditWriteFailures\|AuditRedactionFailure\|Audit volume\|Audit error rate\|Audit backlog" docs/requirements/Component-HealthMonitoring.md`
|
||||
Expected: all five metrics + three tiles listed.
|
||||
|
||||
**Step 7: Commit**
|
||||
|
||||
```bash
|
||||
git add docs/requirements/Component-HealthMonitoring.md
|
||||
git commit -m "docs(audit): add Audit Log health metrics and dashboard tiles"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Task 12: Update `Component-CLI.md`
|
||||
|
||||
**Files:**
|
||||
- Modify: `docs/requirements/Component-CLI.md`
|
||||
|
||||
**Step 1: Find command-group list**
|
||||
|
||||
Locate the section enumerating top-level CLI command groups.
|
||||
|
||||
**Step 2: Add `scadalink audit` group**
|
||||
|
||||
Three subcommands per `alog.md` §15.1:
|
||||
- `audit query --site <s> --since <t> --kind <k> [...]` — UI-equivalent filter set.
|
||||
- `audit export --since <t> --until <t> --format csv|jsonl|parquet --output <path>` — server-side streaming export.
|
||||
- `audit verify-chain --month <YYYY-MM>` — hash-chain verification (no-op in v1; available once §11.4 ships).
|
||||
|
||||
Note: requires `OperationalAudit` + `AuditExport` permissions (Security & Auth #10).
|
||||
|
||||
**Step 3: Cross-reference Audit Log and Management Service**
|
||||
|
||||
Dependencies edges to **Audit Log (#23)** and **Management Service (#18)** (the CLI hits central via the existing HTTP Management API).
|
||||
|
||||
**Step 4: Verify**
|
||||
|
||||
Run: `grep -n "scadalink audit\|audit query\|audit export\|audit verify-chain\|#23" docs/requirements/Component-CLI.md`
|
||||
Expected: command group documented with all three subcommands.
|
||||
|
||||
**Step 5: Commit**
|
||||
|
||||
```bash
|
||||
git add docs/requirements/Component-CLI.md
|
||||
git commit -m "docs(audit): add scadalink audit command group to CLI"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Task 13: Update `README.md`
|
||||
|
||||
**Files:**
|
||||
- Modify: `README.md`
|
||||
|
||||
**Step 1: Find component table**
|
||||
|
||||
Locate the markdown table containing rows #1–#22 (currently around lines 36–58).
|
||||
|
||||
**Step 2: Add row #23**
|
||||
|
||||
Append a row after `Site Call Audit`:
|
||||
|
||||
```
|
||||
| 23 | Audit Log | [docs/requirements/Component-AuditLog.md](docs/requirements/Component-AuditLog.md) | New central append-only AuditLog spanning every script-trust-boundary action (outbound API sync+cached, outbound DB sync+cached, notifications, inbound API). Site-local SQLite hot-path append + gRPC telemetry + central reconciliation; combined telemetry packet with Site Call Audit; central direct-write for Notification Outbox dispatch + Inbound API middleware; monthly partitioning, 365-day default retention. |
|
||||
```
|
||||
|
||||
**Step 3: Update architecture diagram (logical)**
|
||||
|
||||
In the architecture diagram, add an `AuditLog` box to the central cluster's observability group (parallel to Notification Outbox and Site Call Audit). Add a thin arrow from each affected component into it.
|
||||
|
||||
**Step 4: Verify**
|
||||
|
||||
Run: `grep -n "Audit Log\|Component-AuditLog.md\|| 23 |" README.md`
|
||||
Expected: new row + diagram entry present.
|
||||
|
||||
**Step 5: Commit**
|
||||
|
||||
```bash
|
||||
git add README.md
|
||||
git commit -m "docs(audit): register Audit Log (#23) in the README component table"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Task 14: Update `docs/requirements/HighLevelReqs.md`
|
||||
|
||||
**Files:**
|
||||
- Modify: `docs/requirements/HighLevelReqs.md`
|
||||
|
||||
**Step 1: Find functional-area sections**
|
||||
|
||||
Locate the section that currently contains requirements for Notification Outbox and Site Call Audit (likely under "Observability" or "Audit & Reporting").
|
||||
|
||||
**Step 2: Add Audit Log requirements section**
|
||||
|
||||
Add a new subsection **"Centralized Audit Log"** with numbered requirements covering:
|
||||
- AL-1: Append-only central record of every script-trust-boundary action.
|
||||
- AL-2: One row per lifecycle event for cached calls and notifications.
|
||||
- AL-3: Site-local hot-path append; gRPC telemetry to central; idempotent on `EventId`.
|
||||
- AL-4: Reconciliation pull self-heals missed telemetry.
|
||||
- AL-5: Payload metadata + truncated bodies (8 KB default, 64 KB on errors).
|
||||
- AL-6: Headers redacted by default; SQL parameter values captured by default; per-target redaction opt-in.
|
||||
- AL-7: Audit-write failure never aborts the user-facing action.
|
||||
- AL-8: 365-day default central retention; monthly partition switch purge.
|
||||
- AL-9: Site SQLite purge requires `ForwardState ∈ {Forwarded, Reconciled}`; central outage cannot cause audit loss at sites.
|
||||
- AL-10: Central UI Audit Log page with cross-channel filter and drill-ins from existing operational pages.
|
||||
- AL-11: Append-only enforced via DB roles; tamper-evidence hash chain deferred to v1.x.
|
||||
- AL-12: CLI `scadalink audit` command group.
|
||||
|
||||
**Step 3: Cross-reference Audit Log component**
|
||||
|
||||
Add a "See Component-AuditLog.md (#23)" pointer at the top of the subsection.
|
||||
|
||||
**Step 4: Verify**
|
||||
|
||||
Run: `grep -n "AL-1\|AL-12\|Centralized Audit Log\|Component-AuditLog.md" docs/requirements/HighLevelReqs.md`
|
||||
Expected: section header and all twelve requirements present.
|
||||
|
||||
**Step 5: Commit**
|
||||
|
||||
```bash
|
||||
git add docs/requirements/HighLevelReqs.md
|
||||
git commit -m "docs(audit): add Centralized Audit Log requirements (AL-1..AL-12) to HighLevelReqs"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Task 15: Update `CLAUDE.md`
|
||||
|
||||
**Files:**
|
||||
- Modify: `CLAUDE.md`
|
||||
|
||||
**Step 1: Update Current Component List**
|
||||
|
||||
Change the heading from `## Current Component List (22 components)` to `## Current Component List (23 components)`. Append a new line at the end of the numbered list:
|
||||
|
||||
```
|
||||
23. Audit Log — Central append-only AuditLog table spanning every script-trust-boundary action (outbound API sync+cached, outbound DB sync+cached, notifications, inbound API). Site SQLite hot-path + gRPC telemetry + reconciliation; combined telemetry with Site Call Audit; central direct-write for Notification Outbox dispatch + Inbound API; monthly partitioning, 365-day retention.
|
||||
```
|
||||
|
||||
**Step 2: Add Key Design Decisions block**
|
||||
|
||||
In the **Key Design Decisions** section, add a new subsection **`### Centralized Audit Log`** with bulleted decisions mirroring `alog.md` §1–§15 highlights:
|
||||
|
||||
- Layered design — append-only AuditLog alongside operational Notifications (#21) and SiteCalls (#22), not replacing them.
|
||||
- Scope = script trust boundary; framework traffic explicitly excluded.
|
||||
- One row per lifecycle event; cached calls produce 4+ rows per operation.
|
||||
- Site SQLite hot-path first; gRPC telemetry to central; idempotent on `EventId`; reconciliation pull as fallback.
|
||||
- Cached operations: site emits, one telemetry packet carries audit + operational state; central writes both in one transaction.
|
||||
- Payload cap 8 KB default / 64 KB on errors; headers redacted by default; SQL parameter values captured by default; per-target redaction opt-in.
|
||||
- Audit-write failure never aborts the user-facing action.
|
||||
- 365-day central retention with monthly partition-switch purge; 7-day site SQLite with hard `ForwardState` invariant.
|
||||
- Append-only enforced via DB roles; hash-chain tamper evidence and Parquet archival deferred to v1.x.
|
||||
- New top-level **Audit** nav group + Audit Log page + drill-ins from Notifications / Site Calls / External Systems / Inbound API Keys / Sites / Instances.
|
||||
|
||||
**Step 3: Verify**
|
||||
|
||||
Run: `grep -n "Centralized Audit Log\|Audit Log\|23 components\|23\\. Audit Log" CLAUDE.md`
|
||||
Expected: count updated, list extended, Key Design Decisions block present.
|
||||
|
||||
**Step 4: Commit**
|
||||
|
||||
```bash
|
||||
git add CLAUDE.md
|
||||
git commit -m "docs(audit): register Audit Log (#23) in CLAUDE.md component list and key decisions"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Task 16: Final cross-reference verification
|
||||
|
||||
**Files:**
|
||||
- None — verification only.
|
||||
|
||||
**Step 1: Grep for stale references**
|
||||
|
||||
Run: `grep -rn "22 components\|Currently 22\|22\\. Site Call Audit\\s*$" docs/ README.md CLAUDE.md`
|
||||
Expected: no hits — all updated to 23.
|
||||
|
||||
**Step 2: Grep for orphan references**
|
||||
|
||||
Run: `grep -rn "Component-AuditLog.md" docs/ README.md CLAUDE.md`
|
||||
Expected: hits in README, CLAUDE.md, and each affected component doc. Confirm the file exists at the referenced path.
|
||||
|
||||
**Step 3: Verify all eleven affected component docs cross-reference Audit Log**
|
||||
|
||||
Run: `for f in docs/requirements/Component-{ExternalSystemGateway,InboundAPI,NotificationOutbox,SiteCallAudit,SiteRuntime,Commons,CentralUI,ConfigurationDatabase,ClusterInfrastructure,HealthMonitoring,CLI}.md; do echo "--- $f"; grep -c "Audit Log\|AuditLog\|#23" "$f"; done`
|
||||
Expected: each file shows count ≥ 1.
|
||||
|
||||
**Step 4: Verify alog.md still matches the design canonically**
|
||||
|
||||
Run: `git diff fec0bb1 -- alog.md`
|
||||
Expected: no diff — alog.md is unchanged from the validated commit.
|
||||
|
||||
**Step 5: Skim the new file once more end-to-end**
|
||||
|
||||
Read: `docs/requirements/Component-AuditLog.md`. Verify section ordering, completeness, no contradictions with `alog.md`.
|
||||
|
||||
**Step 6: Review the commit graph**
|
||||
|
||||
Run: `git log --oneline feature/audit-log-docs ^main`
|
||||
Expected: 15 commits — one per Task 1–15 (Tasks 1–12 component docs, Task 13 README, Task 14 HighLevelReqs, Task 15 CLAUDE.md). Task 16 adds a commit only if fix-ups are needed.
|
||||
|
||||
**Step 7: Final commit (only if any fix-ups needed)**
|
||||
|
||||
If grep finds any issue, fix it and commit with `docs(audit): cross-reference cleanup`. Otherwise no commit at this task.
|
||||
|
||||
---
|
||||
|
||||
## Task 17: Merge to main (optional, on user request only)
|
||||
|
||||
**Files:**
|
||||
- None — git operation only.
|
||||
|
||||
**Step 1: Confirm with user**
|
||||
|
||||
Per CLAUDE.md and harness policy, do not push or merge to main without explicit user instruction. This task documents the option but does not execute automatically.
|
||||
|
||||
**Step 2: If user requests merge**
|
||||
|
||||
```bash
|
||||
git switch main
|
||||
git merge --no-ff feature/audit-log-docs -m "Merge feature/audit-log-docs: centralized audit log design"
|
||||
```
|
||||
|
||||
**Step 3: If user requests push**
|
||||
|
||||
```bash
|
||||
git push origin main
|
||||
```
|
||||
|
||||
(or push the feature branch instead — operator's call).
|
||||
|
||||
---
|
||||
|
||||
## Execution Notes
|
||||
|
||||
- **Tasks 2–14 are mostly independent of each other** once Task 1 is done. Suitable for parallel execution via the **subagent-driven-development** sub-skill — one fresh subagent per task, review between commits.
|
||||
- **Tasks 15 and 16** must run last (Task 15 is the CLAUDE.md rollup; Task 16 is verification).
|
||||
- **Task 0** must run first (branch prep).
|
||||
- Total: 18 tasks (Task 0–Task 17), ~15 commits, ~250–400 lines of new prose in `Component-AuditLog.md` plus smaller per-component additions.
|
||||
- Spec is `alog.md` (commit `fec0bb1`); every task cites the relevant section.
|
||||
@@ -0,0 +1,26 @@
|
||||
{
|
||||
"planPath": "docs/plans/2026-05-20-centralized-audit-log.md",
|
||||
"spec": "alog.md (commit fec0bb1)",
|
||||
"repoNature": "design-documentation-only",
|
||||
"tasks": [
|
||||
{"id": 0, "subject": "Task 0: Prepare branch", "status": "pending", "blockedBy": []},
|
||||
{"id": 1, "subject": "Task 1: Author Component-AuditLog.md", "status": "pending", "blockedBy": [0]},
|
||||
{"id": 2, "subject": "Task 2: Update Component-Commons.md", "status": "pending", "blockedBy": [1]},
|
||||
{"id": 3, "subject": "Task 3: Update Component-ConfigurationDatabase.md", "status": "pending", "blockedBy": [1]},
|
||||
{"id": 4, "subject": "Task 4: Update Component-ClusterInfrastructure.md", "status": "pending", "blockedBy": [1]},
|
||||
{"id": 5, "subject": "Task 5: Update Component-SiteRuntime.md", "status": "pending", "blockedBy": [1]},
|
||||
{"id": 6, "subject": "Task 6: Update Component-ExternalSystemGateway.md", "status": "pending", "blockedBy": [1]},
|
||||
{"id": 7, "subject": "Task 7: Update Component-SiteCallAudit.md", "status": "pending", "blockedBy": [1]},
|
||||
{"id": 8, "subject": "Task 8: Update Component-NotificationOutbox.md", "status": "pending", "blockedBy": [1]},
|
||||
{"id": 9, "subject": "Task 9: Update Component-InboundAPI.md", "status": "pending", "blockedBy": [1]},
|
||||
{"id": 10, "subject": "Task 10: Update Component-CentralUI.md", "status": "pending", "blockedBy": [1]},
|
||||
{"id": 11, "subject": "Task 11: Update Component-HealthMonitoring.md", "status": "pending", "blockedBy": [1]},
|
||||
{"id": 12, "subject": "Task 12: Update Component-CLI.md", "status": "pending", "blockedBy": [1]},
|
||||
{"id": 13, "subject": "Task 13: Update README.md", "status": "pending", "blockedBy": [1]},
|
||||
{"id": 14, "subject": "Task 14: Update HighLevelReqs.md", "status": "pending", "blockedBy": [1]},
|
||||
{"id": 15, "subject": "Task 15: Update CLAUDE.md", "status": "pending", "blockedBy": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]},
|
||||
{"id": 16, "subject": "Task 16: Final cross-reference verification", "status": "pending", "blockedBy": [15]},
|
||||
{"id": 17, "subject": "Task 17: Merge to main (user-gated)", "status": "pending", "blockedBy": [16]}
|
||||
],
|
||||
"lastUpdated": "2026-05-20T00:00:00Z"
|
||||
}
|
||||
@@ -0,0 +1,115 @@
|
||||
# Audit Log — ExecutionId Universal Correlation (Design)
|
||||
|
||||
**Date:** 2026-05-21
|
||||
**Status:** Validated — ready for implementation planning.
|
||||
|
||||
## Problem
|
||||
|
||||
The audit `CorrelationId` column is overloaded with four incompatible meanings —
|
||||
`TrackedOperationId` for cached calls, `NotificationId` for notifications, the
|
||||
script-execution id for sync calls (added 2026-05-21), and request-local ids for
|
||||
inbound. (Before that interim change it was `NULL` for sync one-shot calls.) There is no single value that ties
|
||||
together *everything one script run (or inbound request) did*: a run that makes a
|
||||
sync API call, a cached call and a notification produces three unrelated
|
||||
correlation ids, and nothing links the cached call's lifecycle rows back to the
|
||||
run that launched them.
|
||||
|
||||
A single `CorrelationId` column cannot serve both scopes — the **operation
|
||||
lifecycle** (a cached call's `Submit→Attempted→Resolve`; a notification's
|
||||
`Send→Deliver`, which the Site Calls / Notifications "View audit history"
|
||||
drill-ins depend on) and the **execution trace** (all operations of one run).
|
||||
|
||||
## Decision
|
||||
|
||||
Add a dedicated, nullable **`ExecutionId`** column to the audit row. It identifies
|
||||
the originating **script execution** or **inbound API request**. Every audit row
|
||||
that execution produces carries the same `ExecutionId`. `CorrelationId` is left
|
||||
exactly as it is — it keeps the per-operation lifecycle meaning, so the existing
|
||||
operation drill-ins are unaffected.
|
||||
|
||||
Result: `WHERE ExecutionId = X` returns every audit row of one run — sync
|
||||
`ApiCall`/`DbWrite`, the whole cached-call lifecycle, `NotifySend`,
|
||||
`NotifyDeliver`, and the inbound row — across both the site and central tables.
|
||||
|
||||
`ScriptRuntimeContext` already holds a per-execution id (`_auditCorrelationId`,
|
||||
added 2026-05-21). That id becomes the `ExecutionId`; this work stamps it into the
|
||||
new column from every emitter and threads it to the two paths where the script
|
||||
context is not in scope.
|
||||
|
||||
### Considered and rejected
|
||||
|
||||
- **Overload `CorrelationId`** with the execution id everywhere — breaks the
|
||||
cached-call / notification "View audit history" drill-ins (they filter
|
||||
`CorrelationId` by `TrackedOperationId` / `NotificationId`), or forces them to
|
||||
show the whole run instead of the one operation.
|
||||
- **Stash the execution id in `Extra` JSON** — no schema change, but `Extra` is
|
||||
unindexed; filtering an audit table of this volume by it is unworkable.
|
||||
|
||||
## Schema changes (all additive, nullable — no backfill; pre-existing rows stay `NULL`)
|
||||
|
||||
| Where | Change |
|
||||
|---|---|
|
||||
| `ScadaLink.Commons` | `AuditEvent` record (and the site-local variant) gains `Guid? ExecutionId`. |
|
||||
| Central MS SQL `AuditLog` | new `ExecutionId uniqueidentifier NULL` column + index `IX_AuditLog_Execution (ExecutionId)`. EF migration — additive nullable column is a metadata-only `ALTER`, fast even on the monthly-partitioned table. |
|
||||
| Site SQLite `auditlog.db` `AuditLog` | new `ExecutionId TEXT NULL` column (`SqliteAuditWriter` schema + `MapRow`). |
|
||||
| gRPC `AuditEventDto` (`sitestream.proto`) | additive `execution_id` field; `AuditEventDtoMapper` maps it both directions. |
|
||||
| Central MS SQL `Notifications` | new `OriginExecutionId uniqueidentifier NULL` column — carries the originating run's id so the dispatcher can echo it onto `NotifyDeliver` audit rows. EF migration. |
|
||||
|
||||
`SiteCalls` needs no new column — the cached telemetry packet already carries the
|
||||
audit half, which now has `ExecutionId` directly.
|
||||
|
||||
## Emitter coverage — every audit row carries `ExecutionId`
|
||||
|
||||
| Emitter | `ExecutionId` source |
|
||||
|---|---|
|
||||
| Sync `ApiCall`, sync `DbWrite` | `ScriptRuntimeContext` execution id (in scope today) |
|
||||
| Cached call script-side rows (`CachedSubmit`, immediate `Attempted`/`CachedResolve`) | `ScriptRuntimeContext` execution id |
|
||||
| Cached call **S&F retry-loop** rows (`CachedCallLifecycleBridge`) | threaded through the store-and-forward buffered message → `CachedCallAttemptContext` → the bridge. This same threading also fixes the pre-existing `SourceScript = NULL` gap on those rows (identical boundary). |
|
||||
| `NotifySend` (site, script-side) | `ScriptRuntimeContext` execution id |
|
||||
| `NotifyDeliver` (central dispatch) | `Notifications.OriginExecutionId` — the id rides on `NotificationSubmit`, is persisted on the `Notifications` row, and the dispatcher stamps it on every `NotifyDeliver` row |
|
||||
| Inbound `InboundRequest` / `InboundAuthFailure` | request id minted once in `AuditWriteMiddleware` |
|
||||
|
||||
## Data flow
|
||||
|
||||
- **Site script run** — `ScriptRuntimeContext` generates the execution id (or is
|
||||
given one); every emitter it owns stamps `ExecutionId`.
|
||||
- **Buffered cached call** — the execution id rides on the S&F buffered message;
|
||||
the retry loop reconstructs it into `CachedCallAttemptContext`;
|
||||
`CachedCallLifecycleBridge` stamps it on the retry-loop audit rows.
|
||||
- **Notification** — the `NotifySend` row stamps it site-side; the id travels on
|
||||
`NotificationSubmit`, is stored as `Notifications.OriginExecutionId`, and the
|
||||
dispatcher stamps every `NotifyDeliver` row it emits.
|
||||
- **Inbound API request** — `AuditWriteMiddleware` mints a request id and stamps
|
||||
the inbound audit row.
|
||||
|
||||
## UI / CLI surface
|
||||
|
||||
- **Central UI Audit Log page** — `ExecutionId` added as a results-grid column
|
||||
(the grid already supports resize/reorder); an `ExecutionId` paste-filter in
|
||||
the filter bar; the page accepts `?executionId=<guid>`; a row drill-in
|
||||
"View this execution" → `/audit/log?executionId=<guid>`.
|
||||
- **CLI** — `scadalink audit query --execution-id <guid>`.
|
||||
- **ManagementService** — `/api/audit/query` and the export endpoint accept an
|
||||
`executionId` filter parameter.
|
||||
|
||||
## Compatibility
|
||||
|
||||
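- Three additive nullable columns (central `AuditLog.ExecutionId`, site SQLite `AuditLog.ExecutionId`, `Notifications.OriginExecutionId`); additive proto field; additive message-contract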
|
||||
fields — all version-compatible. No data backfill; historical rows keep
|
||||
`ExecutionId = NULL`.
|
||||
- `CorrelationId` semantics unchanged — every existing drill-in keeps working.
|
||||
|
||||
## Testing
|
||||
|
||||
- Repository: query-by-`ExecutionId`; migration smoke test.
|
||||
- Emitter unit tests: each emitter stamps `ExecutionId`; the cached-call lifecycle
|
||||
rows from one run share it; `NotifyDeliver` echoes `Notifications.OriginExecutionId`.
|
||||
- Integration: a script run that does a sync call + a cached call + a notification
|
||||
→ all resulting audit rows share one `ExecutionId` end-to-end.
|
||||
- Central UI: bUnit (grid column, filter, drill-in) + Playwright.
|
||||
|
||||
## Out of scope
|
||||
|
||||
- Bridging the inbound request id into the routed site script's execution
|
||||
(cross-cluster threading) — a separate future change.
|
||||
- Backfilling `ExecutionId` on historical audit rows.
|
||||
@@ -0,0 +1,155 @@
|
||||
# Audit Log ExecutionId — Implementation Plan
|
||||
|
||||
> **For Claude:** REQUIRED SUB-SKILL: Use superpowers-extended-cc:subagent-driven-development to execute this plan task-by-task (fresh implementer per task + spec review + code-quality review).
|
||||
|
||||
**Goal:** Add a dedicated `ExecutionId` column to the Audit Log — one universal correlation value, stamped on every audit row, identifying the originating script execution or inbound request.
|
||||
|
||||
**Architecture:** Additive nullable `ExecutionId` (`Guid`) on the audit row (Commons `AuditEvent`, central MS SQL `AuditLog`, site SQLite `auditlog.db`, gRPC `AuditEventDto`). Every emitter stamps it; the `ScriptRuntimeContext` per-execution id is the source for site script runs, threaded through the S&F buffer for retry-loop cached rows and through `NotificationSubmit` → `Notifications.OriginExecutionId` for central `NotifyDeliver` rows. `CorrelationId` is left as the per-operation lifecycle id (and reverts to `null` for sync one-shot calls). Validated design: `docs/plans/2026-05-21-audit-executionid-design.md`.
|
||||
|
||||
**Tech Stack:** .NET 10, EF Core 10 (MS SQL + SQLite), Akka.NET, gRPC, Blazor Server + Bootstrap, System.CommandLine, xUnit + Akka.TestKit.Xunit2 + bUnit + NSubstitute/Moq, Playwright.
|
||||
|
||||
**Ground rules (every task):** branch is `feature/audit-executionid` (already created) — never commit to `main`. Edit in place; never touch `infra/*`; `docker/*` only if a task says so (none do). Stage with explicit `git add <path>` — never `git add .` / `commit -am`. TDD; full solution stays green (`dotnet build ScadaLink.slnx` 0 warnings — `TreatWarningsAsErrors` is on). Additive contract evolution. Do not push.
|
||||
|
||||
---
|
||||
|
||||
## Task 0: Prep — verify branch + baseline
|
||||
|
||||
**Files:** none.
|
||||
|
||||
**Steps:** confirm `git branch --show-current` is `feature/audit-executionid`; `dotnet build ScadaLink.slnx` succeeds.
|
||||
|
||||
**Acceptance:** on the branch, solution builds clean.
|
||||
|
||||
---
|
||||
|
||||
## Task 1: Foundation — `AuditEvent.ExecutionId`, central `AuditLog` column, repository query
|
||||
|
||||
**Files:**
|
||||
- Modify: `src/ScadaLink.Commons/Entities/Audit/AuditEvent.cs` — add `Guid? ExecutionId`.
|
||||
- Modify: `src/ScadaLink.Commons/Types/Audit/AuditLogQueryFilter.cs` — add `Guid? ExecutionId` filter dimension (single-value, like `CorrelationId`).
|
||||
- Modify: `src/ScadaLink.ConfigurationDatabase/Configurations/AuditLogEntityTypeConfiguration.cs` — map the column; add index `IX_AuditLog_Execution (ExecutionId)`.
|
||||
- Create: a new EF migration under `src/ScadaLink.ConfigurationDatabase/Migrations/` — `AddAuditLogExecutionId` — `ExecutionId uniqueidentifier NULL` + the index. Additive nullable column (metadata-only ALTER, safe on the monthly-partitioned table). Mirror the existing `AddNotificationsTable` migration style.
|
||||
- Modify: `src/ScadaLink.ConfigurationDatabase/Repositories/AuditLogRepository.cs` — `QueryAsync` translates `filter.ExecutionId` to `e.ExecutionId == value` (mirror the `CorrelationId` clause). Keyset paging untouched.
|
||||
- Test: `tests/ScadaLink.ConfigurationDatabase.Tests/Repositories/AuditLogRepositoryTests.cs` — `QueryAsync_FilterByExecutionId`; migration smoke if the suite has that pattern.
|
||||
|
||||
**Approach:** purely additive. `ExecutionId` is `Guid?` everywhere. Generate the migration with `dotnet ef migrations add` against the ConfigurationDatabase project (or hand-write mirroring an existing one — match how the repo does migrations).
|
||||
|
||||
**Commit:** `feat(auditlog): ExecutionId column on AuditEvent + central AuditLog`
|
||||
|
||||
---
|
||||
|
||||
## Task 2: Foundation — site SQLite + gRPC DTO
|
||||
|
||||
**Files:**
|
||||
- Modify: `src/ScadaLink.AuditLog/Site/SqliteAuditWriter.cs` — add `ExecutionId TEXT NULL` to the `auditlog.db` `AuditLog` table DDL; the insert command binds it; `MapRow` reads it back. (Site SQLite is created fresh by the writer — an additive column in the `CREATE TABLE` is enough; if the writer has any migration/ALTER path, extend it.)
|
||||
- Modify: `src/ScadaLink.Communication/Protos/sitestream.proto` — add `string execution_id` to `AuditEventDto` (next free field number; additive). Rebuild regenerates the C# stubs.
|
||||
- Modify: `src/ScadaLink.Communication/Grpc/AuditEventDtoMapper.cs` — `ToDto`/`FromDto` map `ExecutionId` ↔ `execution_id` (Guid ↔ string; empty string ↔ null, mirroring the existing `CorrelationId` handling).
|
||||
- Test: `tests/ScadaLink.AuditLog.Tests/Site/SqliteAuditWriterSchemaTests.cs` (column present + round-trips); `tests/ScadaLink.Communication.Tests/AuditEventDtoMapperTests.cs` (ExecutionId round-trip incl. null).
|
||||
|
||||
**Commit:** `feat(auditlog): ExecutionId on site SQLite schema + gRPC AuditEventDto`
|
||||
|
||||
---
|
||||
|
||||
## Task 3: Site script-side emitters stamp `ExecutionId`
|
||||
|
||||
**What:** Every audit row a `ScriptRuntimeContext` emits gets `ExecutionId` = the context's per-execution id. Revert the interim "execution id in `CorrelationId` for sync rows" change so `CorrelationId` is purely per-operation again.
|
||||
|
||||
**Files:**
|
||||
- Modify: `src/ScadaLink.SiteRuntime/Scripts/ScriptRuntimeContext.cs`:
|
||||
- Rename the field `_auditCorrelationId` → `_executionId` (and the ctor param `auditCorrelationId` → `executionId`) for clarity; update XML docs. Thread it to the helpers as today.
|
||||
- Sync `ApiCall` (`BuildCallAuditEvent`): set `ExecutionId = _executionId`; set `CorrelationId = null` (revert — sync one-shot calls have no operation lifecycle).
|
||||
- Cached script-side rows (`CachedSubmit`, immediate `ApiCallCached`/`CachedResolve`): set `ExecutionId = _executionId`; `CorrelationId` stays `trackedId.Value`.
|
||||
- `NotifySend` (`Notify.Send` emission): set `ExecutionId = _executionId`; `CorrelationId` stays the `NotificationId`.
|
||||
- Modify: `src/ScadaLink.SiteRuntime/Scripts/AuditingDbConnection.cs` + `AuditingDbCommand.cs` — thread `_executionId` (rename from the audit-correlation param); sync `DbWrite` event sets `ExecutionId = _executionId` and `CorrelationId = null`. Cached DB write rows: `ExecutionId` set, `CorrelationId` stays `trackedId`.
|
||||
- Test: extend `tests/ScadaLink.SiteRuntime.Tests/Scripts/ExternalSystemCallAuditEmissionTests.cs`, `DatabaseSyncEmissionTests.cs`, `ExternalSystemCachedCallEmissionTests.cs`, `DatabaseCachedWriteEmissionTests.cs`, `NotifySendAuditEmissionTests.cs`, and `ExecutionCorrelationContextTests.cs` — assert `ExecutionId` is the context's id on every row; assert sync rows now have `CorrelationId == null`; assert cached/notification rows keep their `CorrelationId`.
|
||||
|
||||
**Commit:** `feat(auditlog): site script-side emitters stamp ExecutionId`
|
||||
|
||||
---
|
||||
|
||||
## Task 4: Cached S&F retry-loop rows carry `ExecutionId`
|
||||
|
||||
**What:** Thread the execution id through the store-and-forward buffer so the retry-loop cached audit rows (`CachedCallLifecycleBridge`) carry `ExecutionId`. This same threading fixes the pre-existing `SourceScript = null` gap on those rows (identical boundary).
|
||||
|
||||
**Files:**
|
||||
- Modify: the S&F buffered cached-call message / `StoreAndForwardMessage` (or the cached-call payload) in `src/ScadaLink.StoreAndForward/` — carry the originating execution id (and source script) alongside the call.
|
||||
- Modify: `CachedCallAttemptContext` (find it — `src/ScadaLink.AuditLog/Site/Telemetry/` or StoreAndForward) — add an `ExecutionId` (and `SourceScript`) field.
|
||||
- Modify: `src/ScadaLink.AuditLog/Site/Telemetry/CachedCallLifecycleBridge.cs` `BuildPacket` — set `ExecutionId` from the context (and `SourceScript`, replacing the `SourceScript = null` line).
|
||||
- Modify the enqueue path (`ExternalSystem.CachedCall` / `Database.CachedWrite` in `ScriptRuntimeContext`) so the execution id is written into the buffered message.
|
||||
- Test: `tests/ScadaLink.AuditLog.Tests/` cached-telemetry tests + `tests/ScadaLink.StoreAndForward.Tests/` — retry-loop rows carry the originating `ExecutionId`.
|
||||
|
||||
**Note for implementer:** this is the deepest task — the threading touches StoreAndForward. If the buffered message can't cleanly carry the id, STOP and report before guessing.
|
||||
|
||||
**Commit:** `feat(auditlog): thread ExecutionId through S&F for retry-loop cached rows`
|
||||
|
||||
---
|
||||
|
||||
## Task 5: Central `NotifyDeliver` rows carry `ExecutionId`
|
||||
|
||||
**Files:**
|
||||
- Modify: `src/ScadaLink.Commons/Entities/Notifications/Notification.cs` — add `Guid? OriginExecutionId`.
|
||||
- Modify: `src/ScadaLink.Commons/Messages/Notification/` — `NotificationSubmit` carries `Guid? OriginExecutionId` (additive).
|
||||
- Modify: `src/ScadaLink.ConfigurationDatabase/` — EF config + a new migration `AddNotificationOriginExecutionId` (`Notifications.OriginExecutionId uniqueidentifier NULL`).
|
||||
- Modify: the site `NotifySend` forward path — the execution id (already on the `NotifySend` audit row from Task 3) also rides on the `NotificationSubmit` (set it where the submit is built — `ScriptRuntimeContext` `Notify.Send` / the S&F notification forwarder).
|
||||
- Modify: `src/ScadaLink.NotificationOutbox/NotificationOutboxActor.cs` — persist `OriginExecutionId` on insert; `BuildNotifyDeliverEvent` sets `ExecutionId = notification.OriginExecutionId`.
|
||||
- Test: `tests/ScadaLink.NotificationOutbox.Tests/` — `NotifyDeliver` rows echo `OriginExecutionId`; `tests/ScadaLink.Commons.Tests/` contract shape.
|
||||
|
||||
**Commit:** `feat(auditlog): NotifyDeliver rows carry the originating ExecutionId`
|
||||
|
||||
---
|
||||
|
||||
## Task 6: Inbound rows carry `ExecutionId`
|
||||
|
||||
**Files:**
|
||||
- Modify: `src/ScadaLink.InboundAPI/Middleware/AuditWriteMiddleware.cs` — `EmitInboundAudit` sets `ExecutionId` to the request id. The middleware already mints a `Guid.NewGuid()` for the inbound `CorrelationId` (per the 2026-05-21 change); reuse that one id for `ExecutionId`. To keep `CorrelationId` purely per-operation (aligning with the Task 3 decision), treat the inbound row as a one-shot from the audit row's perspective: `CorrelationId = null`, `ExecutionId = <request id>`.
|
||||
- Test: `tests/ScadaLink.InboundAPI.Tests/Middleware/AuditWriteMiddlewareTests.cs` — inbound row carries a non-null `ExecutionId`; distinct per request.
|
||||
|
||||
**Commit:** `feat(auditlog): inbound audit rows carry ExecutionId`
|
||||
|
||||
---
|
||||
|
||||
## Task 7: Central UI — ExecutionId column, filter, drill-in
|
||||
|
||||
**Files:**
|
||||
- Modify: `src/ScadaLink.CentralUI/Components/Audit/AuditResultsGrid.razor` (+ `.razor.cs`) — add `ExecutionId` to the column set (the grid already supports resize/reorder + a `ColumnOrder`); render it (short form / monospace).
|
||||
- Modify: `src/ScadaLink.CentralUI/Components/Audit/AuditFilterBar.razor` (+ `.razor.cs`) + `AuditQueryModel.cs` — an `ExecutionId` paste text-filter; `ToFilter` maps it to `AuditLogQueryFilter.ExecutionId`.
|
||||
- Modify: `src/ScadaLink.CentralUI/Components/Pages/Audit/AuditLogPage.razor.cs` — `ApplyQueryStringFilters` accepts `?executionId=<guid>`; `BuildExportUrl` emits it.
|
||||
- Add a "View this execution" drill-in — a row/drilldown action linking `/audit/log?executionId=<guid>`. Mirror the existing `?correlationId=` drill-in.
|
||||
- Test: `tests/ScadaLink.CentralUI.Tests/` bUnit (column renders, filter maps, query-param parsed); `tests/ScadaLink.CentralUI.PlaywrightTests/Audit/` (drill-in filters the grid).
|
||||
|
||||
Use the `frontend-design` skill for the column/filter styling.
|
||||
|
||||
**Commit:** `feat(centralui): ExecutionId column, filter and drill-in on the Audit Log page`
|
||||
|
||||
---
|
||||
|
||||
## Task 8: CLI + ManagementService — ExecutionId filter
|
||||
|
||||
**Files:**
|
||||
- Modify: `src/ScadaLink.CLI/Commands/AuditCommands.cs` + `AuditQueryHelpers.cs` — `audit query --execution-id <guid>`; `AuditQueryArgs` + `BuildQueryString` emit `executionId`.
|
||||
- Modify: `src/ScadaLink.ManagementService/AuditEndpoints.cs` `ParseFilter` — parse `executionId` query param into `AuditLogQueryFilter.ExecutionId` (lax-parse — unparseable dropped).
|
||||
- Modify: `src/ScadaLink.CentralUI/Audit/AuditExportEndpoints.cs` `ParseFilter` — same.
|
||||
- Test: `tests/ScadaLink.CLI.Tests/`, `tests/ScadaLink.ManagementService.Tests/AuditEndpointsTests.cs`.
|
||||
|
||||
**Commit:** `feat(audit): ExecutionId filter in the CLI and ManagementService`
|
||||
|
||||
---
|
||||
|
||||
## Task 9: End-to-end integration test + docs
|
||||
|
||||
**Files:**
|
||||
- Create: `tests/ScadaLink.IntegrationTests/AuditLog/ExecutionIdCorrelationTests.cs` — boot a site+central pair; run a script that does a sync `ExternalSystem.Call`, a cached call, and a `Notify.Send`; assert every resulting audit row (site + central) shares one `ExecutionId`.
|
||||
- Modify: `docs/requirements/Component-AuditLog.md` — add `ExecutionId` to the schema table and a sentence on its meaning vs `CorrelationId`. (Do NOT modify `alog.md` — it is the locked v1 spec.)
|
||||
- Modify: `CLAUDE.md` — one line under the Centralized Audit Log decisions noting `ExecutionId` as the universal per-run correlation value.
|
||||
|
||||
**Commit:** `test(auditlog): end-to-end ExecutionId correlation + docs`
|
||||
|
||||
---
|
||||
|
||||
## Final review
|
||||
|
||||
Dispatch a final cross-cutting review of the whole branch; full `dotnet build` + `dotnet test ScadaLink.slnx`; hand back to the user for the push/merge/redeploy decision (do not push).
|
||||
|
||||
## Dependency summary
|
||||
|
||||
0 blocks all. 2 blockedBy 1. 3 blockedBy 2. 4 blockedBy 3. 5 blockedBy 2. 6 blockedBy 2. 7 blockedBy 1. 8 blockedBy 1. 9 blockedBy 3,4,5,6,7,8. Execution order: 0 → 1 → 2 → 3 → 4 → 5 → 6 → 7 → 8 → 9 → final review.
|
||||
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"planPath": "docs/plans/2026-05-21-audit-executionid.md",
|
||||
"tasks": [
|
||||
{"id": 50, "subject": "Task 0: Prep — verify branch + baseline", "status": "pending"},
|
||||
{"id": 51, "subject": "Task 1: Foundation — AuditEvent.ExecutionId + central AuditLog column + repo query", "status": "pending", "blockedBy": [50]},
|
||||
{"id": 52, "subject": "Task 2: Foundation — site SQLite + gRPC DTO", "status": "pending", "blockedBy": [51]},
|
||||
{"id": 53, "subject": "Task 3: Site script-side emitters stamp ExecutionId", "status": "pending", "blockedBy": [52]},
|
||||
{"id": 54, "subject": "Task 4: Cached S&F retry-loop rows carry ExecutionId", "status": "pending", "blockedBy": [53]},
|
||||
{"id": 55, "subject": "Task 5: Central NotifyDeliver rows carry ExecutionId", "status": "pending", "blockedBy": [52]},
|
||||
{"id": 56, "subject": "Task 6: Inbound audit rows carry ExecutionId", "status": "pending", "blockedBy": [52]},
|
||||
{"id": 57, "subject": "Task 7: Central UI — ExecutionId column, filter, drill-in", "status": "pending", "blockedBy": [51]},
|
||||
{"id": 58, "subject": "Task 8: CLI + ManagementService — ExecutionId filter", "status": "pending", "blockedBy": [51]},
|
||||
{"id": 59, "subject": "Task 9: End-to-end integration test + docs", "status": "pending", "blockedBy": [53, 54, 55, 56, 57, 58]}
|
||||
],
|
||||
"lastUpdated": "2026-05-21T00:00:00Z"
|
||||
}
|
||||
@@ -0,0 +1,249 @@
|
||||
# Audit Log #23 — Deferred Follow-ups Implementation Plan
|
||||
|
||||
> **For Claude:** REQUIRED SUB-SKILL: Use superpowers-extended-cc:subagent-driven-development to implement this plan task-by-task (bundled cadence — one implementer + one review pass per task).
|
||||
|
||||
**Goal:** Close the five deferred implementation follow-ups from the Audit Log #23 roadmap so site audit events actually reach central, the audit/SiteCall surfaces are complete, and known tech debt is paid down.
|
||||
|
||||
**Architecture:** Five independent-ish workstreams against the existing ScadaLink codebase. The headline change: site→central audit forwarding moves from the production `NoOpSiteStreamAuditClient` stub to a real **ClusterClient-based push** — the same transport notifications already use (`SiteCommunicationActor` → `ClusterClient.Send("/user/central-communication", …)` → `CentralCommunicationActor`), avoiding a new central-hosted gRPC server. The remaining four follow-ups are scoped tech-debt / UI / contract changes.
|
||||
|
||||
**Tech Stack:** .NET 10, Akka.NET (ClusterClient, ClusterClientReceptionist, cluster singletons, TestKit), EF Core 10 (MS SQL + SQLite providers), Blazor Server + Bootstrap CSS (no third-party UI libs), System.CommandLine, xUnit + Akka.TestKit.Xunit2 + bUnit + NSubstitute, Playwright.
|
||||
|
||||
**Spec sources:** `alog.md`, `docs/requirements/Component-AuditLog.md`, `docs/requirements/Component-SiteCallAudit.md`, `docs/plans/2026-05-20-audit-log-code-roadmap.md` (header lines 14–19 enumerate the deferred items).
|
||||
|
||||
**Ground rules (carry into every task):**
|
||||
- Branch off `main` before any code change; never commit on `main`.
|
||||
- Edit in place. Never touch `infra/*`. The `docker/*` cluster config is touched only if a task explicitly says so (none here do).
|
||||
- Stage with explicit `git add <path>` — never `git add .`, never `git commit -am`.
|
||||
- TDD: failing test → minimal code → green → commit. Full solution stays green (`dotnet build ScadaLink.slnx`, `dotnet test ScadaLink.slnx`).
|
||||
- Additive message-contract evolution where possible; where a contract shape must change (Task 8), update every call site in the same task.
|
||||
- Do not push to origin — the user authorizes pushes separately.
|
||||
|
||||
---
|
||||
|
||||
## Task 0: Prep — feature branch
|
||||
|
||||
**Files:** none (git only).
|
||||
|
||||
**Step 1:** From a clean `main`, create the working branch:
|
||||
```bash
|
||||
git checkout main && git status --porcelain # expect clean
|
||||
git checkout -b feature/audit-log-followups
|
||||
```
|
||||
|
||||
**Step 2:** Confirm baseline green:
|
||||
```bash
|
||||
dotnet build ScadaLink.slnx
|
||||
```
|
||||
Expected: build succeeds. (A full `dotnet test` baseline is optional but recommended.)
|
||||
|
||||
**Acceptance:** on branch `feature/audit-log-followups`, solution builds.
|
||||
|
||||
---
|
||||
|
||||
## Task 1: Audit push — central ingest routing over ClusterClient
|
||||
|
||||
**What:** Make the receptionist-registered `CentralCommunicationActor` accept `IngestAuditEventsCommand` (and `IngestCachedTelemetryCommand`) from a site ClusterClient, forward to the `AuditLogIngestActor` cluster-singleton proxy, and pipe the reply back. Mirror the existing `NotificationSubmit` / `RegisterNotificationOutbox` pattern exactly.
|
||||
|
||||
**Files:**
|
||||
- Modify: `src/ScadaLink.Communication/Actors/CentralCommunicationActor.cs` — add `Receive<IngestAuditEventsCommand>` + `Receive<IngestCachedTelemetryCommand>` handlers; add a `RegisterAuditIngest` registration message handler holding the `AuditLogIngestActor` proxy `IActorRef` (mirror `RegisterNotificationOutbox` at line ~120 / `HandleNotificationSubmit` at line ~130).
|
||||
- Create: `src/ScadaLink.Commons/Messages/Audit/RegisterAuditIngest.cs` — `public sealed record RegisterAuditIngest(IActorRef AuditIngestActor);` (mirror `RegisterNotificationOutbox`).
|
||||
- Modify: `src/ScadaLink.Host/Actors/AkkaHostedService.cs` — after the central `AuditLogIngestActor` singleton + proxy are created (~lines 355–379), `Tell` the `RegisterAuditIngest` to the `CentralCommunicationActor` (mirror how the Notification Outbox proxy is registered).
|
||||
- Test: `tests/ScadaLink.Communication.Tests/Actors/CentralCommunicationActorAuditTests.cs` (new).
|
||||
|
||||
**Approach:**
|
||||
- Handler `Ask`s the registered audit-ingest proxy and `PipeTo`s the `IngestAuditEventsReply` back to the original `Sender` (the ClusterClient round-trips it to the site). Use the existing audit-ingest Ask-timeout convention (30s — see `SiteStreamGrpcServer` `AuditIngestAskTimeout`); add a bound option if no constant is reachable.
|
||||
- If no audit-ingest proxy is registered yet (startup race), reply with an empty `IngestAuditEventsReply([])` — the site keeps the rows `Pending` and retries, exactly as the gRPC handler does today.
|
||||
- `IngestCachedTelemetryCommand` is routed the same way (its reply type is the same `IngestAuditEventsReply` per `AuditLogIngestActor`).
|
||||
|
||||
**Tests (TestKit + NSubstitute):**
|
||||
1. `IngestAuditEventsCommand` with an audit-ingest probe registered → probe receives the command, actor replies the probe's `IngestAuditEventsReply` to the sender.
|
||||
2. `IngestAuditEventsCommand` with no audit-ingest registered → sender gets `IngestAuditEventsReply` with empty `AcceptedEventIds`.
|
||||
3. `IngestCachedTelemetryCommand` routes to the same proxy.
|
||||
|
||||
**Steps:** write failing tests → run (fail) → implement record + handlers + Host registration → run (pass) → `dotnet build ScadaLink.slnx` → commit.
|
||||
|
||||
**Commit:** `feat(communication): route audit ingest commands through CentralCommunicationActor`
|
||||
|
||||
---
|
||||
|
||||
## Task 2: Audit push — real site client, Host wiring, integration test
|
||||
|
||||
**What:** Replace `NoOpSiteStreamAuditClient` (production binding) with a real `ISiteStreamAuditClient` that pushes over ClusterClient via the site's `SiteCommunicationActor`. After this task the site `auditlog.db` `Pending` backlog drains to central.
|
||||
|
||||
**Files:**
|
||||
- Create: `src/ScadaLink.AuditLog/Site/Telemetry/ClusterClientSiteAuditClient.cs` — implements `ISiteStreamAuditClient`; ctor takes the `SiteCommunicationActor` `IActorRef` + an Ask timeout.
|
||||
- Modify: `src/ScadaLink.Communication/Actors/SiteCommunicationActor.cs` — ensure `IngestAuditEventsCommand` / `IngestCachedTelemetryCommand` are forwarded over `ClusterClient.Send("/user/central-communication", …)` with the reply routed back to the Ask (mirror the `NotificationSubmit` forward at lines ~190/214/224).
|
||||
- Modify: `src/ScadaLink.Host/Actors/AkkaHostedService.cs` — in the site telemetry wiring (~lines 648–681), construct `ClusterClientSiteAuditClient` with the `SiteCommunicationActor` ref and pass it to `SiteAuditTelemetryActor` instead of the DI-resolved `NoOpSiteStreamAuditClient`.
|
||||
- Modify: `src/ScadaLink.AuditLog/ServiceCollectionExtensions.cs` (lines ~124–129) — keep `NoOpSiteStreamAuditClient` as the DI default (it remains correct for central/test composition roots that have no `SiteCommunicationActor`); update the stale comment that says "M6's reconciliation work brings the real implementation".
|
||||
- Test: `tests/ScadaLink.AuditLog.Tests/Site/Telemetry/ClusterClientSiteAuditClientTests.cs` (new); extend `tests/ScadaLink.IntegrationTests/AuditLog/` with a ClusterClient-push end-to-end test.
|
||||
|
||||
**Approach:**
|
||||
- `IngestAuditEventsAsync(AuditEventBatch, ct)` maps the batch to `IngestAuditEventsCommand(IReadOnlyList<AuditEvent>)`, `Ask`s the `SiteCommunicationActor` for `IngestAuditEventsReply`, and maps the reply's `AcceptedEventIds` back into the `IngestAck` the `SiteAuditTelemetryActor` expects.
|
||||
- An Ask timeout / failure must **throw** — `SiteAuditTelemetryActor`'s drain loop already treats a thrown exception as transient (rows stay `Pending`, retried next tick). Keep that contract.
|
||||
- `IngestCachedTelemetryAsync` does the same with `IngestCachedTelemetryCommand`. (`CachedCallTelemetryForwarder` already resolves `ISiteStreamAuditClient` — no change there.)
|
||||
- `AuditEvent` already crosses the wire as the `NotificationSubmit` records do; confirm the Akka serializer handles `IReadOnlyList<AuditEvent>` (notification messages prove the pattern).
|
||||
|
||||
**Tests:**
|
||||
1. `IngestAuditEventsAsync` → batch becomes one `IngestAuditEventsCommand`; mocked actor reply's accepted ids map onto `IngestAck`.
|
||||
2. Partial ack (3 of 5 ids) → `IngestAck` lists only the 3.
|
||||
3. Ask timeout → method throws (drain loop keeps rows `Pending`).
|
||||
4. Integration: boot a site+central pair via the IntegrationTests harness, write an audit event on the site hot-path, assert a central `AuditLog` row appears within ~10s and the site row flips to `Forwarded`.
|
||||
|
||||
**Commit:** `feat(auditlog): real ClusterClient-based site audit push client`
|
||||
|
||||
---
|
||||
|
||||
## Task 3: Consolidate the duplicated audit DTO mappers
|
||||
|
||||
**What:** Collapse the 4 near-duplicate `AuditEvent`↔`AuditEventDto` mapping copies into one canonical mapper. The project-reference cycle (`AuditLog → Communication`, never the reverse) is resolved by hosting the canonical mapper **in `ScadaLink.Communication`** — it owns the generated `AuditEventDto` and references `Commons` for `AuditEvent`, and `AuditLog` already references `Communication`.
|
||||
|
||||
**Files:**
|
||||
- Create: `src/ScadaLink.Communication/Grpc/AuditEventDtoMapper.cs` — `public static class` with `ToDto(AuditEvent) → AuditEventDto` and `FromDto(AuditEventDto) → AuditEvent` (lift the canonical logic from `AuditLog/Telemetry/AuditEventMapper.cs`).
|
||||
- Modify: `src/ScadaLink.Communication/Grpc/SiteStreamGrpcServer.cs` — replace the inlined `IngestAuditEvents` loop (~lines 265–295), `AuditEventToDto` (~490–517) and `MapAuditEventFromDto` (~537–561) with calls to `AuditEventDtoMapper`.
|
||||
- Delete: `src/ScadaLink.AuditLog/Telemetry/AuditEventMapper.cs`; update its callers in `ScadaLink.AuditLog` to use `Communication`'s `AuditEventDtoMapper`.
|
||||
- Leave untouched: `SqliteAuditWriter.MapRow` (SQLite `DataReader` → `AuditEvent`, not a DTO mapper — different source type) and `MapSiteCallFromDto` (SiteCall, not audit). Note this in the commit body.
|
||||
- Test: move/merge `tests/ScadaLink.AuditLog.Tests/Telemetry/AuditEventMapperTests.cs` into `tests/ScadaLink.Communication.Tests/Grpc/AuditEventDtoMapperTests.cs`; keep round-trip coverage (`FromDto(ToDto(x)) == x`).
|
||||
|
||||
**Approach:** Pure refactor — no behaviour change. Verify field-by-field parity against all 3 inlined copies before deleting them (null handling, enum parsing, `Int32Value`/`Timestamp` wrapping).
|
||||
|
||||
**Steps:** create mapper + tests → run → swap call sites → delete old copies → `dotnet build` + `dotnet test ScadaLink.slnx` (all green, no behaviour drift) → commit.
|
||||
|
||||
**Commit:** `refactor(auditlog): consolidate AuditEvent DTO mappers into Communication`
|
||||
|
||||
---
|
||||
|
||||
## Task 4: Site Call Audit — query / KPI / detail backend
|
||||
|
||||
**What:** Build the missing read-side backend for the Site Calls UI: Commons message contracts, `SiteCallAuditActor` query/KPI/detail handlers, and `CommunicationService` methods. Mirror `NotificationOutboxQueries.cs` + the Notification Outbox actor/service shape. Spec: `Component-SiteCallAudit.md` §KPIs and §queryable list.
|
||||
|
||||
**Files:**
|
||||
- Create: `src/ScadaLink.Commons/Messages/Audit/SiteCallQueries.cs` — records mirroring `NotificationOutboxQueries.cs`:
|
||||
- `SiteCallQueryRequest` (CorrelationId, status/site/kind/target filters, date range, page cursor fields, PageSize)
|
||||
- `SiteCallSummary` (TrackedOperationId, SourceSite, Kind, TargetSummary, Status, RetryCount, LastError, provenance, CreatedAtUtc, UpdatedAtUtc, TerminalAtUtc)
|
||||
- `SiteCallQueryResponse` (CorrelationId, Success, ErrorMessage, IReadOnlyList<SiteCallSummary>, next-cursor fields)
|
||||
- `SiteCallKpiRequest` / `SiteCallKpiResponse` (BufferedCount, ParkedCount, FailedLastInterval, DeliveredLastInterval, OldestPendingAge, StuckCount — mirror the Notification Outbox KPI shape; also a per-site variant)
|
||||
- `SiteCallDetailRequest` / `SiteCallDetailResponse` / `SiteCallDetail` (full row incl. LastError, all timestamps).
|
||||
- Modify: `src/ScadaLink.SiteCallAudit/SiteCallAuditActor.cs` — add `ReceiveAsync` handlers for the query / KPI / detail requests; query handler calls `ISiteCallAuditRepository.QueryAsync` (keyset paging on `(CreatedAtUtc DESC, TrackedOperationId DESC)`); KPI handler computes point-in-time counts from the `SiteCalls` table (stuck = `Pending`/`Retrying` older than the configurable threshold, default 10 min). Use the per-message DI scope pattern already in the actor.
|
||||
- Add repo support if needed: `src/ScadaLink.ConfigurationDatabase/Repositories/SiteCallAuditRepository.cs` may need a new KPI-count method; the detail handler may be able to reuse the `GetAsync(TrackedOperationId)` that already exists rather than adding another lookup.
|
||||
- Modify: `src/ScadaLink.Communication/CommunicationService.cs` — add `QuerySiteCallsAsync`, `GetSiteCallKpisAsync`, `GetPerSiteSiteCallKpisAsync`, `GetSiteCallDetailAsync` (mirror `QueryNotificationOutboxAsync` etc.: `Ask` the `SiteCallAuditActor` proxy with `_options.QueryTimeout`).
|
||||
- Test: `tests/ScadaLink.SiteCallAudit.Tests/` (actor handlers), `tests/ScadaLink.Commons.Tests/` (contract shape), `tests/ScadaLink.ConfigurationDatabase.Tests/Repositories/SiteCallAuditRepositoryTests.cs` (extend for KPI counts).
|
||||
|
||||
**Commit:** `feat(sitecallaudit): query, KPI and detail backend for the Site Calls page`
|
||||
|
||||
---
|
||||
|
||||
## Task 5: Site Call Audit — Retry/Discard relay to owning site
|
||||
|
||||
**What:** Central UI Retry/Discard on a parked Site Call must relay `RetryParkedOperation` / `DiscardParkedOperation` to the **owning site** (sites are the source of truth — central never mutates the `SiteCalls` row directly; the corrected row arrives back via telemetry). Spec: `Component-SiteCallAudit.md` §actions-on-parked-rows.
|
||||
|
||||
**Files:**
|
||||
- Create: `src/ScadaLink.Commons/Messages/Audit/SiteCallRelayMessages.cs` — `RetryParkedOperationRequest`/`Response`, `DiscardParkedOperationRequest`/`Response` (carry `TrackedOperationId`, `SourceSite`, `CorrelationId`; response carries Success + a "site unreachable" error case).
|
||||
- Modify: `src/ScadaLink.SiteCallAudit/SiteCallAuditActor.cs` (or a small relay collaborator) — on a relay request, look up the owning site and forward `RetryParkedOperation`/`DiscardParkedOperation` to that site over the central→site ClusterClient (the central side already maintains one ClusterClient per site; reuse the `CentralCommunicationActor` site-addressing path). On no/late reply → respond "site unreachable".
|
||||
- Modify: `src/ScadaLink.Communication/Actors/SiteCommunicationActor.cs` — receive `RetryParkedOperation`/`DiscardParkedOperation` and hand to the site operation-tracking subsystem.
|
||||
- Modify the site operation-tracking owner (S&F operation-tracking store / `ParkedMessageHandlerActor` in `src/ScadaLink.StoreAndForward/`) — Retry resets a parked tracked operation to `Pending` for the retry loop; Discard marks it `Discarded`. Reuse the parked-message handling that already backs notification Retry/Discard.
|
||||
- Modify: `src/ScadaLink.Communication/CommunicationService.cs` — add `RetrySiteCallAsync` / `DiscardSiteCallAsync`.
|
||||
- Test: `tests/ScadaLink.SiteCallAudit.Tests/` (relay routing + unreachable path), `tests/ScadaLink.StoreAndForward.Tests/` (site-side parked op reset/discard), `tests/ScadaLink.Communication.Tests/`.
|
||||
|
||||
**Note for implementer:** this is the meatiest backend task — the central→site relay direction and the site-side parked-operation mutation are both required. If the site operation-tracking Retry/Discard primitive already exists for cached calls, reuse it; only add the message plumbing.
|
||||
|
||||
**Commit:** `feat(sitecallaudit): central→site Retry/Discard relay for parked operations`
|
||||
|
||||
---
|
||||
|
||||
## Task 6: Site Calls UI page + nav + Audit drill-in
|
||||
|
||||
**What:** Build the Central UI Site Calls page — a near-mirror of `NotificationReport.razor`. Spec: `Component-SiteCallAudit.md`.
|
||||
|
||||
**Files:**
|
||||
- Create: `src/ScadaLink.CentralUI/Components/Pages/SiteCalls/SiteCallsReport.razor` (+ `.razor.cs`) — route `@page "/site-calls/report"`, `RequireDeployment` (or `OperationalAudit`) auth to match the Notifications report gating. Structure (per the form-layout memory: header, filter card, results table, paging, modal):
|
||||
- Filter card: Status, Kind, Source site, Target keyword, date range, "Stuck only" checkbox, Clear/Query.
|
||||
- Results table columns: TrackedOperationId, Source site, Kind, Target, Status (badge + Stuck indicator), Retries, Last error, Created, Updated, Actions.
|
||||
- Actions column: a **"View audit history"** link `href="/audit/log?correlationId=@row.TrackedOperationId"` (the `TrackedOperationId` is the audit `CorrelationId`) — mirror `NotificationReport.razor:172`; plus **Retry/Discard** buttons shown only on `Parked` rows (none on `Failed`).
|
||||
- Keyset Previous/Next paging; double-click row → detail modal (body shows full row + LastError; reuse the Notifications detail-modal idiom — never `MarkupString`).
|
||||
- Modify: `src/ScadaLink.CentralUI/Components/Layout/NavMenu.razor` — register the Site Calls page (own "Site Calls" section, or under an existing group, consistent with the `Notifications` / `Audit` section pattern at lines ~65–129).
|
||||
- Modify: `src/ScadaLink.CentralUI/Components/Pages/Audit/AuditLogPage.razor.cs` — confirm `?correlationId=` drill-in already covers this (it does); no change expected — just verify.
|
||||
- Test: `tests/ScadaLink.CentralUI.Tests/Pages/` (bUnit — scaffold, paging, parked-only actions, drill-in link), `tests/ScadaLink.CentralUI.PlaywrightTests/SiteCalls/SiteCallsPageTests.cs` (new).
|
||||
|
||||
**Use the `frontend-design` skill** for page/component styling guidance. Blazor Server + Bootstrap only; custom components; clean corporate aesthetic.
|
||||
|
||||
**Commit:** `feat(centralui): Site Calls page with Retry/Discard and Audit drill-in`
|
||||
|
||||
---
|
||||
|
||||
## Task 7: Site Call KPI tiles + Health dashboard integration
|
||||
|
||||
**What:** Surface Site Call Audit KPIs on the Health dashboard, mirroring the Notification Outbox tiles + `AuditKpiTiles`.
|
||||
|
||||
**Files:**
|
||||
- Create: `src/ScadaLink.CentralUI/Components/Health/SiteCallKpiTiles.razor` (+ `.razor.cs`) — mirror `Components/Health/AuditKpiTiles.razor`; tiles for Buffered, Parked (danger border if >0), Stuck (warning border if >0); each tile navigates to `/site-calls/report` with a query-string filter.
|
||||
- Modify: `src/ScadaLink.CentralUI/Components/Pages/Monitoring/Health.razor` (+ code-behind) — add a "Site Calls" KPI section between the Notification Outbox and Audit Log sections; load via `CommunicationService.GetSiteCallKpisAsync` (Task 4).
|
||||
- Test: `tests/ScadaLink.CentralUI.Tests/` (bUnit — tile rendering, threshold borders, navigation targets).
|
||||
|
||||
**Commit:** `feat(centralui): Site Call KPI tiles on the Health dashboard`
|
||||
|
||||
---
|
||||
|
||||
## Task 8: Multi-value `AuditLogQueryFilter` — contract + repository
|
||||
|
||||
**What:** Widen `AuditLogQueryFilter` from single-value to multi-value on the `Channel`, `Kind`, `Status`, `SourceSiteId` dimensions, and translate them to `IN (...)` in the repository. `Target`, `Actor`, `CorrelationId`, `FromUtc`, `ToUtc` stay as-is. Keyset paging must not change.
|
||||
|
||||
**Files:**
|
||||
- Modify: `src/ScadaLink.Commons/Types/Audit/AuditLogQueryFilter.cs` — change `Channel`/`Kind`/`Status`/`SourceSiteId` to `IReadOnlyList<…>?` (e.g. `IReadOnlyList<AuditChannel>? Channels`). Keep the record's other params. This is a **breaking shape change** — update every call site in this task.
|
||||
- Modify: `src/ScadaLink.ConfigurationDatabase/Repositories/AuditLogRepository.cs` (`QueryAsync`, ~lines 119–165) — each widened dimension becomes `if (filter.Channels is { Count: > 0 }) query = query.Where(e => filter.Channels.Contains(e.Channel));`. Empty/null list = no filter. Keyset predicate + `OrderByDescending` untouched.
|
||||
- Update every other place that constructs an `AuditLogQueryFilter` in this task so the solution compiles (ManagementService `ParseFilter`, CentralUI `AuditQueryModel.ToFilter`, CLI helpers, tests) — the deep behaviour of those consumers is Task 9; here just make them compile (e.g. wrap a single value in a one-element list).
|
||||
- Test: `tests/ScadaLink.ConfigurationDatabase.Tests/Repositories/AuditLogRepositoryTests.cs` — add `QueryAsync_FilterByMultipleChannels_ReturnsUnion`, multi-status, multi-site; keep the existing single-value and keyset tests green.
|
||||
|
||||
**Commit:** `feat(auditlog): multi-value AuditLogQueryFilter dimensions`
|
||||
|
||||
---
|
||||
|
||||
## Task 9: Multi-value filters — ManagementService, CLI, Central UI
|
||||
|
||||
**What:** Make the three consumers actually emit/accept multiple values per dimension instead of collapsing to the first.
|
||||
|
||||
**Files:**
|
||||
- Modify: `src/ScadaLink.ManagementService/AuditEndpoints.cs` (`ParseFilter`, ~lines 369–414) — read repeated query params with `.ToArray()` (not `.ToString()`); parse each into the enum list; unparseable values silently dropped (keep the existing lax contract).
|
||||
- Modify: `src/ScadaLink.CentralUI/Components/Audit/AuditQueryModel.cs` (`ToFilter`, ~lines 110–126) — stop collapsing to `.First()`; pass the full `Channels`/`Kinds`/`Statuses`/`SiteIdentifiers` sets. Adjust the `ErrorsOnly` logic (lines ~128–145) for multi-value `Status`. The chip UI already supports multi-select — no `.razor` change expected; verify.
|
||||
- Modify: `src/ScadaLink.CentralUI/Components/Pages/Audit/AuditLogPage.razor.cs` export-URL builder (~lines 175–227) — emit repeated query-string params per selected value.
|
||||
- Modify: `src/ScadaLink.CLI/Commands/AuditCommands.cs` (~lines 29–41) — make `--channel`/`--kind`/`--status`/`--site` accept multiple values (System.CommandLine multi-arity options; keep `AcceptOnlyFromAmong` for the enum-like ones). Modify `src/ScadaLink.CLI/Commands/AuditQueryHelpers.cs` — `AuditQueryArgs` fields become arrays; `BuildQueryString` emits one key per value.
|
||||
- Test: extend `tests/ScadaLink.ManagementService.Tests/AuditEndpointsTests.cs`, `tests/ScadaLink.CLI.Tests/Commands/AuditQueryCommandTests.cs`, `tests/ScadaLink.CentralUI.Tests/` filter-model tests for multi-value round-trips.
|
||||
|
||||
**Commit:** `feat(audit): multi-value filters across ManagementService, CLI and Central UI`
|
||||
|
||||
---
|
||||
|
||||
## Task 10: Audit results grid — column resize + reorder UX
|
||||
|
||||
**What:** Add drag-to-resize and drag-to-reorder column UX to `AuditResultsGrid`, persisted in `sessionStorage`. Blazor + Bootstrap + minimal JS interop only (no third-party libs).
|
||||
|
||||
**Files:**
|
||||
- Create: `src/ScadaLink.CentralUI/wwwroot/js/audit-grid.js` — a `window.auditGrid` namespace: column-resize drag handlers, header drag-reorder handlers, and `save(key,json)` / `load(key)` over `sessionStorage` (mirror `treeview-storage.js`).
|
||||
- Modify: `src/ScadaLink.CentralUI/Components/Audit/AuditResultsGrid.razor` (+ `.razor.cs`) — render a resize handle in each `<th>`; make headers draggable; apply persisted widths (inline style/CSS var) and column order (the `ColumnOrder` parameter + `OrderedColumns()` already exist — wire it to persisted state); `IJSRuntime` calls to load on first render and save on change.
|
||||
- Create: `src/ScadaLink.CentralUI/Components/Audit/AuditResultsGrid.razor.css` — resize-handle styling, drag-over feedback (mirror `AuditDrilldownDrawer.razor.css` / `TreeView.razor.css` idioms).
|
||||
- Reference the script from the host page (`App.razor` / `_Host` / layout — match where `monaco-init.js` / `session-expiry.js` are referenced).
|
||||
- Test: extend `tests/ScadaLink.CentralUI.PlaywrightTests/Audit/AuditLogPageTests.cs` (or new `AuditGridColumnTests.cs`) — resize changes a column width, reorder changes header order, both survive a reload via `sessionStorage`.
|
||||
|
||||
**Use the `frontend-design` skill** for the resize-handle / drag-feedback visual treatment.
|
||||
|
||||
**Commit:** `feat(centralui): column resize and reorder for the audit results grid`
|
||||
|
||||
---
|
||||
|
||||
## Final review
|
||||
|
||||
After Task 10: dispatch a final cross-cutting code review of the whole branch against this plan, then run the full solution build + test once more. Update `docs/plans/2026-05-20-audit-log-code-roadmap.md` header lines 14–19 to strike the five now-completed follow-ups (leaving the three v1.x items). Hand back to the user for the push decision (do not push).
|
||||
|
||||
---
|
||||
|
||||
## Task dependency summary
|
||||
|
||||
- Task 0 blocks everything.
|
||||
- Task 2 blocked by Task 1.
|
||||
- Task 3 independent (after Task 0).
|
||||
- Task 5 blocked by Task 4.
|
||||
- Task 6 blocked by Tasks 4 and 5.
|
||||
- Task 7 blocked by Task 4.
|
||||
- Task 9 blocked by Task 8.
|
||||
- Task 10 independent (after Task 0).
|
||||
|
||||
Execution order: 0 → 1 → 2 → 3 → 4 → 5 → 6 → 7 → 8 → 9 → 10 → final review.
|
||||
@@ -0,0 +1,17 @@
|
||||
{
|
||||
"planPath": "docs/plans/2026-05-21-audit-log-followups.md",
|
||||
"tasks": [
|
||||
{"id": 33, "subject": "Task 0: Prep — feature branch", "status": "completed"},
|
||||
{"id": 34, "subject": "Task 1: Audit push — central ingest routing over ClusterClient", "status": "completed", "blockedBy": [33]},
|
||||
{"id": 35, "subject": "Task 2: Audit push — real site client, Host wiring, integration test", "status": "completed", "blockedBy": [34]},
|
||||
{"id": 36, "subject": "Task 3: Consolidate the duplicated audit DTO mappers", "status": "completed", "blockedBy": [33]},
|
||||
{"id": 37, "subject": "Task 4: Site Call Audit — query / KPI / detail backend", "status": "completed", "blockedBy": [33]},
|
||||
{"id": 38, "subject": "Task 5: Site Call Audit — Retry/Discard relay to owning site", "status": "completed", "blockedBy": [37]},
|
||||
{"id": 39, "subject": "Task 6: Site Calls UI page + nav + Audit drill-in", "status": "completed", "blockedBy": [37, 38]},
|
||||
{"id": 40, "subject": "Task 7: Site Call KPI tiles + Health dashboard integration", "status": "completed", "blockedBy": [37]},
|
||||
{"id": 41, "subject": "Task 8: Multi-value AuditLogQueryFilter — contract + repository", "status": "completed", "blockedBy": [33]},
|
||||
{"id": 42, "subject": "Task 9: Multi-value filters — ManagementService, CLI, Central UI", "status": "completed", "blockedBy": [41]},
|
||||
{"id": 43, "subject": "Task 10: Audit results grid — column resize + reorder UX", "status": "completed", "blockedBy": [33]}
|
||||
],
|
||||
"lastUpdated": "2026-05-21T12:00:00Z"
|
||||
}
|
||||
@@ -0,0 +1,144 @@
|
||||
# Audit Log (#23) — /goal Prompt
|
||||
|
||||
Self-contained prompt to drive autonomous execution of the Audit Log
|
||||
implementation from `docs/plans/2026-05-20-audit-log-code-roadmap.md`.
|
||||
|
||||
**Usage:** copy the fenced block below and paste into `/goal` in this repo.
|
||||
Each milestone executes, ships to `main`, then updates downstream roadmap
|
||||
sections with realities learned before starting the next milestone.
|
||||
|
||||
If your `/goal` environment doesn't pre-load skill identifiers, replace
|
||||
`superpowers-extended-cc:brainstorming` (and similar) with whatever
|
||||
invocation form is correct. The workflow is the same.
|
||||
|
||||
---
|
||||
|
||||
```
|
||||
GOAL: Implement the ScadaLink Audit Log component (#23) end-to-end against
|
||||
the existing codebase by executing all 8 milestones from the roadmap at
|
||||
docs/plans/2026-05-20-audit-log-code-roadmap.md. After each milestone ships
|
||||
to main, update the remaining milestones in the roadmap with what was
|
||||
actually learned. Done when M8 is merged and the full test suite is green.
|
||||
|
||||
═══════ STARTING STATE ═══════
|
||||
Repo: /Users/dohertj2/Desktop/scadalink-design
|
||||
Starting ref: main at 39a3ca3
|
||||
Spec (immut.): alog.md (commit fec0bb1) + docs/requirements/Component-AuditLog.md
|
||||
Roadmap: docs/plans/2026-05-20-audit-log-code-roadmap.md (8 milestones, ~100 tasks)
|
||||
Working tree: 3 infra/* modifications are PRE-EXISTING and UNRELATED.
|
||||
Never touch them. Use explicit `git add <path>`; never `commit -am`.
|
||||
|
||||
User-memory feedback to honor (under ~/.claude/projects/<this>/memory/):
|
||||
• feedback_subagent_cadence.md — bundle small adjacent tasks per milestone
|
||||
• feedback_central_ui.md — Blazor Server + Bootstrap only, no 3rd-party frameworks
|
||||
• feedback_form_layout.md — vertical stack, read-only first, buttons at bottom
|
||||
• feedback_recommendations.md — lead with a recommended option
|
||||
|
||||
═══════ INVARIANTS (never violate) ═══════
|
||||
1. Cross-refs to Component-AuditLog.md use PROSE anchors (e.g., "Payload
|
||||
Capture Policy"), NEVER `§N`. § only exists in alog.md.
|
||||
2. CachedCallTelemetry is the correct message name. CachedOperationTelemetry
|
||||
does not exist — never introduce it.
|
||||
3. Audit-write failures NEVER abort user-facing actions anywhere — ESG calls,
|
||||
DB writes, Inbound API requests, Notify dispatches all continue normally
|
||||
if the audit append throws.
|
||||
4. AuditLog is strictly append-only at central. No UPDATE. No row-level
|
||||
DELETE. Purge is partition-switch only.
|
||||
5. Site SQLite purge requires ForwardState ∈ {Forwarded, Reconciled}; pending
|
||||
rows are never age-purged.
|
||||
6. Do NOT push to origin. Never. Merges to main are local-only.
|
||||
7. Hash-chain tamper-evidence and Parquet archival are deferred to v1.x.
|
||||
Do not implement them. Per-channel retention overrides also deferred.
|
||||
|
||||
═══════ PER-MILESTONE LOOP (M1 → M8) ═══════
|
||||
For each milestone N from 1 to 8, in order:
|
||||
|
||||
A. BRANCH
|
||||
git switch -c feature/audit-log-m{N}-<slice> main
|
||||
|
||||
B. BRAINSTORM
|
||||
Invoke `superpowers-extended-cc:brainstorming` to nail any code-level
|
||||
decisions not fixed by the spec or roadmap (test-fixture placement,
|
||||
exact dispatcher names, helper signatures, etc.). One question at a
|
||||
time, recommended option first.
|
||||
|
||||
C. WRITING-PLANS
|
||||
Invoke `superpowers-extended-cc:writing-plans` to turn the milestone's
|
||||
roadmap section into a milestone-specific executable plan at
|
||||
docs/plans/2026-XX-XX-auditlog-m{N}-<slice>.md with peer .tasks.json.
|
||||
Each task: exact file paths, TDD steps (failing test → impl → passing →
|
||||
commit), commit message.
|
||||
|
||||
D. EXECUTE
|
||||
Invoke `superpowers-extended-cc:subagent-driven-development`.
|
||||
Per the cadence memory: bundle small adjacent tasks into one
|
||||
implementer dispatch + one combined spec+quality reviewer per bundle.
|
||||
Trivial 1–3 line fix-ups may be controller-direct edits; substantive
|
||||
fixes go through a fresh implementer subagent. End each milestone with
|
||||
a final cross-bundle reviewer over the whole branch.
|
||||
|
||||
E. VERIFY (gate — do not skip)
|
||||
• Milestone-specific tests pass: dotnet test --filter <pattern>
|
||||
• Full solution tests still pass: dotnet test ScadaLink.slnx
|
||||
• Every acceptance criterion from the roadmap's M{N} section is met,
|
||||
cited by name to the test that proves it.
|
||||
• If anything is red, fix-loop. Never proceed with red tests.
|
||||
|
||||
F. MERGE
|
||||
git switch main
|
||||
git merge --no-ff feature/audit-log-m{N}-<slice> -m "<summary>"
|
||||
No push.
|
||||
|
||||
G. UPDATE DOWNSTREAM ROADMAP (the novel step — do not skip)
|
||||
Before starting M{N+1}, edit the M{N+1}..M8 sections of
|
||||
docs/plans/2026-05-20-audit-log-code-roadmap.md to reflect realities
|
||||
learned during M{N}:
|
||||
• Correct any file paths that turned out different.
|
||||
• Update class/method names that got renamed during M{N}.
|
||||
• Add brief notes under affected later-milestone tasks pointing at
|
||||
the actual helper / fixture / pattern used in M{N} that the next
|
||||
milestone should reuse.
|
||||
• Remove tasks that turned out unnecessary; add tasks that turned
|
||||
out missing.
|
||||
• Do NOT rewrite milestones wholesale — only update what M{N}
|
||||
demonstrably changed.
|
||||
Commit on main:
|
||||
docs(audit): roadmap corrections after M{N}
|
||||
|
||||
H. STATUS UPDATE
|
||||
Output one paragraph:
|
||||
- Milestone N and slice name
|
||||
- Commit SHA range merged to main
|
||||
- Tests added / total passing
|
||||
- Roadmap corrections summary (one line)
|
||||
- Next milestone
|
||||
|
||||
I. PROCEED to M{N+1} (back to step A).
|
||||
|
||||
═══════ ASK THE USER BEFORE ═══════
|
||||
• Implementing anything that contradicts the spec — flag the design gap,
|
||||
let the user decide whether to fix the design first.
|
||||
• Touching a file the user is editing in working tree (always check
|
||||
`git status` first).
|
||||
• Pulling a deferred v1.x feature into v1 scope.
|
||||
• Pushing to origin (never push without explicit authorization).
|
||||
• A milestone's acceptance criteria turn out unachievable as written.
|
||||
|
||||
═══════ TERMINATION ═══════
|
||||
Done when ALL hold:
|
||||
1. M8 merged to main.
|
||||
2. `dotnet test ScadaLink.slnx` green (full solution).
|
||||
3. `dotnet test tests/ScadaLink.IntegrationTests/` green.
|
||||
4. Roadmap reflects what was actually shipped (corrections committed).
|
||||
5. infra/* still untouched and uncommitted.
|
||||
6. alog.md unchanged from fec0bb1 (or, if changed, the design correction
|
||||
was committed BEFORE the affected code change — never the reverse).
|
||||
7. Component-AuditLog.md unchanged unless a clarification was committed
|
||||
first, same discipline.
|
||||
|
||||
═══════ START ═══════
|
||||
Begin with M1: Foundation (schema, types, DB roles, partitioning).
|
||||
The roadmap's M1 section is the source of truth for the task list. Read it
|
||||
fully before doing anything else, then proceed through the per-milestone
|
||||
loop. Build to ship; do not get clever; trust the design.
|
||||
```
|
||||
@@ -0,0 +1,236 @@
|
||||
# Design: Notification Outbox
|
||||
|
||||
**Date:** 2026-05-18
|
||||
**Status:** Basic design — approved, open for refinement.
|
||||
|
||||
## Problem
|
||||
|
||||
Notification delivery today happens at the site clusters: scripts call `Notify.To().Send()`,
|
||||
the Notification Service composes an email, and the site sends it via SMTP. The Store-and-Forward
|
||||
Engine buffers transient failures. Two gaps motivated this design:
|
||||
|
||||
1. **No audit trail.** A successful send is recorded nowhere. A permanently-failed send is
|
||||
returned to the script and then lost. Only a transiently-failed-and-buffered notification
|
||||
is visible — indirectly, as Store-and-Forward activity.
|
||||
2. **No monitoring.** There is no view of delivery health: no KPIs, and no way to find
|
||||
notifications that are stuck or have been parked.
|
||||
|
||||
## Solution overview
|
||||
|
||||
Invert where delivery happens. Sites no longer send notifications directly. Instead:
|
||||
|
||||
- A site script's notification is **store-and-forwarded to the central cluster**.
|
||||
- Central **logs every notification to a `Notifications` table** in the central config DB
|
||||
(MS SQL) — the single source of audit truth.
|
||||
- A central **Notification Outbox** dispatches and delivers from that table, with retry,
|
||||
parking, per-notification status, and KPIs.
|
||||
|
||||
The `Notifications` table is type-agnostic so it can record any notification type the system
|
||||
supports — email today, Microsoft Teams and others later.
|
||||
|
||||
### End-to-end flow
|
||||
|
||||
```
|
||||
Site script: Notify.To("list").Send(subject, body)
|
||||
│ generate NotificationId (GUID) locally; return it to the script immediately
|
||||
▼
|
||||
Site Store-and-Forward Engine (notification category, target = central)
|
||||
│ durably forwards to central via the Communication Layer (ClusterClient);
|
||||
│ buffers/retries if central is unreachable
|
||||
▼
|
||||
Central ingest: insert-if-not-exists on NotificationId → Notifications table (Pending)
|
||||
│ ack the site → site S&F clears the message
|
||||
▼
|
||||
Central Notification Outbox actor (singleton, active central node)
|
||||
│ polls due rows; resolves the list; delivers via the matching adapter
|
||||
├── success → Delivered
|
||||
├── transient failure → Retrying (schedule NextAttemptAt)
|
||||
└── permanent failure
|
||||
/ retries exhausted → Parked
|
||||
```
|
||||
|
||||
`Notify.Status(notificationId)` returns a small **status record** — status, retry count,
|
||||
last error, and key timestamps (enqueued, delivered). While the notification is still in the
|
||||
site S&F buffer the site answers the query **locally** (status `Forwarding`); once forwarded,
|
||||
the query round-trips to central and reads the `Notifications` table.
|
||||
|
||||
## Component design
|
||||
|
||||
### New component #21: Notification Outbox
|
||||
|
||||
A **central** component — the first outbox to live centrally (the Store-and-Forward Engine
|
||||
remains site-only).
|
||||
|
||||
- **Location:** Central cluster.
|
||||
- **Actor:** `NotificationOutboxActor` — a **singleton on the active central node**.
|
||||
- **Owns:** the durable central queue (the `Notifications` table), the dispatcher loop,
|
||||
retry scheduling, parking, per-notification status tracking, and KPI computation.
|
||||
- SMTP/HTTP delivery is blocking I/O — delivery work runs on a **dedicated blocking-I/O
|
||||
dispatcher** (same pattern as Script Execution Actors).
|
||||
|
||||
### Notification Service (revised)
|
||||
|
||||
Shrinks to two clear jobs, both **central-only**:
|
||||
|
||||
- Manage **notification-list and SMTP definitions** in the config DB.
|
||||
- Provide **delivery adapters** — stateless "deliver one notification" implementations per
|
||||
type (see below).
|
||||
|
||||
Notification lists and SMTP config are **no longer deployed to sites**. Sites never talk to SMTP.
|
||||
|
||||
### Store-and-Forward Engine (revised)
|
||||
|
||||
Keeps its notification category, but the delivery *target* changes from SMTP to **central**.
|
||||
"Delivering" a buffered notification now means handing it to the Communication Layer for the
|
||||
central cluster and clearing it on central's ack. The site→central forward uses a fixed
|
||||
retry interval configured in the host `appsettings.json` — it concerns reaching the central
|
||||
cluster rather than any notification list.
|
||||
|
||||
## Typed notification lists
|
||||
|
||||
Each notification list gains a **`Type`** field plus type-specific targets:
|
||||
|
||||
- `Email` — a set of recipient addresses (implemented now).
|
||||
- `Teams`, others — future types.
|
||||
|
||||
`Notify.To("list")` works transparently for any type — the script does not care. Lists are
|
||||
defined and stored centrally only.
|
||||
|
||||
**Recipient resolution happens at central, at delivery time** — the site forwards only
|
||||
`(listName, subject, body)`. This keeps definitions in one place and removes the deploy-to-sites
|
||||
artifact entirely.
|
||||
|
||||
## The `Notifications` table (central MS SQL)
|
||||
|
||||
Type-agnostic. One row per notification.
|
||||
|
||||
| Field | Notes |
|
||||
|---|---|
|
||||
| `NotificationId` | GUID, primary key. Generated at the **site**; used as the idempotency key. |
|
||||
| `Type` | `Email` / `Teams` / … discriminator. |
|
||||
| `ListName` | Target notification list. |
|
||||
| `Subject`, `Body` | Plain-text content. |
|
||||
| `TypeData` | JSON — extensibility hook for future per-type fields. |
|
||||
| `Status` | `Pending` → `Retrying` → `Delivered` / `Parked` / `Discarded`. |
|
||||
| `RetryCount` | Delivery attempts so far. |
|
||||
| `LastError` | Detail of the most recent failure. |
|
||||
| `ResolvedTargets` | Who the notification actually went to — snapshotted by central at delivery time, for audit. |
|
||||
| `SourceSiteId`, `SourceInstanceId`, `SourceScript` | Provenance. |
|
||||
| `SiteEnqueuedAt` | When the script called `Send()` (carried from the site). |
|
||||
| `CreatedAt` | When central ingested the row. |
|
||||
| `LastAttemptAt`, `NextAttemptAt`, `DeliveredAt` | Delivery timestamps. |
|
||||
|
||||
All timestamps are UTC.
|
||||
|
||||
### Status lifecycle
|
||||
|
||||
- `Forwarding` — in the site S&F buffer, not yet received by central. **Site-local only** —
|
||||
never stored in the central `Notifications` table; reported by `Notify.Status` while the
|
||||
site still holds the notification.
|
||||
- `Pending` — ingested by central, awaiting first dispatch.
|
||||
- `Retrying` — a transient failure occurred; `NextAttemptAt` schedules the next attempt.
|
||||
- `Delivered` — terminal, success.
|
||||
- `Parked` — terminal-not-delivered: a permanent failure, or retries exhausted. `LastError`
|
||||
distinguishes which.
|
||||
- `Discarded` — terminal, reached **only by operator action** on a parked notification. The
|
||||
row is kept (not deleted) so the table remains a complete audit record.
|
||||
|
||||
### Retry policy
|
||||
|
||||
Delivery retry reuses the central SMTP configuration's max-retry-count and fixed retry
|
||||
interval — consistent with the existing fixed-interval (no backoff) convention.
|
||||
|
||||
### Retention
|
||||
|
||||
Terminal rows (`Delivered`, `Parked`, `Discarded`) are removed by a **daily purge job** after
|
||||
a configurable window (default ~1 year). This preserves a strong audit trail while bounding
|
||||
table growth. Non-terminal rows are never purged.
|
||||
|
||||
## Delivery adapters
|
||||
|
||||
An `INotificationDeliveryAdapter` is registered per `Type`. Each `Deliver(...)` call returns
|
||||
one of `success | transient failure | permanent failure`, mirroring the External System
|
||||
Gateway error-classification pattern.
|
||||
|
||||
- **Email adapter — implemented now.** The existing SMTP composition/send logic, relocated
|
||||
to the central cluster.
|
||||
- **Teams and other adapters — future.** The `Type` discriminator and the adapter interface
|
||||
are the seam; no Teams code is written in this basic plan. Teams auth and targeting
|
||||
(Incoming Webhooks vs Graph API) is a separate design conversation.
|
||||
|
||||
## Active/standby behavior
|
||||
|
||||
The `NotificationOutboxActor` is a singleton on the active central node. All outbox state
|
||||
lives in MS SQL, which is already the central HA store — so no Akka-level replication is
|
||||
needed (unlike the site S&F engine). On central failover the new active node resumes
|
||||
dispatch directly from the table.
|
||||
|
||||
The site→central handoff is **at-least-once**: central acks only after the row is persisted,
|
||||
and a lost ack causes the site to resend. The GUID `NotificationId` idempotency key makes a
|
||||
resend harmless (insert-if-not-exists). A rare failover mid-delivery could re-send one
|
||||
already-delivered notification whose `Delivered` status had not yet been persisted — an accepted trade-off, consistent with the duplicate-delivery
|
||||
trade-off the Store-and-Forward Engine already accepts.
|
||||
|
||||
## Monitoring
|
||||
|
||||
### KPIs
|
||||
|
||||
Central-computed from the `Notifications` table — global, with a per-source-site breakdown:
|
||||
|
||||
- **Queue depth** — count of `Pending` + `Retrying`.
|
||||
- **Stuck count** — `Pending`/`Retrying` rows older than a configurable age threshold
|
||||
(default 10 minutes).
|
||||
- **Parked count** — count of `Parked`.
|
||||
- **Delivered (last interval)** — count of `Delivered` since the previous sample.
|
||||
- **Oldest pending age** — age of the oldest non-terminal notification.
|
||||
|
||||
### Stuck detection
|
||||
|
||||
A notification is **stuck** if it is `Pending` or `Retrying` and older than the configurable
|
||||
age threshold. Detection is **display-only** — a count KPI and a row badge. No automated
|
||||
escalation or alerting, consistent with the current system-wide no-alerting policy.
|
||||
|
||||
### Surfacing
|
||||
|
||||
- **Health Monitoring dashboard** — headline KPI tiles: queue depth, stuck count, parked
|
||||
count. These are central-computed (not part of the site health report). The site S&F
|
||||
notification backlog remains a separate site health metric, covering the site→central leg.
|
||||
- **New Central UI "Notification Outbox" page** — KPI tiles plus a queryable notification
|
||||
list: filter by status, type, source site, list, and time range; a stuck-only toggle;
|
||||
keyword search on subject. Parked notifications offer **Retry** (→ `Pending`, reset
|
||||
`RetryCount`/`NextAttemptAt`) and **Discard** (→ `Discarded`) actions. Stuck rows are badged.
|
||||
|
||||
## Cross-document impact
|
||||
|
||||
| Document | Change |
|
||||
|---|---|
|
||||
| `Component-NotificationOutbox.md` | **New** — component #21. |
|
||||
| `Component-NotificationService.md` | Delivery moves central; lists gain a `Type`; no deploy-to-sites; async script API; delivery adapters. |
|
||||
| `Component-StoreAndForward.md` | Notification category retargeted from SMTP to central. |
|
||||
| `Component-HealthMonitoring.md` | Outbox KPIs added as central-computed headline metrics. |
|
||||
| `Component-SiteEventLogging.md` | New Notification event category — logs site→central forward failures and long-buffered notifications. |
|
||||
| `Component-CentralUI.md` | New Notification Outbox page. |
|
||||
| Central–Site Communication | New `NotificationSubmit` + ack message pair. |
|
||||
| Configuration Database / Commons | `Notifications` table, entity POCO, repository interface + implementation, EF migration, message contracts. |
|
||||
| `README.md` | Component table 20 → 21. |
|
||||
| `CLAUDE.md` | Component list 20 → 21; new key design decisions. |
|
||||
|
||||
## Refinement decisions (2026-05-18)
|
||||
|
||||
- **Site→central forward retry config** — the fixed forward-retry interval lives in the host
|
||||
`appsettings.json` (infrastructure config, not a deployed artifact).
|
||||
- **`Notify.Status` payload** — returns a status record: status, retry count, last error,
|
||||
and key timestamps (enqueued, delivered).
|
||||
- **Stuck threshold default** — 10 minutes, configurable.
|
||||
- **Pre-ingest status** — a distinct site-local `Forwarding` state; the site answers
|
||||
`Notify.Status` from its own S&F buffer without a round-trip to central.
|
||||
- **Site-side diagnostics** — Site Event Logging records site→central **forward failures**
|
||||
and long-buffered notifications only, not routine enqueue/forward success events.
|
||||
- **KPI history** — point-in-time only, computed on demand from the `Notifications` table;
|
||||
the ~1-year row retention answers historical questions directly, so no separate
|
||||
time-series store is added.
|
||||
|
||||
## Open questions
|
||||
|
||||
None outstanding — the basic design is fully specified. The next step is an implementation
|
||||
plan against the cross-document impact table.
|
||||
@@ -0,0 +1,419 @@
|
||||
# Component: Audit Log
|
||||
|
||||
## Purpose
|
||||
|
||||
Provides a single, append-only, forensic + operational record of every
|
||||
integration action initiated by, or terminating in, a script — across outbound
|
||||
API, outbound DB, notifications, and inbound API. One row per lifecycle event,
|
||||
rich payloads, long retention, dashboards, drilldowns, and filter queries,
|
||||
answering both forensic questions ("did instance X send notification Y on date
|
||||
Z, with what body?") and operational ones ("which inbound caller is hammering
|
||||
us right now?").
|
||||
|
||||
The Audit Log is **not a dispatcher**. It does not drive delivery, retry loops,
|
||||
or operator Retry/Discard actions — those remain in [Notification Outbox](Component-NotificationOutbox.md)
|
||||
and [Site Call Audit](Component-SiteCallAudit.md). The Audit Log is the
|
||||
immutable history that **observes** those subsystems and adds coverage where
|
||||
they are silent (sync `ExternalSystem.Call`, sync DB writes and reads, inbound
|
||||
API requests).
|
||||
|
||||
## Location
|
||||
|
||||
Central cluster and site clusters.
|
||||
|
||||
- **Central:** the `AuditLog` table in central MS SQL, plus three singletons on
|
||||
the active central node — `AuditLogIngestActor` (telemetry receiver),
|
||||
`SiteAuditReconciliationActor`, and `AuditLogPurgeActor`.
|
||||
- **Sites:** a site-local `AuditLog` SQLite database file alongside the
|
||||
Store-and-Forward buffer, plus a `SiteAuditTelemetryActor` singleton on the
|
||||
active site node.
|
||||
|
||||
Registered as component #23 in the Host role configuration.
|
||||
|
||||
## Responsibilities
|
||||
|
||||
- Accept site-local hot-path audit writes from script-trust-boundary call paths.
|
||||
- Forward site audit rows to central via gRPC telemetry with at-least-once
|
||||
delivery and idempotency on `EventId`.
|
||||
- Run periodic per-site reconciliation pulls so missed telemetry self-heals.
|
||||
- Accept central-originated audit writes (Inbound API, Notification dispatch
|
||||
attempts and terminal status).
|
||||
- Compute point-in-time KPIs (global and per-site) from the central `AuditLog`
|
||||
table.
|
||||
- Purge expired rows by monthly partition switch — no row-level deletes.
|
||||
|
||||
## Scope — the script trust boundary
|
||||
|
||||
The Audit Log captures every action a script causes to cross the cluster trust
|
||||
boundary:
|
||||
|
||||
| Channel | Trigger | Direction | Covered today? |
|
||||
|---|---|---|---|
|
||||
| `ExternalSystem.Call(...)` | Script | Outbound | No (gap) |
|
||||
| `ExternalSystem.CachedCall(...)` | Script | Outbound | Yes — `SiteCalls` (Site Call Audit) |
|
||||
| `Database.Connection().Execute*(...)` — writes | Script | Outbound | No (gap) |
|
||||
| `Database.CachedWrite(...)` | Script | Outbound | Yes — `SiteCalls` (Site Call Audit) |
|
||||
| `Notify.To(list).Send(...)` | Script | Outbound | Yes — `Notifications` (Notification Outbox) |
|
||||
| `POST /api/{method}` (Inbound API) | External | Inbound (invokes a script) | No (gap) |
|
||||
|
||||
Out of scope — framework traffic is not audited:
|
||||
|
||||
- Health checks, heartbeats, cluster membership messages.
|
||||
- gRPC inter-cluster real-time streams (attribute values, alarm states).
|
||||
- Data Connection Layer ↔ OPC UA / custom protocol traffic.
|
||||
- LDAP authentication probes, Traefik routing decisions.
|
||||
- Internal Configuration Database queries by the framework.
|
||||
- Site Event Log writes; audit log writes themselves.
|
||||
|
||||
Script-initiated DB **reads** via `Database.Connection().ExecuteReader(...)`
|
||||
count as actions from a script and are in scope. Reads via DCL / subscriptions
|
||||
are framework traffic and excluded.
|
||||
|
||||
## The `AuditLog` Table (central)
|
||||
|
||||
Single wide table in central MS SQL, polymorphic by `Channel` + `Kind`
|
||||
discriminators, with a JSON `Extra` column for channel-specific overflow. One
|
||||
row per lifecycle event across all channels.
|
||||
|
||||
| Column | Type | Notes |
|
||||
|---|---|---|
|
||||
| `EventId` | `uniqueidentifier` PK | Generated where the event originates (site or central). Idempotency key. |
|
||||
| `OccurredAtUtc` | `datetime2` | When the event happened (call returned, retry attempted, etc.). |
|
||||
| `IngestedAtUtc` | `datetime2` | When central persisted the row (lags `OccurredAtUtc` for site-originated rows). |
|
||||
| `Channel` | `varchar(32)` | `ApiOutbound` \| `DbOutbound` \| `Notification` \| `ApiInbound`. |
|
||||
| `Kind` | `varchar(32)` | Event kind discriminator (see kinds list below). |
|
||||
| `CorrelationId` | `uniqueidentifier` NULL | Ties multi-event operations together. `TrackedOperationId` for cached calls, `NotificationId` for notifications, request-id for inbound API. NULL for sync one-shot calls. |
|
||||
| `ExecutionId` | `uniqueidentifier` NULL | The originating script execution / inbound request — the universal per-run correlation value; distinct from `CorrelationId`, which is the per-operation lifecycle id. Stamped on *every* audit row emitted by one execution. |
|
||||
| `SourceSiteId` | `varchar(64)` NULL | NULL for central-originated events. |
|
||||
| `SourceInstanceId` | `varchar(128)` NULL | Instance whose script initiated the action (when applicable). |
|
||||
| `SourceScript` | `varchar(128)` NULL | Script name within the instance. |
|
||||
| `Actor` | `varchar(128)` NULL | Inbound API: API key name. Outbound: script identity. Central: system user. |
|
||||
| `Target` | `varchar(256)` NULL | Outbound API: external system + method. DB: connection name. Notification: list name. Inbound API: method name. |
|
||||
| `Status` | `varchar(32)` | Outcome of *this event* — `Submitted`, `Forwarded`, `Attempted`, `Delivered`, `Failed`, `Parked`, `Discarded`, `Skipped`. |
|
||||
| `HttpStatus` | `int` NULL | HTTP-bearing events only. |
|
||||
| `DurationMs` | `int` NULL | Call / attempt duration. |
|
||||
| `ErrorMessage` | `nvarchar(1024)` NULL | Truncated; `ErrorDetail` for full text. |
|
||||
| `ErrorDetail` | `nvarchar(max)` NULL | Optional full exception text on failures. |
|
||||
| `RequestSummary` | `nvarchar(max)` NULL | Truncated request payload (configurable cap). Headers redacted. |
|
||||
| `ResponseSummary` | `nvarchar(max)` NULL | Truncated response payload. Full on errors. |
|
||||
| `PayloadTruncated` | `bit` | Set if either summary was truncated. |
|
||||
| `Extra` | `nvarchar(max)` NULL | Channel-specific JSON for fields we don't promote to columns. |
|
||||
|
||||
**Indexes (first cut):**
|
||||
|
||||
- `IX_AuditLog_OccurredAtUtc` — primary time-range index for global scans.
|
||||
- `IX_AuditLog_Site_Occurred (SourceSiteId, OccurredAtUtc)` — per-site filters.
|
||||
- `IX_AuditLog_CorrelationId (CorrelationId)` — drilldown from a single operation.
|
||||
- `IX_AuditLog_Execution (ExecutionId)` — drilldown to every action of one script execution / inbound request.
|
||||
- `IX_AuditLog_Channel_Status_Occurred (Channel, Status, OccurredAtUtc)` — KPI / dashboard tiles.
|
||||
- `IX_AuditLog_Target_Occurred (Target, OccurredAtUtc)` — "what did we send to system X".
|
||||
- Monthly partitioning on `OccurredAtUtc` from day one; purge is a partition switch (see Retention & Purge).
|
||||
|
||||
**`Kind` values (flat — 10 discriminators across all channels):**
|
||||
|
||||
| Kind | Fires when |
|
||||
|---|---|
|
||||
| `ApiCall` | Sync `ExternalSystem.Call(...)` returns (success or permanent failure). One row per call. |
|
||||
| `ApiCallCached` | A cached outbound-API attempt records its forward-ack (`Forwarded`) or each retry (`Attempted`). |
|
||||
| `DbWrite` | Sync `Database.Connection().Execute*(...)` / `ExecuteReader(...)` completes. One row per call. |
|
||||
| `DbWriteCached` | A cached outbound-DB attempt records its forward-ack (`Forwarded`) or each retry (`Attempted`). |
|
||||
| `NotifySend` | Script's `Notify.Send(...)` is enqueued on the site — first row in a notification's lifecycle (`Status=Submitted`). |
|
||||
| `NotifyDeliver` | Central Notification Outbox dispatcher records a delivery attempt (`Attempted`) or terminal outcome (`Delivered`/`Parked`/`Discarded`). |
|
||||
| `InboundRequest` | An inbound API request completes — one row per request, written at request end with final status. |
|
||||
| `InboundAuthFailure` | An inbound API request was rejected at the auth boundary (bad/missing key). One row, `Status=Failed`, `HttpStatus=401`. |
|
||||
| `CachedSubmit` | Script-side enqueue of a cached call (`ExternalSystem.CachedCall` / `Database.CachedWrite`); first row in the cached-call lifecycle, written to site SQLite before any forward attempt. |
|
||||
| `CachedResolve` | Terminal row for a cached operation — `Status` = `Delivered` / `Failed` / `Parked` / `Discarded`. |
|
||||
|
||||
Inbound API is intentionally collapsed to a single `InboundRequest` (or
|
||||
`InboundAuthFailure` for auth rejections) row per request rather than a
|
||||
multi-event lifecycle.
|
||||
|
||||
### `ExecutionId` vs `CorrelationId`
|
||||
|
||||
The table carries two correlation columns at different granularities:
|
||||
|
||||
- **`ExecutionId`** is the *universal per-run* value: one id per script
|
||||
execution (tag-change / timer-triggered or otherwise) or per inbound API
|
||||
request. It is stamped on **every** audit row that run produces — the sync
|
||||
`ApiCall` and `DbWrite` rows, the full cached-call lifecycle, the
|
||||
`NotifySend` / `NotifyDeliver` rows, and the inbound row alike. A run that
|
||||
performs no trust-boundary action emits no rows, but any run that emits
|
||||
multiple rows ties them all together under one `ExecutionId`. This lets an
|
||||
audit reader pull the complete trust-boundary footprint of a single script
|
||||
run with one `ExecutionId` filter.
|
||||
- **`CorrelationId`** is the *per-operation lifecycle* id — it groups the
|
||||
multiple events of one long-running operation (`TrackedOperationId` for a
|
||||
cached call, `NotificationId` for a notification, request-id for inbound
|
||||
API) and is NULL for sync one-shot calls that have no operation lifecycle.
|
||||
|
||||
The two are orthogonal: one execution may touch several operations (each with
|
||||
its own `CorrelationId`) yet every resulting row shares the one `ExecutionId`.
|
||||
|
||||
## The Site-Local `AuditLog` (SQLite)
|
||||
|
||||
A SQLite database file on each site node, alongside the Store-and-Forward
|
||||
buffer. Same schema as central minus `IngestedAtUtc` (irrelevant at the source),
|
||||
plus a `ForwardState` column with values `Pending | Forwarded | Reconciled` that
|
||||
drives the telemetry loop and reconciliation pull.
|
||||
|
||||
**Site SQLite retention rule (hard invariant):**
|
||||
|
||||
> A row is eligible for purge only when both `OccurredAtUtc < retention threshold` AND `ForwardState IN ('Forwarded', 'Reconciled')`. Pending rows are never purged.
|
||||
|
||||
A prolonged central outage will grow the site audit table without bound until
|
||||
central is reachable again. This is intentional — losing audit rows to make
|
||||
room is a compliance violation, not a self-healing behavior. To keep that
|
||||
growth visible and actionable, the site emits a `SiteAuditBacklog` health metric (pending
|
||||
row count, oldest pending age, bytes on disk); crossing operator-configured
|
||||
thresholds surfaces a warning on the relevant site tile in the Health
|
||||
dashboard, mirroring the Store-and-Forward Engine's backlog metric.
|
||||
|
||||
Central is the durable home. Site SQLite is a write-buffer with a forwarding
|
||||
guarantee.
|
||||
|
||||
## Ingestion Paths
|
||||
|
||||
Four paths feed the central `AuditLog` — one site originator and three central
|
||||
writers — all idempotent on `EventId`.
|
||||
|
||||
### Site hot-path append (site-originated events)
|
||||
|
||||
The component completing a script-trust-boundary action (External System
|
||||
Gateway, Database layer, Store-and-Forward Engine) builds an `AuditEvent` with a
|
||||
fresh `EventId` (Guid v4) and `OccurredAtUtc = UtcNow`, then appends it to the
|
||||
site-local `AuditLog` SQLite via `IAuditWriter` with
|
||||
`ForwardState = 'Pending'`. The append is a single-statement INSERT and is
|
||||
durable in microseconds; control returns to the script with no central
|
||||
round-trip on the hot path.
|
||||
|
||||
### Telemetry forward (site → central)
|
||||
|
||||
A `SiteAuditTelemetryActor` singleton drives the forwarding loop: select up to
|
||||
N `Pending` rows ordered by `OccurredAtUtc`, batch-send them to central via the
|
||||
existing `SiteStream` gRPC channel as `IngestAuditEvents(events)`, and on
|
||||
central-ack flip `ForwardState = 'Forwarded'` for accepted IDs. Rejected IDs
|
||||
stay `Pending` for the next sweep. Cadence is short (default 5 s) when
|
||||
non-empty, longer (default 30 s) when idle; telemetry runs on a dedicated
|
||||
dispatcher.
|
||||
|
||||
### Reconciliation pull (self-healing for missed telemetry)
|
||||
|
||||
A central `SiteAuditReconciliationActor` periodically (default 5 min per site)
|
||||
asks each site for its oldest `Pending` row and pending count; if backlog is
|
||||
non-draining (e.g., telemetry actor wedged), central issues a
|
||||
`PullAuditEvents(sinceUtc, batchSize)` and inserts-if-not-exists. Accepted rows
|
||||
are flipped to `ForwardState = 'Reconciled'` site-side. Same self-healing
|
||||
pattern as Site Call Audit's reconciliation of `SiteCalls`.
|
||||
|
||||
### Central direct-write (central-originated events)
|
||||
|
||||
Events originating at central never touch site SQLite. Inbound API writes one
|
||||
`ApiInbound.InboundRequest` row via `ICentralAuditWriter` synchronously inside
|
||||
the request-handler middleware, before the HTTP response is flushed; auth-layer
|
||||
rejections emit `ApiInbound.InboundAuthFailure` (`Status=Failed`, HTTP 401)
|
||||
instead. The Notification Outbox dispatcher writes
|
||||
`Notification.NotifyDeliver` with `Status=Attempted` per delivery attempt and
|
||||
`Notification.NotifyDeliver` with `Status=Delivered`/`Parked`/`Discarded` on
|
||||
terminal status. Central direct-writes use the same insert-if-not-exists
|
||||
semantics keyed on `EventId`.
|
||||
|
||||
## Cached Operations — Combined Telemetry
|
||||
|
||||
For `ExternalSystem.CachedCall` and `Database.CachedWrite`, the **site** is the
|
||||
source of truth for every audit row. The site writes each lifecycle event —
|
||||
`CachedSubmit` (`Status=Submitted`), then `ApiCallCached`/`DbWriteCached` rows
|
||||
for the forward-ack (`Status=Forwarded`) and each retry (`Status=Attempted`),
|
||||
then a terminal `CachedResolve` row
|
||||
(`Status=Delivered`/`Failed`/`Parked`/`Discarded`) — to its local SQLite
|
||||
`AuditLog` on the hot path (or on the retry tick for `Attempted` rows), then
|
||||
forwards via the same telemetry channel. The telemetry message format gains the
|
||||
audit-row fields additively — one packet per lifecycle transition carries both
|
||||
the operational state update AND the audit row content.
|
||||
|
||||
On receipt, central performs both writes in one transaction:
|
||||
|
||||
1. Insert-if-not-exists the immutable `AuditLog` row, keyed on `EventId`.
|
||||
2. Upsert the operational `SiteCalls` row — existing Site Call Audit behavior
|
||||
(status, retry count, last error, timestamps).
|
||||
|
||||
This collapses two telemetry concerns into one, keeps site SQLite as the
|
||||
single local source of truth for audit content, and preserves the existing
|
||||
operational `SiteCalls` shape for the dispatcher and UI.
|
||||
|
||||
## Payload Capture Policy
|
||||
|
||||
- **Default cap** — 8 KB for each of `RequestSummary` and `ResponseSummary`;
|
||||
raised to 64 KB on any error row (`Status IN ('Failed', 'Parked', 'Discarded')`).
|
||||
- **Truncation** — UTF-8 byte-safe; `PayloadTruncated = 1` when applied. Full
|
||||
bodies are never stored.
|
||||
- **HTTP headers** — `Authorization`, `Cookie`, `Set-Cookie`, `X-API-Key`, and
|
||||
any header matching the configured redact-list regex become `<redacted>`.
|
||||
- **HTTP bodies** — captured verbatim by default. Operators register per-target
|
||||
body redactors (regex → replacement) for known secret fields.
|
||||
- **SQL** — statement text and parameter values captured verbatim by default;
|
||||
per-connection opt-in to redact parameters whose name matches a regex.
|
||||
- **Never captured** — raw API key material (only the key *name* via `Actor`),
|
||||
LDAP bind credentials, cluster secrets, Configuration DB connection strings.
|
||||
- **Safety net** — if a configured redactor throws, the affected payload becomes
|
||||
`"<redacted: redactor error>"` and `AuditRedactionFailure` increments. We
|
||||
over-redact, never under-redact, on configuration faults.
|
||||
|
||||
Redaction happens at the write site, before the row touches SQLite (or central
|
||||
MS SQL for direct-write events). Unredacted secrets never persist.
|
||||
|
||||
## Failure Handling & Idempotency
|
||||
|
||||
- **`EventId` is the dedup key.** Generated at the originator; central ingest
|
||||
is `INSERT … WHERE NOT EXISTS (SELECT 1 FROM AuditLog WHERE EventId = @id)`
|
||||
under the PK constraint. Idempotent across telemetry retries, reconciliation
|
||||
pulls, and any combination of the two.
|
||||
- **Never fail the action.** A failed audit write — site SQLite or central
|
||||
direct-write — logs a critical Site Event Log entry and increments a health
|
||||
metric (`SiteAuditWriteFailures` or `CentralAuditWriteFailures`), but the
|
||||
user-facing action proceeds. We do not fail script-initiated work because the
|
||||
audit write failed.
|
||||
- **Hot-path ring buffer.** While the site audit writer is unhealthy
|
||||
(disk full, schema lock, transient IO), events buffer in a small in-memory
|
||||
ring (default 1024 rows); oldest are discarded with a Site Event Log warning
|
||||
per drop.
|
||||
- **Reconciliation as fallback.** If two consecutive reconciliation cycles
|
||||
report a non-draining backlog, the supervisor restarts the telemetry actor
|
||||
and a `SiteAuditTelemetryStalled` event fires.
|
||||
- **No dedup horizon.** `EventId` PK enforces uniqueness only while a row
|
||||
exists. A retry that arrives after the original row is purged inserts a "new"
|
||||
row — vanishingly rare and harmless.
|
||||
|
||||
## Retention & Purge
|
||||
|
||||
- **Central:** 365-day default based on `OccurredAtUtc`, configurable via
|
||||
`AuditLog:RetentionDays` (min 7, max 3650). Single global retention in v1 —
|
||||
no per-channel overrides.
|
||||
- **Partitioning:** monthly partitions on `OccurredAtUtc` from day one
|
||||
(`pf_AuditLog_Month` / `ps_AuditLog_Month`). Purge is a partition switch;
|
||||
there are no row-level deletes at central.
|
||||
- **Purge actor:** `AuditLogPurgeActor` singleton on the active central node
|
||||
runs daily, switches out any partition whose latest `OccurredAtUtc` is older
|
||||
than the retention window, and emits an `AuditLog:Purged` event (partition
|
||||
range, rowcount, duration). A partition-maintenance step rolls forward each
|
||||
month, creating the next month's partition ahead of time.
|
||||
- **Sites:** daily site job; default 7-day retention (configurable, min 1,
|
||||
max 90). Respects the hard `ForwardState` invariant — `Pending` rows are
|
||||
never purged, regardless of age.
|
||||
|
||||
## Security & Tamper-Evidence
|
||||
|
||||
- **Append-only enforcement.** The application accesses `AuditLog` via a
|
||||
dedicated DB role `scadalink_audit_writer` granted `INSERT` + `SELECT` only —
|
||||
no `UPDATE`, no `DELETE`. Purge runs under a separate role
|
||||
`scadalink_audit_purger` whose permissions are limited to the partition-switch
|
||||
operation; row-level `DELETE` is not granted even to purge.
|
||||
- **CI grep guard.** The build greps the data layer for any
|
||||
`UPDATE … AuditLog` or `DELETE … AuditLog` text and fails on a hit.
|
||||
- **Authorization.** Reading the Audit Log requires the existing **Audit** role
|
||||
extended with a new **OperationalAudit** permission. Per-site row scoping
|
||||
reuses the existing site-permission model; bulk export requires an additional
|
||||
**AuditExport** permission.
|
||||
- **Payload redaction at write.** See Payload Capture Policy. Unredacted
|
||||
secrets never persist; the safety net over-redacts on misconfiguration.
|
||||
- **Hash-chain tamper evidence — deferred to v1.x.** A future `RowHash` column,
|
||||
computed per partition as `SHA-256(prev.RowHash || canonical(row))`, will be
|
||||
verifiable offline via `scadalink audit verify-chain --month YYYY-MM`. Off by
|
||||
default in v1.
|
||||
- **Site SQLite security.** File permissions: read/write by the ScadaLink
|
||||
service account only. Not backed up off-machine — site SQLite is a buffer,
|
||||
not a record.
|
||||
|
||||
## KPIs
|
||||
|
||||
Point-in-time, computed from the central `AuditLog` table; global and per-site.
|
||||
|
||||
- **Audit volume** — events/min landing in the central `AuditLog`; global plus per-site sparkline.
|
||||
- **Audit error rate** — % of central `AuditLog` rows with `Status IN ('Failed', 'Parked', 'Discarded')` over a rolling 5-minute window. This is the operational error rate of audited operations (HTTP 5xx, permanent failures, parked deliveries) — NOT audit-writer health, which surfaces separately via `CentralAuditWriteFailures` and `AuditRedactionFailure`.
|
||||
- **Audit backlog** — sum of `Pending` site rows across sites; click drills into a per-site breakdown.
|
||||
|
||||
[Notification Outbox](Component-NotificationOutbox.md) and
|
||||
[Site Call Audit](Component-SiteCallAudit.md) KPIs are unaffected — they remain
|
||||
sourced from `Notifications` and `SiteCalls` respectively. Audit Log KPIs
|
||||
describe the audit table itself.
|
||||
|
||||
## Configuration
|
||||
|
||||
Bound from `appsettings.json` to a new `AuditLogOptions` class owned by this
|
||||
component (Options pattern):
|
||||
|
||||
```jsonc
|
||||
"AuditLog": {
|
||||
"DefaultCapBytes": 8192,
|
||||
"ErrorCapBytes": 65536,
|
||||
"HeaderRedactList": [ "Authorization", "Cookie", "Set-Cookie", "X-API-Key" ],
|
||||
"GlobalBodyRedactors": [
|
||||
{ "Pattern": "\"password\"\\s*:\\s*\"[^\"]+\"", "Replacement": "\"password\":\"<redacted>\"" }
|
||||
],
|
||||
"PerTargetOverrides": {
|
||||
"Weather/GetForecast": { "CapBytes": 4096 },
|
||||
"PlantDB": { "RedactSqlParamsMatching": "@apikey|@token" }
|
||||
},
|
||||
"RetentionDays": 365
|
||||
}
|
||||
```
|
||||
|
||||
`PerTargetOverrides` keys bind by External System / Inbound Method /
|
||||
Notification List / Database Connection name. `RetentionDays` is a single
|
||||
global value in v1; per-channel overrides are deferred to v1.x.
|
||||
|
||||
## Dependencies
|
||||
|
||||
- **[Commons (#16)](Component-Commons.md)** — `AuditEvent`, `IAuditWriter` /
|
||||
`ICentralAuditWriter` interfaces, and the `AuditChannel`, `AuditKind`,
|
||||
`AuditStatus` enum types live here.
|
||||
- **[Configuration Database (#17)](Component-ConfigurationDatabase.md)** — hosts
|
||||
the `AuditLog` table schema, the monthly partition function and scheme, the
|
||||
`scadalink_audit_writer` / `scadalink_audit_purger` DB roles, and the EF
|
||||
migration. Distinct concern from `IAuditService` (config-change audit), which
|
||||
is unchanged.
|
||||
- **[Cluster Infrastructure (#13)](Component-ClusterInfrastructure.md)** —
|
||||
singleton placement and supervision for `AuditLogIngestActor`,
|
||||
`SiteAuditTelemetryActor`, `SiteAuditReconciliationActor`, and
|
||||
`AuditLogPurgeActor`.
|
||||
- **[Central–Site Communication (#5)](Component-Communication.md)** — carries
|
||||
audit telemetry. New gRPC message types (`IngestAuditEvents`,
|
||||
`PullAuditEvents`) are added to the existing site-stream proto additively.
|
||||
- **[Site Runtime (#3)](Component-SiteRuntime.md)** — script-trust-boundary
|
||||
call paths invoke `IAuditWriter` to append events.
|
||||
- **[Host (#15)](Component-Host.md)** — registers this component (#23) under
|
||||
the central and site roles.
|
||||
|
||||
## Interactions
|
||||
|
||||
- **[External System Gateway (#7)](Component-ExternalSystemGateway.md)** —
|
||||
emits `ApiOutbound.ApiCall` rows on every sync `Call()`. For `CachedCall`,
|
||||
emits the combined cached telemetry packet (audit row + operational update)
|
||||
per Cached Operations — Combined Telemetry, using kinds
|
||||
`CachedSubmit` / `ApiCallCached` / `CachedResolve`.
|
||||
- **[External System Gateway (#7)](Component-ExternalSystemGateway.md) — Database layer** — the database access modes inside ESG emit `DbOutbound.DbWrite` rows on script-initiated `Connection()` calls (writes and reads share the kind; distinguish via `Extra.rowsAffected` vs `Extra.rowsReturned`); `Database.CachedWrite` emits the cached-write lifecycle rows via the combined-telemetry packet using kinds `CachedSubmit` / `DbWriteCached` / `CachedResolve` (same shape as `ApiOutbound`). Site Runtime is the API surface that exposes the `Database.*` calls to scripts; the audit emission itself lives in ESG.
|
||||
- **[Inbound API (#14)](Component-InboundAPI.md)** — emits one
|
||||
`ApiInbound.InboundRequest` row per successful request from request-handler
|
||||
middleware, written directly to central via `ICentralAuditWriter` before the
|
||||
response is flushed. Auth-layer rejections emit
|
||||
`ApiInbound.InboundAuthFailure` instead (`Status=Failed`, HTTP 401).
|
||||
- **[Notification Outbox (#21)](Component-NotificationOutbox.md)** — the
|
||||
site-emitted `Notification.NotifySend` row (`Status=Submitted`) flows via
|
||||
audit telemetry; the central dispatcher writes `Notification.NotifyDeliver`
|
||||
rows directly via `ICentralAuditWriter` — `Status=Attempted` per delivery
|
||||
attempt, `Status=Delivered`/`Parked`/`Discarded` on terminal status. The
|
||||
operational `Notifications` table is unchanged.
|
||||
- **[Site Call Audit (#22)](Component-SiteCallAudit.md)** — shares the
|
||||
cached-call telemetry packet. Central ingest of that packet performs both the
|
||||
`AuditLog` insert and the `SiteCalls` upsert in one transaction. `SiteCalls`
|
||||
remains the operational state store; the Audit Log is its immutable shadow.
|
||||
- **[Central UI (#9)](Component-CentralUI.md)** — a new **Audit** nav group
|
||||
hosts the Audit Log page (filter bar, results grid, drilldown drawer,
|
||||
server-side CSV export). Drill-in links appear on Notifications, Site Calls,
|
||||
External Systems, Inbound API keys, Sites, and Instances detail pages.
|
||||
- **[Health Monitoring (#11)](Component-HealthMonitoring.md)** — three new
|
||||
tiles (Volume, Error rate, Backlog) plus new health metrics:
|
||||
`SiteAuditBacklog`, `SiteAuditWriteFailures`, `SiteAuditTelemetryStalled`,
|
||||
`CentralAuditWriteFailures`, `AuditRedactionFailure`.
|
||||
- **[CLI (#19)](Component-CLI.md)** — new `scadalink audit query`,
|
||||
`scadalink audit export`, and `scadalink audit verify-chain` commands; same
|
||||
permission requirements as the UI.
|
||||
@@ -172,6 +172,40 @@ scadalink security scope-rule delete --id <id>
|
||||
scadalink audit-log query [--user <username>] [--entity-type <type>] [--action <action>] [--from <date>] [--to <date>] [--page <n>] [--page-size <n>]
|
||||
```
|
||||
|
||||
The legacy `audit-log query` above targets the original configuration-change audit
|
||||
(`IAuditService`) surface. The new centralized Audit Log component (#23) is exposed via
|
||||
the `scadalink audit` group below.
|
||||
|
||||
### Centralized Audit Commands
|
||||
|
||||
The `scadalink audit` group targets the centralized Audit Log component (#23) and
|
||||
exposes the UI-equivalent operational audit surface. Permissions follow the same
|
||||
read-vs-export split the Central UI uses (see Component-AuditLog.md, Security &
|
||||
Tamper-Evidence, and Security & Auth #10): `audit query` and `audit verify-chain`
|
||||
require the `OperationalAudit` permission; `audit export` additionally requires
|
||||
`AuditExport`. The server enforces permission checks and returns HTTP 403 (CLI
|
||||
exit code 2) on denial.
|
||||
|
||||
```
|
||||
scadalink audit query --since <t> [--until <t>] [--channel <c>] [--kind <k>] [--status <s>] [--site <s>] [--instance <i>] [--target <t>] [--actor <a>] [--correlation-id <id>] [--errors-only] [--page <n>] [--page-size <n>]
|
||||
scadalink audit export --since <t> --until <t> --format csv|jsonl|parquet --output <path> [--channel <c>] [--kind <k>] [--status <s>] [--site <s>] [--target <t>] [--actor <a>]
|
||||
scadalink audit verify-chain --month <YYYY-MM>
|
||||
```
|
||||
|
||||
- `audit query` — filtered query against the central `AuditLog` table, matching the
|
||||
Central UI Audit Log page filter set (time range, channel, kind, status, site,
|
||||
instance/script, target, actor, correlation ID, errors-only). Results stream as
|
||||
JSON (default) or table.
|
||||
- `audit export` — server-side streaming export of the central `AuditLog` to the
|
||||
requested format (`csv`, `jsonl`, `parquet`) written to `--output`. The server
|
||||
streams rows rather than materializing them in memory; the CLI writes bytes
|
||||
through to disk. Supports the same scoping filters as `audit query`.
|
||||
- `audit verify-chain` — hash-chain verification for the named month.
|
||||
**No-op in v1**: the command is defined so the command tree is stable, but
|
||||
verification only becomes meaningful once the hash-chain ships (see
|
||||
Component-AuditLog.md, Security & Tamper-Evidence). Until then, the server
|
||||
responds with a "verification not yet available" status and the CLI exits 0.
|
||||
|
||||
### Health Commands
|
||||
```
|
||||
scadalink health summary
|
||||
@@ -273,6 +307,8 @@ Configuration is resolved in the following priority order (highest wins):
|
||||
- **Commons**: Message contracts (`Messages/Management/`) for command type definitions and registry.
|
||||
- **System.CommandLine**: Command-line argument parsing.
|
||||
- **Microsoft.AspNetCore.SignalR.Client**: SignalR client for the `debug stream` command's WebSocket connection.
|
||||
- **Management Service (#18)**: The CLI hits the central cluster via the existing HTTP Management API (`POST /management`), which dispatches to the ManagementActor. The `scadalink audit` command group rides this same transport — there is no separate audit endpoint.
|
||||
- **Audit Log (#23)**: The `scadalink audit query`, `audit export`, and `audit verify-chain` subcommands target the centralized Audit Log component's query/export/verify surfaces via the Management API. Permission checks (`OperationalAudit`, `AuditExport`) are enforced server-side.
|
||||
|
||||
## Interactions
|
||||
|
||||
|
||||
@@ -58,6 +58,7 @@ Central cluster only. Sites have no user interface.
|
||||
### External System Management (Design Role)
|
||||
- Define external system contracts: connection details, API method definitions (parameters, return types).
|
||||
- Define retry settings per external system (max retry count, fixed time between retries).
|
||||
- The external system detail page includes a **"Recent activity"** link that opens the Audit Log page pre-filtered to `Channel = ApiOutbound` and `Target` starts-with the system name — surfacing the system's recent outbound API audit history.
|
||||
|
||||
### Database Connection Management (Design Role)
|
||||
- Define named database connections: server, database, credentials.
|
||||
@@ -65,7 +66,8 @@ Central cluster only. Sites have no user interface.
|
||||
|
||||
### Notification List Management (Design Role)
|
||||
- Create, edit, and delete notification lists.
|
||||
- Manage recipients (name + email) within each list.
|
||||
- Each notification list has a **`Type`** — `Email` now, with `Teams` and other types planned. The type determines the type-specific targets a list carries.
|
||||
- Manage recipients (name + email) within each `Email` list.
|
||||
- Configure SMTP settings.
|
||||
|
||||
### Site & Data Connection Management (Admin Role)
|
||||
@@ -73,6 +75,11 @@ Central cluster only. Sites have no user interface.
|
||||
- Define data connections and assign them to sites (name, protocol type, connection details).
|
||||
- **Data connection form**: "Primary Endpoint Configuration" (required JSON text area) and optional "Backup Endpoint Configuration" (collapsible section, hidden by default, revealed via "Add Backup Endpoint" button; "Remove Backup" button when editing an existing backup). "Failover Retry Count" numeric input (default 3, min 1, max 20) is visible only when a backup endpoint is configured.
|
||||
- **Data connection list page**: Shows Primary Config and Backup Config columns. Active Endpoint column populated from health reports.
|
||||
- The site detail page exposes a new **"Audit feed"** tab that hosts the Audit Log page pre-filtered to `Site = <site>` — an in-context view of every operational audit event for that site.
|
||||
|
||||
### Inbound API Management (Admin Role for keys, Design Role for methods)
|
||||
- Manage inbound API keys (create, enable / disable, delete) and define API methods (name, parameters, return values, approved keys, implementation script).
|
||||
- The API key detail page includes a **"Recent calls"** link that opens the Audit Log page pre-filtered to `Actor = <key name>` and `Channel = ApiInbound` — surfacing the key's recent inbound-call audit history.
|
||||
|
||||
### Area Management (Admin Role)
|
||||
- Define hierarchical area structures per site.
|
||||
@@ -88,6 +95,7 @@ Central cluster only. Sites have no user interface.
|
||||
- **Disable** instances — stops data collection, script triggers, and alarm evaluation at the site while retaining the deployed configuration.
|
||||
- **Enable** instances — re-activates a disabled instance.
|
||||
- **Delete** instances — removes the running configuration from the site. Blocked if the site is unreachable. Store-and-forward messages are not cleared.
|
||||
- The instance detail page exposes a new **"Audit feed"** tab that hosts the Audit Log page pre-filtered to the instance (`Site = <site>` and the `Instance / Script` filter set to the instance unique name) — an in-context view of every operational audit event involving that instance.
|
||||
|
||||
### Deployment (Deployment Role)
|
||||
- View list of instances with staleness indicators (deployed config differs from template-derived config).
|
||||
@@ -97,7 +105,7 @@ Central cluster only. Sites have no user interface.
|
||||
- Track deployment status (pending, in-progress, success, failed).
|
||||
|
||||
### System-Wide Artifact Deployment (Deployment Role)
|
||||
- Explicitly deploy shared scripts, external system definitions, database connection definitions, data connection definitions, notification lists, and SMTP configuration to all sites or to an individual site.
|
||||
- Explicitly deploy shared scripts, external system definitions, database connection definitions, and data connection definitions to all sites or to an individual site. (Notification lists and SMTP configuration are central-only and are not deployed.)
|
||||
- **Per-site deployment**: A "Deploy Artifacts" button on the Sites admin page allows deploying all artifacts to an individual site.
|
||||
- **Deploy all**: A bulk action deploys artifacts to all sites at once.
|
||||
- This is a **separate action** from instance deployment — system-wide artifacts are not automatically pushed when definitions change.
|
||||
@@ -114,21 +122,67 @@ Central cluster only. Sites have no user interface.
|
||||
- Subscribe-on-demand — stream starts when opened, stops when closed.
|
||||
|
||||
### Parked Message Management (Deployment Role)
|
||||
- Query sites for parked messages (external system calls, notifications, cached DB writes).
|
||||
- Query sites for parked messages (external system calls, cached DB writes). (Parked notifications are managed centrally on the Notification Outbox page, not here.)
|
||||
- View message details (target, payload, retry count, timestamps).
|
||||
- Retry or discard individual parked messages.
|
||||
|
||||
### Notification Outbox (Deployment Role)
|
||||
- Monitor and manage centrally-delivered notifications. The Notification Outbox dispatches every notification store-and-forwarded from sites and logs each one to the central `Notifications` table.
|
||||
- **KPI tiles** at the top of the page: queue depth (`Pending` + `Retrying`), stuck count, parked count, delivered in the last interval, and oldest pending age. The KPIs are central-computed on demand from the `Notifications` table.
|
||||
- A **queryable notification list** filterable by status, type, source site, notification list, and time range, with a **stuck-only toggle** and keyword search on subject. Each row shows the notification's status, retry count, last error, and key timestamps.
|
||||
- **Retry** and **Discard** actions are available on parked notifications: Retry returns the notification to `Pending` and resets `RetryCount` / `NextAttemptAt`; Discard moves it to `Discarded`. The row is retained either way so the table stays a complete audit record.
|
||||
- Each row exposes a **"View audit history"** action that opens the Audit Log page pre-filtered to `CorrelationId = NotificationId`, surfacing every operational audit event recorded for that notification.
|
||||
- **Stuck rows are visually badged** — a notification is stuck if it is `Pending` or `Retrying` and older than the configurable stuck-age threshold. Stuck detection is display-only; there is no automated escalation or alerting.
|
||||
- All queries are served from the central `Notifications` table — no remote per-site queries are needed, unlike the Parked Message Management page.
|
||||
|
||||
### Site Calls (Deployment Role)
|
||||
- Monitor cached calls store-and-forwarded from sites — `ExternalSystem.CachedCall()` and `Database.CachedWrite()` operations. Scoped to the `ExternalCall` and `DatabaseWrite` kinds only; notifications keep their separate Notification Outbox page and are not merged here.
|
||||
- A **queryable cached-call list** filterable by site, kind, status, and time range. Each row shows the call's timestamp, site, kind, target summary, status badge, retry count, and last error.
|
||||
- **Retry** and **Discard** actions are available on `Parked` rows only — `Failed` rows are not actionable, since a permanent failure would simply fail again and its error was already returned synchronously to the calling script. The actions issue central→site commands to the owning site; if the site is offline the UI surfaces a "site unreachable" message.
|
||||
- Each row exposes a **"View audit history"** action that opens the Audit Log page pre-filtered to `CorrelationId = TrackedOperationId`, showing every operational audit event recorded for that cached call.
|
||||
- Data is served from the central Site Call Audit component's `SiteCalls` table. The page is **read-mostly** — an eventually-consistent mirror of site state; the site remains the source of truth.
|
||||
|
||||
### Health Monitoring Dashboard (All Roles)
|
||||
- Overview of all sites with online/offline status.
|
||||
- Per-site detail: active/standby node status, data connection health, script error rates, alarm evaluation error rates, store-and-forward buffer depths.
|
||||
- Headline **Notification Outbox KPI tiles** — queue depth, stuck count, and parked count. These are central-computed by the Notification Outbox from the central `Notifications` table (not part of any site health report). The full outbox view is on the dedicated Notification Outbox page.
|
||||
- Headline **Site Call Audit KPI tiles** — buffered count, parked count, and failed-last-interval. These are central-computed by the Site Call Audit component from the central `SiteCalls` table (not part of any site health report). The full cached-call view is on the dedicated Site Calls page.
|
||||
- Headline **Audit KPI tiles** — three tiles in a new "Audit" KPI group: **Audit volume**, **Audit error rate**, and **Audit backlog**. These are sourced from the Audit Log component (#23) and Health Monitoring per the metric definitions in Component-HealthMonitoring.md; the dashboard simply surfaces them. The full audit query view is on the dedicated Audit Log page.
|
||||
|
||||
### Site Event Log Viewer (Deployment Role)
|
||||
- Query site event logs remotely.
|
||||
- Filter by event type, time range, instance.
|
||||
- View script executions, alarm events (activations, clears, evaluation errors), deployment events (including script compilation results), connection status changes, store-and-forward activity, instance lifecycle events (enable, disable, delete).
|
||||
|
||||
### Audit Log Viewer (Admin Role)
|
||||
- Query the central audit log.
|
||||
### Audit Log (Admin / Audit Role)
|
||||
- Lives under a **new top-level "Audit" nav group** (sibling to Notifications). In v1 the Audit nav group contains this Audit Log page alongside the pre-existing Configuration Audit Log Viewer, which remains its own separate page (described below).
|
||||
- Global query / filter / drilldown over the central `AuditLog` table maintained by the Audit Log component (#23). Read-only — the table is append-only, so there are no edit actions on rows.
|
||||
- Read access to the page requires the `OperationalAudit` permission (Security & Auth #10). Per-site row scoping reuses the existing site-permission model: a user sees only rows for sites they are authorized to operate. Bulk export (see below) additionally requires `AuditExport`. The split mirrors the CLI's permission model (see Component-CLI.md).
|
||||
- **Filter bar** (top of page, collapses to a single row when not focused):
|
||||
- Time range — relative (15m / 1h / 24h / 7d) or custom.
|
||||
- Channel — multi-select: `ApiOutbound`, `DbOutbound`, `Notification`, `ApiInbound`.
|
||||
- Kind — multi-select; the available options are filtered by the selected Channels.
|
||||
- Status — multi-select.
|
||||
- Site — multi-select, scoped to the user's authorized sites.
|
||||
- Instance / Script — text search with autocomplete.
|
||||
- Target — text search (system + method, DB connection, list name).
|
||||
- Actor — text search (inbound API key name).
|
||||
- CorrelationId — paste a `TrackedOperationId` / `NotificationId` / request-id to see the full event sequence for one operation.
|
||||
- "Errors only" toggle — shorthand for `Status NOT IN (Success, Delivered, Enqueued)`.
|
||||
- **Results grid** (custom Blazor + Bootstrap component, consistent with the rest of the UI — no third-party grid):
|
||||
- Columns, all resizable and reorderable, persisted per user: `OccurredAtUtc`, `Site`, `Channel`, `Kind`, `Status`, `Target`, `Actor`, `DurationMs`, `HttpStatus`, `ErrorMessage`.
|
||||
- Keyset pagination ordered by `(OccurredAtUtc desc, EventId desc)`. Default page size 100.
|
||||
- Clicking a row opens the drilldown drawer.
|
||||
- **Drilldown drawer**:
|
||||
- Pretty-prints `RequestSummary` / `ResponseSummary` — JSON is auto-detected and syntax-highlighted; SQL is syntax-highlighted.
|
||||
- Surfaces **redaction indicators** wherever headers or fields were stripped at write time, per the Audit Log component's "Payload Capture Policy".
|
||||
- **"Copy as cURL"** action on `ApiOutbound` and `ApiInbound` rows.
|
||||
- **"Show all events for this operation"** link — re-applies the current view filtered by the row's `CorrelationId`.
|
||||
- **Export** button in the page header streams a server-side CSV of the rows matching the current filter (default cap 100k rows; larger exports go through the CLI). Requires the `AuditExport` permission.
|
||||
|
||||
### Configuration Audit Log Viewer (Admin Role)
|
||||
- Pre-existing viewer for the `IAuditService` configuration-change log (template / instance / site / etc. before-after edits). Lives under the same **Audit** nav group as the operational Audit Log above.
|
||||
- Query the central configuration audit log.
|
||||
- Filter by user, entity type, action type, time range.
|
||||
- View before/after state for each change.
|
||||
|
||||
@@ -144,3 +198,6 @@ Central cluster only. Sites have no user interface.
|
||||
- **Security & Auth**: Authenticates users and enforces role-based access.
|
||||
- **Configuration Database**: All central data, including audit log data for the audit log viewer. Accessed via `ICentralUiRepository`.
|
||||
- **Health Monitoring**: Provides site health data for the dashboard.
|
||||
- **Notification Outbox**: Provides notification delivery KPIs and serves the `Notifications` table queries and Retry/Discard actions for the Notification Outbox page.
|
||||
- **Site Call Audit**: Serves the `SiteCalls` table queries and relays Retry/Discard actions to sites for the Site Calls page.
|
||||
- **Audit Log (#23)**: Serves all `AuditLog` table queries (filter / grid / drilldown / CSV export) for the new Audit Log page and the drill-in surfaces on Notifications, Site Calls, External Systems, Inbound API keys, Sites, and Instances. Payload capture, redaction, and per-site authorization follow the Audit Log component's "Payload Capture Policy" and "Security & Tamper-Evidence" sections.
|
||||
|
||||
@@ -54,6 +54,23 @@ remains the home of the configuration contract that the Host consumes.
|
||||
- Connected to local SQLite databases (store-and-forward buffer, event logs, deployed configurations).
|
||||
- Connected to machines via data connections (OPC UA).
|
||||
|
||||
## Cluster Singletons
|
||||
|
||||
Akka.NET cluster singletons run on the active node of their cluster and migrate on failover. Each singleton listed here is owned by the named component; this component (Cluster Infrastructure) provides only the hosting, supervision, and active-node placement guarantee.
|
||||
|
||||
### Central singletons (active central node)
|
||||
|
||||
- **`NotificationOutboxActor`** — owned by Notification Outbox (#21). Drives the central notification dispatch loop against the `Notifications` table.
|
||||
- **`SiteCallAuditActor`** — owned by Site Call Audit (#22). Owns the operational `SiteCalls` table: drives periodic reconciliation pulls for `CachedCall` / `CachedWrite` lifecycle, computes KPIs, and relays operator Retry/Discard actions to the owning site. Note: ingest of cached-call telemetry is performed by `AuditLogIngestActor` (#23) in one transaction with the immutable `AuditLog` insert — see Component-AuditLog.md, Cached Operations — Combined Telemetry.
|
||||
- **`AuditLogIngestActor`** — owned by Audit Log (#23). Receives gRPC telemetry batches of `AuditEvent` rows from sites and performs insert-if-not-exists on `EventId` against the central `AuditLog` table. For cached-call telemetry (which carries both audit-row content and operational-state fields in a single packet), the ingest performs the `AuditLog` insert and the `SiteCalls` upsert in **one transaction** — see Component-AuditLog.md for the combined-telemetry contract.
|
||||
- **`SiteAuditReconciliationActor`** — owned by Audit Log (#23). Periodic per-site pull (default every 5 minutes) that self-heals missed audit telemetry by asking each site for its oldest `ForwardState = 'Pending'` row and issuing a `PullAuditEvents(sinceUtc, batchSize)` when a non-draining backlog is detected.
|
||||
- **`AuditLogPurgeActor`** — owned by Audit Log (#23). Daily partition-switch purge against `ps_AuditLog_Month`; switches out any partition older than `AuditLog:RetentionDays` and emits an `AuditLog:Purged` event. Also rolls the partition scheme forward each month so the next month's partition exists ahead of time.
|
||||
|
||||
### Site singletons (active site node, per site cluster)
|
||||
|
||||
- **Site Runtime Deployment Manager** — owned by Site Runtime (#3). Owns the full Instance Actor hierarchy; re-creates it on failover from local SQLite.
|
||||
- **`SiteAuditTelemetryActor`** — owned by Audit Log (#23). Drains `ForwardState = 'Pending'` rows from the site-local SQLite `AuditLog` to central in batches via the existing `SiteStream` gRPC channel; the cadence is short (default 5 s) when the queue is non-empty and longer (default 30 s) when idle. Runs on a **dedicated dispatcher** so it does not compete with the script blocking-I/O dispatcher (per Component-AuditLog.md, Ingestion Paths → Telemetry forward).
|
||||
|
||||
## Failover Behavior
|
||||
|
||||
### Detection
|
||||
|
||||
@@ -35,6 +35,13 @@ Commons must define shared primitive and utility types used across multiple comp
|
||||
- **`AlarmLevel` enum**: None, Low, LowLow, High, HighHigh. Severity level for an active alarm; always `None` for binary trigger types, set by `HiLo` triggers.
|
||||
- **`AlarmTriggerType` enum**: ValueMatch, RangeViolation, RateOfChange, HiLo.
|
||||
- **`ConnectionHealth` enum**: Connected, Disconnected, Connecting, Error.
|
||||
- **`TrackedOperationId`**: A GUID identifying a tracked store-and-forward operation (`ExternalSystem.CachedCall`, `Database.CachedWrite`, `Notify.Send`). Generated caller-side at the site at call time, returned to the script as a tracking handle, and reused as the idempotency key for telemetry sent to central. The notification domain's existing `NotificationId` is the notification-specific name for this same concept.
|
||||
- **`TrackedOperationKind` enum**: ExternalCall, DatabaseWrite. Discriminates the two cached-call kinds carried by a tracked operation (notifications are tracked separately via the `NotificationType` enum).
|
||||
- **`TrackedOperationStatus` enum**: Pending, Retrying, Delivered, Parked, Failed, Discarded. The unified lifecycle state shared by all tracked store-and-forward operations. This is the operation's externally-observable lifecycle status in the site-local tracking table (the status record); it is related to but distinct from the S&F buffer's own `StoreAndForwardMessageStatus`, which tracks a buffered message's retry state within the buffer (the retry mechanism). `Failed` (permanent failure) has no notification analogue — notifications use only the other five states (the `NotificationStatus` enum omits `Failed`).
|
||||
- **`AuditChannel` enum**: ApiOutbound, DbOutbound, Notification, ApiInbound. Discriminates the script-trust-boundary channel that produced an `AuditEvent`. Owned by the Audit Log component.
|
||||
- **`AuditKind` enum**: SyncCall, CachedEnqueued, CachedAttempt, CachedTerminal, SyncWrite, SyncRead, Enqueued, Attempt, Terminal, Completed. Channel-specific event kind — the valid `Kind` values for each `AuditChannel` are listed in the Audit Log component design (`Component-AuditLog.md`).
|
||||
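- **`AuditStatus` enum**: Success, TransientFailure, PermanentFailure, Enqueued, Retrying, Delivered, Parked, Discarded. Outcome of a single audit event row. It shares `Retrying`, `Delivered`, `Parked`, and `Discarded` with `TrackedOperationStatus` and adds per-event outcomes (`Success`, `TransientFailure`, `PermanentFailure`, `Enqueued`) so that it also covers one-shot sync calls; it is not a strict superset, since `Pending` and `Failed` have no `AuditStatus` member.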
|
||||
- **`AuditEvent`**: A record carrying every column of the central `AuditLog` row — `EventId` (GUID, idempotency key), `OccurredAtUtc`, `IngestedAtUtc`, `Channel` (`AuditChannel`), `Kind` (`AuditKind`), `CorrelationId`, `SourceSiteId`, `SourceInstanceId`, `SourceScript`, `Actor`, `Target`, `Status` (`AuditStatus`), `HttpStatus`, `DurationMs`, `ErrorMessage`, `ErrorDetail`, `RequestSummary`, `ResponseSummary`, `PayloadTruncated`, `Extra` — plus a site-only `ForwardState` (`Pending` | `Forwarded` | `Reconciled`) used by the site SQLite write-buffer's telemetry/reconciliation loop. `IngestedAtUtc` is unset at the site and stamped on central ingest. See `Component-AuditLog.md` for the persistence schema and ingest semantics.
|
||||
|
||||
Types defined here must be immutable and thread-safe.
|
||||
|
||||
@@ -65,12 +72,14 @@ Entity classes are organized by domain area:
|
||||
- **Shared Scripts**: `SharedScript`.
|
||||
- **Sites & Data Connections**: `Site`, `DataConnection`.
|
||||
- **External Systems & Database Connections**: `ExternalSystemDefinition`, `ExternalSystemMethod`, `DatabaseConnectionDefinition`.
|
||||
- **Notifications**: `NotificationList`, `NotificationRecipient`, `SmtpConfiguration`.
|
||||
- **Notifications**: `NotificationList` (carries a `Type` field — `NotificationType` enum — selecting the list's notification type and its type-specific targets), `NotificationRecipient`, `SmtpConfiguration`, `Notification` (the durable central-queue row — see below).
|
||||
- **Inbound API**: `ApiKey`, `ApiMethod`.
|
||||
- **Security**: `LdapGroupMapping`, `SiteScopeRule`.
|
||||
- **Deployment**: `DeploymentRecord`, `SystemArtifactDeploymentRecord`, `DeployedConfigSnapshot`.
|
||||
- **Audit**: `AuditLogEntry`.
|
||||
|
||||
The **`Notification`** entity is the persistence-ignorant POCO for a row of the central `Notifications` table — the durable notification queue owned by the Notification Outbox. It is a plain class with properties for `NotificationId` (GUID, the idempotency key), `Type` (`NotificationType` enum discriminator), `ListName`, `Subject`, `Body`, `TypeData` (a JSON string — the type-agnostic extensibility hook), `Status` (`NotificationStatus` enum), `RetryCount`, `LastError`, `ResolvedTargets`, the provenance fields `SourceSiteId` / `SourceInstanceId` / `SourceScript`, and the UTC timestamps `SiteEnqueuedAt`, `CreatedAt`, `LastAttemptAt`, `NextAttemptAt`, `DeliveredAt`. As with every entity class it has no EF dependency; the Configuration Database component supplies the Fluent API mapping, value conversions, and indexes. The `Type` and `Status` enums (`NotificationType`: `Email`, `Teams`, …; `NotificationStatus`: `Pending`, `Retrying`, `Delivered`, `Parked`, `Discarded`) are defined under `Types/Enums/` per REQ-COM-1.
|
||||
|
||||
### REQ-COM-4: Per-Component Repository Interfaces
|
||||
|
||||
Commons must define repository interfaces that consuming components use for data access. Each interface is tailored to the data needs of its consuming component:
|
||||
@@ -80,7 +89,9 @@ Commons must define repository interfaces that consuming components use for data
|
||||
- `ISecurityRepository` — LDAP group mappings, site scoping rules.
|
||||
- `IInboundApiRepository` — API keys, API method definitions.
|
||||
- `IExternalSystemRepository` — External system definitions, method definitions, database connection definitions.
|
||||
- `INotificationRepository` — Notification lists, recipients, SMTP configuration.
|
||||
- `INotificationRepository` — Notification lists (including the `Type` field), recipients, SMTP configuration.
|
||||
- `INotificationOutboxRepository` — The `Notifications` table: insert-if-not-exists ingest on `NotificationId`, due-row polling (`Pending` rows and `Retrying` rows past `NextAttemptAt`), status transitions, KPI aggregate queries, and the bulk delete of terminal rows used by the daily purge job.
|
||||
- `ISiteCallAuditRepository` — The `SiteCalls` table: insert-if-not-exists ingest on `TrackedOperationId`, upsert-on-newer-status from telemetry and reconciliation pulls, KPI aggregate queries, and the bulk delete of terminal rows used by the daily purge job.
|
||||
- `ISiteRepository` — Sites, data connections, and their site assignments.
|
||||
- `ICentralUiRepository` — Read-oriented queries spanning multiple domain areas for display purposes.
|
||||
|
||||
@@ -100,6 +111,8 @@ Commons must define service interfaces for cross-cutting concerns that multiple
|
||||
- **`IExternalSystemClient`**: Provides script-facing invocation of external system HTTP APIs (synchronous `Call` and store-and-forward `CachedCall`). Implemented by the External System Gateway, consumed by the script runtime context.
|
||||
- **`IInstanceLocator`**: Resolves an instance unique name to its site identifier. Used by the Inbound API's `Route.To()` to determine the destination site.
|
||||
- **`INotificationDeliveryService`**: Sends notifications to a named notification list, routing transient failures to store-and-forward. Implemented by the Notification Service, consumed by the script runtime context.
|
||||
- **`IAuditWriter`**: Site-local hot-path interface for appending an `AuditEvent` to the site SQLite `AuditLog`: `Task WriteAsync(AuditEvent evt, CancellationToken ct)`. Single durable INSERT, `ForwardState = Pending`. Consumed by the script-trust-boundary call paths (External System Gateway, Database layer, Store-and-Forward Engine). Implementation lives in the Audit Log component.
|
||||
- **`ICentralAuditWriter`**: Central direct-write interface for central-originated audit rows (Inbound API request completion, Notification Outbox dispatcher attempts/terminals): `Task WriteAsync(AuditEvent evt, CancellationToken ct)`, with insert-if-not-exists semantics on `EventId` so retried handlers cannot produce duplicates. Implementation lives in the Audit Log component.
|
||||
|
||||
These interfaces are defined in Commons so that consuming components depend only on the abstraction, not on the implementing component.
|
||||
|
||||
@@ -115,6 +128,10 @@ Commons must define the shared DTOs and message contracts used for inter-compone
|
||||
- **Debug View DTOs**: Subscribe/unsubscribe requests, one-shot snapshot request (`DebugSnapshotRequest`), initial snapshot, stream filter criteria.
|
||||
- **Script Execution DTOs**: Script call requests (with recursion depth), return values, error results.
|
||||
- **System-Wide Artifact DTOs**: Shared script packages, external system definitions, database connection definitions, notification list definitions.
|
||||
- **Notification DTOs**: `NotificationSubmit` (site→central submission: `NotificationId`, `ListName`, `Subject`, `Body`, provenance, `SiteEnqueuedAt`) and `NotificationSubmitAck` (central acknowledgement returned only after the `Notifications` row is persisted — ack-after-persist — which the site Store-and-Forward Engine waits on before clearing the buffered message). `NotificationStatusQuery` / `NotificationStatusResponse` back the `Notify.Status` script API, round-tripping a status record (status, retry count, last error, key timestamps) once a notification has been forwarded. Recipient resolution is *not* part of any contract — the site forwards only `(listName, subject, body)` and central resolves the list at delivery time. Subject to the additive-only evolution rules in REQ-COM-5a, since a submission can cross the site→central version-skew boundary.
|
||||
- **Cached Call Tracking DTOs**: `CachedCallTelemetry` (site→central lifecycle telemetry for a tracked cached call: `TrackedOperationId`, source site, `Kind` — the `TrackedOperationKind` enum (`ExternalCall` / `DatabaseWrite`) — target summary, status, retry count, last error, key timestamps, and source instance / script provenance) and `CachedCallReconcileRequest` / `CachedCallReconcileResponse` (cursor-based per-site pull of tracking rows changed since a cursor, used so missed telemetry self-heals). All three live in the `Integration/` message folder and are subject to the additive-only evolution rules in REQ-COM-5a, since they cross the site→central version-skew boundary. `CachedCallTelemetry` is additively extended to also carry the `AuditEvent` content for the corresponding lifecycle transition (`CachedEnqueued` / `CachedAttempt` / `CachedTerminal`), so one packet drives both the `SiteCalls` operational upsert and the `AuditLog` insert-if-not-exists in a single central transaction — see [Component-AuditLog.md](Component-AuditLog.md), Cached Operations — Combined Telemetry.
|
||||
- **Parked Operation Command DTOs**: `RetryParkedOperation` and `DiscardParkedOperation` (central→site command/control messages keyed by `TrackedOperationId`, instructing the owning site to retry or discard a parked store-and-forward operation). These generalize the existing parked-message retry/discard commands to also cover parked cached calls; they live in the `RemoteQuery/` message folder alongside the other parked-message management messages.
|
||||
- **Audit Telemetry DTOs**: `AuditTelemetryEnvelope` (site→central gRPC message wrapping a batch of `AuditEvent` rows for the `IngestAuditEvents` telemetry call) and the matching reconciliation pull messages (`PullAuditEvents` request/response carrying a `sinceUtc` cursor and a batch of `AuditEvent` rows). Live in the `Integration/` message folder, subject to the additive-only evolution rules in REQ-COM-5a since they cross the site→central version-skew boundary. Cached-operation audit rows do **not** travel via `AuditTelemetryEnvelope` — they are folded into `CachedCallTelemetry` per the bullet above.
|
||||
|
||||
All message types must be `record` types or immutable classes suitable for use as Akka.NET messages (though Commons itself must not depend on Akka.NET).
|
||||
|
||||
@@ -141,10 +158,15 @@ ScadaLink.Commons/
|
||||
│ ├── StaleTagMonitor.cs # heartbeat staleness watchdog
|
||||
│ ├── ValueFormatter.cs # culture-invariant value-to-string helper
|
||||
│ ├── DynamicJsonElement.cs # dynamic JSON wrapper for scripts
|
||||
│ ├── TrackedOperationId.cs # tracked store-and-forward operation ID (GUID)
|
||||
│ ├── Enums/ # InstanceState, DeploymentStatus, AlarmState,
|
||||
│ │ # AlarmLevel, AlarmTriggerType, ConnectionHealth,
|
||||
│ │ # DataType, StoreAndForwardCategory,
|
||||
│ │ # StoreAndForwardMessageStatus
|
||||
│ │ # StoreAndForwardMessageStatus,
|
||||
│ │ # NotificationType, NotificationStatus,
|
||||
│ │ # TrackedOperationKind, TrackedOperationStatus,
|
||||
│ │ # AuditChannel, AuditKind, AuditStatus
|
||||
│ ├── Audit/ # AuditEvent record (site + central audit row)
|
||||
│ ├── DataConnections/ # OPC UA endpoint config value objects + enums
|
||||
│ ├── Flattening/ # FlattenedConfiguration, ConfigurationDiff,
|
||||
│ │ # DeploymentPackage, ValidationResult
|
||||
@@ -158,10 +180,14 @@ ScadaLink.Commons/
|
||||
│ │ ├── IInboundApiRepository.cs
|
||||
│ │ ├── IExternalSystemRepository.cs
|
||||
│ │ ├── INotificationRepository.cs
|
||||
│ │ ├── INotificationOutboxRepository.cs
|
||||
│ │ ├── ISiteCallAuditRepository.cs
|
||||
│ │ ├── ISiteRepository.cs
|
||||
│ │ └── ICentralUiRepository.cs
|
||||
│ └── Services/ # REQ-COM-4a: Cross-cutting service interfaces
|
||||
│ ├── IAuditService.cs
|
||||
│ ├── IAuditWriter.cs
|
||||
│ ├── ICentralAuditWriter.cs
|
||||
│ ├── IDatabaseGateway.cs
|
||||
│ ├── IExternalSystemClient.cs
|
||||
│ ├── IInstanceLocator.cs
|
||||
@@ -174,7 +200,8 @@ ScadaLink.Commons/
|
||||
│ ├── Sites/ # Site, DataConnection
|
||||
│ ├── ExternalSystems/ # ExternalSystemDefinition, ExternalSystemMethod,
|
||||
│ │ # DatabaseConnectionDefinition
|
||||
│ ├── Notifications/ # NotificationList, NotificationRecipient, SmtpConfiguration
|
||||
│ ├── Notifications/ # NotificationList, NotificationRecipient, SmtpConfiguration,
|
||||
│ │ # Notification (central Notifications-table row)
|
||||
│ ├── InboundApi/ # ApiKey, ApiMethod
|
||||
│ ├── Security/ # LdapGroupMapping, SiteScopeRule
|
||||
│ ├── Deployment/ # DeploymentRecord, SystemArtifactDeploymentRecord,
|
||||
@@ -192,9 +219,14 @@ ScadaLink.Commons/
|
||||
│ ├── Artifacts/
|
||||
│ ├── DataConnection/ # data-connection subscribe/write/health messages
|
||||
│ ├── Instance/ # attribute get/set request/command messages
|
||||
│ ├── Integration/ # external-integration call request/response
|
||||
│ ├── Integration/ # external-integration call request/response,
|
||||
│ │ # cached-call tracking telemetry + reconcile,
|
||||
│ │ # audit telemetry envelope + reconcile
|
||||
│ ├── Notification/ # NotificationSubmit + ack,
|
||||
│ │ # NotificationStatusQuery/Response
|
||||
│ ├── InboundApi/ # Route.To() request messages
|
||||
│ ├── RemoteQuery/ # event-log and parked-message query messages
|
||||
│ ├── RemoteQuery/ # event-log and parked-message query messages,
|
||||
│ │ # parked-operation retry/discard commands
|
||||
│ └── Management/ # HTTP/ClusterClient management commands + registry
|
||||
├── Serialization/ # OpcUaEndpointConfigSerializer (typed↔legacy JSON)
|
||||
└── Validators/ # OpcUaEndpointConfigValidator
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
|
||||
## Purpose
|
||||
|
||||
The Communication component manages all messaging between the central cluster and site clusters. It provides the transport layer for deployments, instance lifecycle commands, integration routing, debug streaming, health reporting, and remote queries (parked messages, event logs). Two transports are used: **Akka.NET ClusterClient** for command/control messaging and **gRPC server-streaming** for real-time data (attribute values, alarm states).
|
||||
The Communication component manages all messaging between the central cluster and site clusters. It provides the transport layer for deployments, instance lifecycle commands, integration routing, debug streaming, health reporting, notification submission, and remote queries (parked messages, event logs). Two transports are used: **Akka.NET ClusterClient** for command/control messaging and **gRPC server-streaming** for real-time data (attribute values, alarm states).
|
||||
|
||||
## Location
|
||||
|
||||
@@ -36,7 +36,7 @@ Both central and site clusters. Each side has communication actors that handle m
|
||||
|
||||
### 3. System-Wide Artifact Deployment (Central → Site(s))
|
||||
- **Pattern**: Broadcast with per-site acknowledgment (deploy to all sites), or targeted to a single site (per-site deployment).
|
||||
- When shared scripts, external system definitions, database connections, data connections, notification lists, or SMTP configuration are explicitly deployed, central sends them to the target site(s).
|
||||
- When shared scripts, external system definitions, database connections, or data connections are explicitly deployed, central sends them to the target site(s). (Notification lists and SMTP configuration are central-only and are not deployed to sites.)
|
||||
- Each site acknowledges receipt and reports success/failure independently.
|
||||
- **Shared script deployment triggers immediate recompilation on the site** — the site's `SharedScriptLibrary` replaces its in-memory compiled code, making updated shared scripts available to all running instances without redeployment. Other artifact types (external systems, database connections, etc.) are stored but do not require recompilation.
|
||||
|
||||
@@ -122,7 +122,22 @@ Keepalive settings are configurable via `CommunicationOptions`:
|
||||
- Site event logs.
|
||||
- Instance debug snapshots (attribute values and alarm states).
|
||||
- Central can also send management commands:
|
||||
- Retry or discard parked messages.
|
||||
- Retry or discard parked messages and parked cached calls — central sends `RetryParkedOperation` / `DiscardParkedOperation` (keyed by `TrackedOperationId`) to the owning site, which applies the change to its S&F buffer and tracking table.
|
||||
|
||||
### 9. Notification Submission (Site → Central)
|
||||
- **Pattern**: Fire-and-forget with acknowledgment.
|
||||
- The site **Store-and-Forward Engine** sends a `NotificationSubmit` message to central carrying the notification — `NotificationId`, target list name, subject, body, and source provenance.
|
||||
- Central ingests the submission with an insert-if-not-exists on `NotificationId` and acknowledges **after the row is persisted** to the `Notifications` table in the central configuration database. The site S&F engine clears the buffered message only on that ack.
|
||||
- The `NotificationId` GUID — generated at the site — is the **idempotency key**. The handoff is at-least-once: a re-sent submission after a lost ack is harmless because central's insert-if-not-exists treats the duplicate as a no-op.
|
||||
- **Transport**: ClusterClient (site→central command/control), consistent with how other site→central messages are sent.
|
||||
|
||||
### 10. Cached Call Telemetry (Site → Central)
|
||||
- **Pattern**: Fire-and-forget telemetry with a periodic reconciliation pull.
|
||||
- The site **Store-and-Forward Engine** emits a `CachedCallTelemetry` message to central on **every** cached-call lifecycle transition (`Pending → Retrying → Delivered / Parked / Failed / Discarded`). The first telemetry event for an operation carries its initial status — `Pending` when a transient failure has buffered the call, or directly `Delivered`/`Failed` for a cached call that never buffers. The message carries the `TrackedOperationId`, source site, `Kind` (the `TrackedOperationKind` enum), target summary, status, retry count, last error, key timestamps, and source provenance.
|
||||
- Emission is **best-effort and at-least-once**, and ingestion is **idempotent on `TrackedOperationId`** — central's Site Call Audit component ingests with insert-if-not-exists then upsert-on-newer-status, so a re-sent or out-of-order event is harmless.
|
||||
- **Reconciliation pull**: because telemetry is best-effort, the central **Site Call Audit** component periodically — and on site reconnect — issues a `CachedCallReconcileRequest` to each site; the site replies with a `CachedCallReconcileResponse` carrying all tracking rows changed since a cursor. Any telemetry missed during a disconnect self-heals through this pull.
|
||||
- Central audit is an **eventually-consistent mirror** — the site's operation tracking table remains the source of truth for cached-call status (`Tracking.Status(id)` is always answered site-locally).
|
||||
- **Transport**: ClusterClient (site→central command/control), consistent with how other site→central messages are sent.
|
||||
|
||||
## Topology
|
||||
|
||||
@@ -174,6 +189,8 @@ Each request/response pattern has a default timeout that can be overridden in co
|
||||
| 4. Integration Routing | 30 seconds | External system waiting for response; Inbound API per-method timeout may cap this further |
|
||||
| 5. Recipe/Command Delivery | 30 seconds | Fire-and-forget with ack |
|
||||
| 8. Remote Queries | 30 seconds | Querying parked messages or event logs |
|
||||
| 9. Notification Submission | 30 seconds | Fire-and-forget with ack; central acks after persisting the row |
|
||||
| 10. Cached Call Telemetry | 30 seconds | Reconciliation pull is request/response; telemetry emission itself is fire-and-forget |
|
||||
|
||||
Timeouts use the Akka.NET **ask pattern**. If no response is received within the timeout, the caller receives a timeout failure.
|
||||
|
||||
@@ -229,6 +246,7 @@ The ManagementActor is registered at the well-known path `/user/management` on c
|
||||
- **Site Runtime**: Receives deployments, lifecycle commands, and artifact updates. Provides debug view data.
|
||||
- **Central UI**: Debug view requests and remote queries flow through communication.
|
||||
- **Health Monitoring**: Receives periodic health reports from sites.
|
||||
- **Store-and-Forward Engine (site)**: Parked message queries/commands are routed through communication.
|
||||
- **Store-and-Forward Engine (site)**: Parked message queries/commands are routed through communication. Also emits `CachedCallTelemetry` and answers `CachedCallReconcileRequest` pulls, and receives relayed `RetryParkedOperation` / `DiscardParkedOperation` commands.
|
||||
- **Site Call Audit (central)**: Receives cached-call telemetry and reconciliation responses; issues reconciliation pulls and relays parked-operation Retry/Discard commands to sites through communication.
|
||||
- **Site Event Logging**: Event log queries are routed through communication.
|
||||
- **Management Service**: The ManagementActor is registered with ClusterClientReceptionist on central nodes. The CLI communicates with the ManagementActor via ClusterClient, which is a separate channel from inter-cluster remoting.
|
||||
|
||||
@@ -52,9 +52,16 @@ The configuration database stores all central system data, organized by domain a
|
||||
- **Database Connection Definitions**: Named database connections (name, connection details, retry settings).
|
||||
|
||||
### Notifications
|
||||
- **Notification Lists**: List definitions (name).
|
||||
- **Notification Lists**: List definitions (name, `Type` discriminator — `Email` / `Teams` / … — selecting the list's notification type and type-specific targets).
|
||||
- **Notification Recipients**: Recipients per list (name, email address).
|
||||
- **SMTP Configuration**: Email server settings.
|
||||
- **Notifications**: The durable central notification queue owned by the Notification Outbox — one row per notification, the single source of audit truth. The schema is **type-agnostic** so it records any notification type the system supports (email today, Microsoft Teams and others later): a `Type` discriminator selects the type, and a `TypeData` JSON column (`nvarchar(max)`) carries any future per-type fields without a schema change. Columns: `NotificationId` (GUID, primary key — generated at the site, used as the idempotency key), `Type`, `ListName`, `Subject`, `Body`, `TypeData`, `Status`, `RetryCount`, `LastError`, `ResolvedTargets`, `SourceSiteId`, `SourceInstanceId`, `SourceScript`, `SiteEnqueuedAt`, `CreatedAt`, `LastAttemptAt`, `NextAttemptAt`, `DeliveredAt`. `Status` is a `NotificationStatus` enum stored with values `Pending`, `Retrying`, `Delivered`, `Parked`, `Discarded` (the site-local `Forwarding` state is never persisted centrally). Indexed on `Status` and `NextAttemptAt` for efficient dispatcher polling of due rows, and on `SourceSiteId` and `CreatedAt` for KPI computation and the Central UI query page. Terminal rows are removed by a daily purge job — see Scheduled Maintenance below. See Component-NotificationOutbox.md for the full lifecycle.
|
||||
|
||||
### Site Calls
|
||||
- **SiteCalls**: The central audit table for cached site calls — `ExternalSystem.CachedCall()` and `Database.CachedWrite()` — owned by the Site Call Audit component and a sibling of the `Notifications` table. One row per cached operation. Columns: `TrackedOperationId` (GUID, primary key — generated site-side at call time, used as the idempotency key), `SourceSite`, `Kind` (a `TrackedOperationKind` enum stored with values `ExternalCall` / `DatabaseWrite`), `TargetSummary` (external system + method for an `ExternalCall`, database connection name for a `DatabaseWrite`), `Status` (a `TrackedOperationStatus` enum stored with values `Pending`, `Retrying`, `Delivered`, `Parked`, `Failed`, `Discarded`), `RetryCount`, `LastError`, `Provenance` (source instance / script), `CreatedAtUtc`, `UpdatedAtUtc`, `TerminalAtUtc`. The table is populated **only** by Site Call Audit telemetry and reconciliation pulls — sites are the source of truth and the row is an eventually-consistent mirror, never written by a central dispatcher. Ingestion is **insert-if-not-exists** keyed on `TrackedOperationId`, then **upsert-on-newer-status**; the lifecycle is monotonic, so at-least-once and out-of-order telemetry are harmless. Indexed on `Status` and `SourceSite` for KPI computation and the Central UI query page. Terminal rows are removed by a daily purge job — see Scheduled Maintenance below. See Component-SiteCallAudit.md for the full lifecycle.
|
||||
|
||||
### Audit Log
|
||||
- **AuditLog**: The central, append-only audit table owned by the Audit Log component — one row per script-trust-boundary lifecycle event across all channels (outbound API calls, outbound DB writes/reads, notifications, and inbound API requests). Sibling of the `Notifications` and `SiteCalls` tables but distinct: `AuditLog` is the immutable history that observes the other subsystems, not an operational state store. Columns: `EventId` (`uniqueidentifier` primary key — generated at the originator, used as the idempotency key), `OccurredAtUtc` (`datetime2`), `IngestedAtUtc` (`datetime2`), `Channel` (`varchar(32)` — `ApiOutbound` / `DbOutbound` / `Notification` / `ApiInbound`), `Kind` (`varchar(32)` — channel-specific event kind), `CorrelationId` (`uniqueidentifier` NULL — `TrackedOperationId` for cached calls, `NotificationId` for notifications, request-id for inbound API), `SourceSiteId` (`varchar(64)` NULL), `SourceInstanceId` (`varchar(128)` NULL), `SourceScript` (`varchar(128)` NULL), `Actor` (`varchar(128)` NULL), `Target` (`varchar(256)` NULL), `Status` (`varchar(32)` — outcome of *this event*: `Success`, `TransientFailure`, `PermanentFailure`, `Enqueued`, `Retrying`, `Delivered`, `Parked`, `Discarded`), `HttpStatus` (`int` NULL), `DurationMs` (`int` NULL), `ErrorMessage` (`nvarchar(1024)` NULL), `ErrorDetail` (`nvarchar(max)` NULL), `RequestSummary` (`nvarchar(max)` NULL — truncated request payload, headers redacted), `ResponseSummary` (`nvarchar(max)` NULL — truncated response payload), `PayloadTruncated` (`bit`), `Extra` (`nvarchar(max)` NULL — channel-specific JSON for fields not promoted to columns). 
Indexes: `IX_AuditLog_OccurredAtUtc` (primary time-range index for global scans), `IX_AuditLog_Site_Occurred (SourceSiteId, OccurredAtUtc)` (per-site filters), `IX_AuditLog_Correlation (CorrelationId)` (drilldown from a single operation), `IX_AuditLog_Channel_Status_Occurred (Channel, Status, OccurredAtUtc)` (KPI / dashboard tiles), and `IX_AuditLog_Target_Occurred (Target, OccurredAtUtc)` ("what did we send to system X"). The primary key on `EventId` enforces idempotency — central ingest is `INSERT … WHERE NOT EXISTS`, so at-least-once telemetry and reconciliation retries collapse to a single row. **Monthly partitioning** on `OccurredAtUtc` from day one via partition function `pf_AuditLog_Month` and partition scheme `ps_AuditLog_Month`, with a filegroup-per-month rollover so that retention purge is a partition switch rather than a row-level delete. The partition-maintenance job that rolls the scheme forward and switches expired partitions is owned by the Audit Log component, not this component. The table is populated only by Audit Log writers (site telemetry, central direct-write, reconciliation pulls); central ingest is **insert-if-not-exists** keyed on `EventId`. See Component-AuditLog.md for the full lifecycle, payload-capture policy, and ingestion paths.
|
||||
|
||||
### Inbound API
|
||||
- **API Keys**: Key definitions (name/label, key value, enabled flag).
|
||||
@@ -94,7 +101,9 @@ Repository interfaces are defined in **Commons** alongside the POCO entity class
|
||||
| `ISecurityRepository` | Security & Auth | LDAP group mappings, site scoping rules |
|
||||
| `IInboundApiRepository` | Inbound API | API keys, API method definitions |
|
||||
| `IExternalSystemRepository` | External System Gateway | External system definitions, method definitions, database connection definitions |
|
||||
| `INotificationRepository` | Notification Service | Notification lists, recipients, SMTP configuration |
|
||||
| `INotificationRepository` | Notification Service | Notification lists (including the `Type` field), recipients, SMTP configuration |
|
||||
| `INotificationOutboxRepository` | Notification Outbox | The `Notifications` table — insert-if-not-exists ingest, due-row polling, status transitions, KPI aggregate queries, and bulk delete of terminal rows used by the daily purge job |
|
||||
| `ISiteCallAuditRepository` | Site Call Audit | The `SiteCalls` table — insert-if-not-exists ingest, upsert-on-newer-status, KPI aggregate queries, and bulk delete of terminal rows used by the daily purge job |
|
||||
| `IHealthMonitoringRepository` | Health Monitoring | (Minimal — health data is in-memory; repository needed only if connectivity history is persisted in the future) |
|
||||
| `ICentralUiRepository` | Central UI | Read-oriented queries spanning multiple domain areas for display purposes |
|
||||
|
||||
@@ -198,7 +207,7 @@ Since only the after-state is stored, change history for an entity is reconstruc
|
||||
| Alarms | Create, edit, delete alarm definitions |
|
||||
| Instances | Create, override values, bind connections, area assignment, disable, enable, delete |
|
||||
| Deployments | Deploy to instance (who, what, which instance, success/failure) |
|
||||
| System-Wide Artifact Deployments | Deploy shared scripts / external system definitions / DB connections / data connections / notification lists / SMTP config to site(s) (who, what, which site(s), result) |
|
||||
| System-Wide Artifact Deployments | Deploy shared scripts / external system definitions / DB connections / data connections to site(s) (who, what, which site(s), result) |
|
||||
| External Systems | Create, edit, delete definitions |
|
||||
| Database Connections | Create, edit, delete definitions |
|
||||
| Notification Lists | Create, edit, delete lists and recipients |
|
||||
@@ -209,7 +218,7 @@ Since only the after-state is stored, change history for an entity is reconstruc
|
||||
|
||||
### Query Capabilities
|
||||
|
||||
The Central UI audit log viewer can filter by:
|
||||
The Central UI Configuration Audit Log Viewer (distinct from the operational Audit Log page owned by the Audit Log component, #23) can filter by:
|
||||
- **User**: Who made the change.
|
||||
- **Entity type**: What kind of entity was changed.
|
||||
- **Action type**: What kind of operation was performed.
|
||||
@@ -220,12 +229,25 @@ Results are returned in reverse chronological order (most recent first) with pag
|
||||
|
||||
---
|
||||
|
||||
## Database Roles
|
||||
|
||||
The configuration database defines dedicated SQL Server roles for the append-only `AuditLog` table so that the application can never accidentally mutate audit history:
|
||||
|
||||
- **`scadalink_audit_writer`** — the role used by application code that ingests audit events (the `AuditLogIngestActor`, central direct-write paths, and the Notification Outbox dispatcher). Granted `INSERT` and `SELECT` on `AuditLog` only — explicitly **no** `UPDATE` and **no** `DELETE`. Audit ingest is `INSERT … WHERE NOT EXISTS` keyed on `EventId`, which this grant set fully supports.
|
||||
- **`scadalink_audit_purger`** — the role used by the `AuditLogPurgeActor`. Granted only the permissions required to execute the monthly partition-switch operation (switch out a partition to a staging table and drop the staging table). Row-level `DELETE` on `AuditLog` is **not** granted even to the purge role; retention is a partition switch, never a row-by-row delete.
|
||||
|
||||
A CI grep guard fails the build on any occurrence of `UPDATE … AuditLog` or `DELETE … AuditLog` in the data-access layer source, backstopping the runtime DB-grant enforcement at build time. See Component-AuditLog.md (Security & Tamper-Evidence) for the full enforcement contract.
|
||||
|
||||
---
|
||||
|
||||
## Migration Management
|
||||
|
||||
### Entity Framework Core Migrations
|
||||
|
||||
- Schema changes are managed via EF Core Migrations (`dotnet ef migrations add`, `dotnet ef migrations script`).
|
||||
- Each migration is a versioned, incremental schema change.
|
||||
- New tables are introduced as their own migration — for example, the `Notifications` table for the Notification Outbox ships as a dedicated EF Core migration that creates the table, its `Type`/`Status` value conversions, and its dispatcher and KPI indexes.
|
||||
- The initial `AuditLog` migration creates the monthly partition function `pf_AuditLog_Month` and partition scheme `ps_AuditLog_Month`, then creates the `AuditLog` table aligned to that scheme on `OccurredAtUtc`, along with the indexes listed under Database Schema. The migration also creates the `scadalink_audit_writer` and `scadalink_audit_purger` DB roles with the grants described in Database Roles. The ongoing **partition-maintenance job** that rolls the scheme forward each month (creating the next month's partition ahead of time) and switches out expired partitions is owned by the **Audit Log component** (`AuditLogPurgeActor` and its monthly roll-forward step), not by the Configuration Database component — this component is responsible only for the initial schema, roles, and any EF migrations against the table going forward.
|
||||
|
||||
### Development Environment
|
||||
- Migrations are **auto-applied** at application startup using `dbContext.Database.MigrateAsync()`.
|
||||
@@ -265,6 +287,22 @@ The Configuration Database supports seeding initial data required for the system
|
||||
|
||||
---
|
||||
|
||||
## Scheduled Maintenance
|
||||
|
||||
### Notifications Table Purge
|
||||
|
||||
The `Notifications` table grows one row per notification and is never trimmed by normal operation — `Discarded` rows are deliberately retained for audit. To bound table growth while preserving a strong audit trail, a **daily purge job** deletes terminal rows (`Delivered`, `Parked`, `Discarded`) older than a configurable retention window (default 365 days). Non-terminal rows (`Pending`, `Retrying`) are never purged. The purge is a bulk `DELETE` issued through `INotificationOutboxRepository`; it is owned and scheduled by the Notification Outbox component (see Component-NotificationOutbox.md), which supplies the retention window from `NotificationOutboxOptions`. The Configuration Database component provides only the repository operation and the table.
|
||||
|
||||
### SiteCalls Table Purge
|
||||
|
||||
The `SiteCalls` table grows one row per cached site call and is never trimmed by normal operation. To bound table growth while preserving a strong audit trail, a **daily purge job** deletes terminal rows (`Delivered`, `Failed`, `Discarded`) older than a configurable retention window (default 365 days). Non-terminal rows (`Pending`, `Retrying`, `Parked`) are never purged. The purge is a bulk `DELETE` issued through `ISiteCallAuditRepository`; it is owned and scheduled by the Site Call Audit component (see Component-SiteCallAudit.md), which supplies the retention window. The Configuration Database component provides only the repository operation and the table.
|
||||
|
||||
### AuditLog Table Purge
|
||||
|
||||
The `AuditLog` table is append-only and grows by one row for every script-trust-boundary event across all channels. Unlike `Notifications` and `SiteCalls`, purge is **never a row-level `DELETE`** — it is a **partition switch of whole monthly partitions** against the `ps_AuditLog_Month` scheme. A daily job switches out any monthly partition whose latest `OccurredAtUtc` is older than the global retention window (default 365 days, configurable via the `AuditLog:RetentionDays` Audit Log option — single global value in v1, no per-channel overrides) and drops the resulting staging table. The job is owned and scheduled by the Audit Log component (`AuditLogPurgeActor` — see Component-AuditLog.md), which is also the consumer of the `AuditLog:RetentionDays` option. The Configuration Database component contributes only the table, the partition function/scheme, the indexes, and the DB roles that constrain the purge to a partition switch.
|
||||
|
||||
---
|
||||
|
||||
## Connection Management
|
||||
|
||||
- Connection strings are provided via the Host's `DatabaseConfiguration` options (bound from `appsettings.json`).
|
||||
@@ -289,6 +327,8 @@ The Configuration Database supports seeding initial data required for the system
|
||||
- **Inbound API**: Uses `IInboundApiRepository` for API keys and method definitions.
|
||||
- **External System Gateway**: Uses `IExternalSystemRepository` for external system and database connection definitions.
|
||||
- **Notification Service**: Uses `INotificationRepository` for notification lists and SMTP configuration.
|
||||
- **Central UI**: Uses `ICentralUiRepository` for read-oriented queries across domain areas, including audit log queries for the audit log viewer.
|
||||
- **Notification Outbox**: Uses `INotificationOutboxRepository` for all access to the `Notifications` table — ingest, dispatch polling, status updates, KPI queries, and the daily purge of terminal rows.
|
||||
- **Site Call Audit**: Uses `ISiteCallAuditRepository` for all access to the `SiteCalls` table — telemetry/reconciliation ingest, KPI queries, and the daily purge of terminal rows.
|
||||
- **Central UI**: Uses `ICentralUiRepository` for read-oriented queries across domain areas, including config-audit queries for the Configuration Audit Log Viewer (the operational Audit Log page is owned by the Audit Log component, #23).
|
||||
- **All central components that modify state**: Call `IAuditService.LogAsync()` after successful operations to record audit entries within the same transaction.
|
||||
- **Host**: Provides database connection configuration. Registers DbContext, repository implementations, and `IAuditService` implementation in the DI container. Triggers auto-migration in development or validates schema version in production.
|
||||
|
||||
@@ -17,7 +17,7 @@ Central cluster only. The site-side deployment responsibilities (receiving confi
|
||||
- Track deployment status (pending, in-progress, success, failed).
|
||||
- Handle deployment failures gracefully — if a site is unreachable or the deployment fails, report the failure. No retry or buffering at central.
|
||||
- If a central failover occurs during deployment, the deployment is treated as failed and must be re-initiated.
|
||||
- Deploy system-wide artifacts (shared scripts, external system definitions, database connection definitions, data connection definitions, notification lists, SMTP configuration) to all sites or to an individual site on explicit request.
|
||||
- Deploy system-wide artifacts (shared scripts, external system definitions, database connection definitions, data connection definitions) to all sites or to an individual site on explicit request.
|
||||
- Send instance lifecycle commands (disable, enable, delete) to sites via the Communication Layer.
|
||||
|
||||
## Deployment Flow
|
||||
@@ -106,30 +106,14 @@ A deployment to a site includes the flattened instance configuration plus any sy
|
||||
- External system definitions
|
||||
- Database connection definitions
|
||||
- Data connection definitions
|
||||
- Notification lists
|
||||
- SMTP configuration
|
||||
|
||||
System-wide artifact deployment is a **separate action** from instance deployment, triggered explicitly by a user with the Deployment role. Artifacts can be deployed to all sites at once or to an individual site (per-site deployment via the Sites admin page).
|
||||
|
||||
### Secret handling in artifacts
|
||||
|
||||
The SMTP configuration artifact carries the SMTP credential (password or OAuth2
|
||||
client secret). This is a **conscious, accepted design decision**: SMTP
|
||||
configuration is a deployable artifact, so the credential is distributed to
|
||||
sites that need it. The credential is protected by the following controls:
|
||||
|
||||
- **In transit** — artifact-deployment commands travel over the inter-cluster
|
||||
transport, which is TLS-protected (see Cluster Infrastructure / Communication).
|
||||
- **Not logged** — the Deployment Manager never writes credential values to
|
||||
logs; deployment log statements reference only site IDs/names, the deployment
|
||||
ID, and exception messages.
|
||||
- **At rest on the site** — the credential is stored in the site's local SQLite
|
||||
artifact store. At-rest encryption of that field is **not** currently applied;
|
||||
it is treated as acceptable given the TLS-protected transport, the absence of
|
||||
any logging leak, and the trust boundary of the site host. Encrypting the
|
||||
credential field within the artifact payload would require a key-management
|
||||
scheme (key location and distribution to sites) and is recorded here as a
|
||||
possible future hardening item, not a current requirement.
|
||||
Notification lists and SMTP configuration are **not** deployable artifacts — they
|
||||
are central-only definitions managed by the Notification Service (see
|
||||
Component-NotificationService.md). Notification delivery happens on the central
|
||||
cluster, so no notification artifact or SMTP credential is ever distributed to
|
||||
sites.
|
||||
|
||||
## Site-Side Apply Atomicity
|
||||
|
||||
|
||||
@@ -57,12 +57,15 @@ Each database connection definition includes:
|
||||
- Script calls `Database.Connection("name")` and receives a raw ADO.NET `SqlConnection`.
|
||||
- Full control: queries, updates, transactions, stored procedures.
|
||||
- Failures are immediate — no buffering.
|
||||
- **Audit emission**: script-initiated `Execute`/`ExecuteScalar` calls emit `DbOutbound.SyncWrite` rows; `ExecuteReader` emits `DbOutbound.SyncRead`. SQL parameter values are captured by default; per-connection redaction is opt-in via the Audit Log configuration (see [Component-AuditLog.md](Component-AuditLog.md), Payload Capture Policy). Audit-write failure never aborts the script.
|
||||
|
||||
### Cached Write (Store-and-Forward)
|
||||
- Script calls `Database.CachedWrite("name", "sql", parameters)`.
|
||||
- The write is submitted to the Store-and-Forward Engine.
|
||||
- Script calls `Database.CachedWrite("name", "sql", parameters)`. This is **deferred delivery**: the call returns a `TrackedOperationId` tracking handle immediately rather than the write result.
|
||||
- Payload includes: connection name, SQL statement, serialized parameter values.
|
||||
- If the database is unavailable, the write is buffered and retried per the connection's retry settings.
|
||||
- The write is attempted immediately. On immediate success it is recorded as a terminal `Delivered` tracking record. On **transient failure** (database unavailable) it is buffered (`Pending`/`Retrying`) and retried per the connection's retry settings by the Store-and-Forward Engine.
|
||||
- On **permanent failure** (e.g. a SQL syntax or constraint error — a request that will never succeed), the error is returned **synchronously** to the calling script and the write is **not** buffered. The call is also recorded as a terminal `Failed` tracking record capturing the error.
|
||||
- Cached-write status is observable to scripts via `Tracking.Status(id)` (answered site-locally and authoritatively) and centrally via the Site Call Audit component.
|
||||
- **Audit emission**: each lifecycle transition (`CachedEnqueued`, `CachedAttempt`, `CachedTerminal`) emits an audit row via the combined cached-operation telemetry packet — one packet carries both the audit row and the SiteCalls upsert (see [Component-AuditLog.md](Component-AuditLog.md), Cached Operations — Combined Telemetry, and [Component-SiteCallAudit.md](Component-SiteCallAudit.md)). Audit-write failure never aborts the script.
|
||||
|
||||
## Invocation Protocol
|
||||
|
||||
@@ -82,12 +85,15 @@ Scripts choose between two call modes per invocation, mirroring the dual-mode da
|
||||
- The HTTP request is executed immediately. The script blocks until the response is received or the timeout elapses.
|
||||
- **All failures** (transient and permanent) return an error to the calling script. No store-and-forward buffering.
|
||||
- Use for request/response interactions where the script needs the result (e.g., fetching a recipe, querying inventory).
|
||||
- **Audit emission**: emits an `ApiOutbound.SyncCall` row to `IAuditWriter` at call completion (success or failure). Payload captured per the Audit Log policy (see [Component-AuditLog.md](Component-AuditLog.md), Payload Capture Policy). Audit-write failure never aborts the script.
|
||||
|
||||
### Cached (Store-and-Forward)
|
||||
- Script calls `ExternalSystem.CachedCall("systemName", "methodName", params)`.
|
||||
- The call is attempted immediately. If it succeeds, the response is discarded (fire-and-forget).
|
||||
- On **transient failure** (connection refused, timeout, HTTP 5xx), the call is routed to the Store-and-Forward Engine for retry per the system's retry settings. The script does **not** block — the call is buffered and the script continues.
|
||||
- On **permanent failure** (HTTP 4xx), the error is returned **synchronously** to the calling script. No retry — the request itself is wrong.
|
||||
- Script calls `ExternalSystem.CachedCall("systemName", "methodName", params)`. This is **deferred delivery**: the call returns a `TrackedOperationId` tracking handle immediately rather than the response body.
|
||||
- The call is attempted immediately. If it succeeds, the response is discarded and the call is recorded as a terminal `Delivered` tracking record.
|
||||
- On **transient failure** (connection refused, timeout, HTTP 408, 429, or 5xx), the call is routed to the Store-and-Forward Engine for retry per the system's retry settings. The script does **not** block — the call is buffered (`Pending`/`Retrying`) and the script continues.
|
||||
- On **permanent failure** (HTTP 4xx other than 408/429), the error is returned **synchronously** to the calling script. No retry — the request itself is wrong. The call is also recorded as a terminal `Failed` tracking record capturing the error.
|
||||
- Cached-call status is observable to scripts via `Tracking.Status(id)` (answered site-locally and authoritatively) and centrally via the Site Call Audit component.
|
||||
- **Audit emission**: each lifecycle transition (`CachedEnqueued`, `CachedAttempt`, `CachedTerminal`) emits an audit row via the combined cached-operation telemetry packet — one packet carries both the audit row and the SiteCalls upsert (see [Component-AuditLog.md](Component-AuditLog.md), Cached Operations — Combined Telemetry, and [Component-SiteCallAudit.md](Component-SiteCallAudit.md)). Audit-write failure never aborts the script.
|
||||
- Use for outbound data pushes where deferred delivery is acceptable (e.g., posting production data, sending quality reports).
|
||||
|
||||
## Call Timeout & Error Handling
|
||||
@@ -95,7 +101,7 @@ Scripts choose between two call modes per invocation, mirroring the dual-mode da
|
||||
- Each external system definition specifies a **timeout** that applies to all method calls on that system.
|
||||
- Error classification by HTTP response:
|
||||
- **Transient failures** (connection refused, timeout, HTTP 408, 429, 5xx): Behavior depends on call mode — `CachedCall` buffers for retry; `Call` returns error to script.
|
||||
- **Permanent failures** (HTTP 4xx except 408/429): Always returned to the calling script regardless of call mode. Logged to Site Event Logging.
|
||||
- **Permanent failures** (HTTP 4xx except 408/429): Always returned to the calling script regardless of call mode. Logged to Site Event Logging. For `CachedCall`, the failure is additionally recorded as a terminal `Failed` tracking record — so even a never-buffered cached call has an authoritative status record.
|
||||
- This classification ensures the S&F buffer is not polluted with requests that will never succeed.
|
||||
- **Idempotency note**: `CachedCall` retries may result in duplicate delivery if the external system received the original request but the response was lost. Callers should use `CachedCall` only for operations that are idempotent or where duplicate delivery is acceptable.
|
||||
|
||||
@@ -114,7 +120,8 @@ Scripts choose between two call modes per invocation, mirroring the dual-mode da
|
||||
|
||||
- **Configuration Database (MS SQL)**: Stores external system and database connection definitions (central only).
|
||||
- **Local SQLite**: At sites, external system and database connection definitions are read from local SQLite (populated by artifact deployment). Sites do not access the central config DB.
|
||||
- **Store-and-Forward Engine**: Handles buffering for failed external system calls and cached database writes.
|
||||
- **Store-and-Forward Engine**: Handles buffering for failed external system calls and cached database writes, and owns the site-local operation tracking table read by `Tracking.Status(id)`.
|
||||
- **Site Call Audit**: Central audit mirror for cached calls — receives cached-call lifecycle telemetry so `CachedCall`/`CachedWrite` status is observable centrally.
|
||||
- **Communication Layer**: Routes inbound external system requests from central to sites.
|
||||
- **Security & Auth**: Design role manages definitions.
|
||||
- **Configuration Database (via IAuditService)**: Definition changes are audit logged.
|
||||
@@ -122,5 +129,6 @@ Scripts choose between two call modes per invocation, mirroring the dual-mode da
|
||||
## Interactions
|
||||
|
||||
- **Site Runtime (Script/Alarm Execution Actors)**: Scripts invoke external system methods and database operations through this component.
|
||||
- **Store-and-Forward Engine**: Failed calls and cached writes are routed here for reliable delivery.
|
||||
- **Store-and-Forward Engine**: Failed calls and cached writes are routed here for reliable delivery; it also assigns each cached call a `TrackedOperationId` tracking row.
|
||||
- **Site Call Audit**: The central observability sibling for cached calls — cached-call status reported here is queried via the Central UI Site Calls page.
|
||||
- **Deployment Manager**: Receives updated definitions as part of system-wide artifact deployment (triggered explicitly by Deployment role).
|
||||
|
||||
@@ -29,8 +29,16 @@ Site clusters (metric collection and reporting). Central cluster (aggregation an
|
||||
| Tag resolution counts | Data Connection Layer | Per connection: total subscribed tags vs. successfully resolved tags |
|
||||
| Script error rates | Site Runtime (Script Actors) | Frequency of script failures |
|
||||
| Alarm evaluation error rates | Site Runtime (Alarm Actors) | Frequency of alarm evaluation failures |
|
||||
| Store-and-forward buffer depth | Store-and-Forward Engine | Pending messages by category (external, notification, DB write) |
|
||||
| Store-and-forward buffer depth | Store-and-Forward Engine | Pending messages by category — external, notification (notifications awaiting forward to central), DB write |
|
||||
| Dead letter count | Akka.NET EventStream | Messages sent to actors that no longer exist — indicates stale references or timing issues |
|
||||
| Notification Outbox queue depth | Notification Outbox (central) | Count of `Pending` + `Retrying` notifications — central-computed, not site-reported |
|
||||
| Notification Outbox stuck count | Notification Outbox (central) | Count of `Pending` / `Retrying` notifications older than the configurable stuck-age threshold — central-computed, not site-reported |
|
||||
| Notification Outbox parked count | Notification Outbox (central) | Count of `Parked` notifications — central-computed, not site-reported |
|
||||
| `SiteAuditBacklog` | Audit Log (site) | Count of `Pending` rows in the site-local `AuditLog` plus oldest-pending-age plus on-disk bytes. A configurable threshold drives a Health dashboard warning on the affected site tile. |
|
||||
| `SiteAuditWriteFailures` | Audit Log (site) | Count of failed hot-path audit appends at the site since the last health report. |
|
||||
| `SiteAuditTelemetryStalled` | Audit Log (site) | Boolean flag set when reconciliation reports a non-draining site-local audit backlog over two consecutive cycles. |
|
||||
| `CentralAuditWriteFailures` | Audit Log (central) | Count of central direct-write audit failures (Inbound API middleware, Notification Outbox dispatcher, and any other central direct writers) since the last interval. |
|
||||
| `AuditRedactionFailure` | Audit Log (central) | Count of payload redactor errors (over-redacted payloads, safety-net hit) since the last interval. |
|
||||
|
||||
## Reporting Protocol
|
||||
|
||||
@@ -50,10 +58,44 @@ Script error rates and alarm evaluation error rates are calculated as **raw coun
|
||||
- **Alarm evaluation errors** include all failures during alarm condition evaluation.
|
||||
- For detailed diagnostics (error types, stack traces, affected instances), operators use the **Site Event Log Viewer** — the health dashboard is for quick triage, not forensics.
|
||||
|
||||
## Notification Outbox KPIs
|
||||
|
||||
The Notification Outbox is a **central** component, so its KPIs are **central-computed** rather than collected from sites and carried in the site health report:
|
||||
|
||||
- The dashboard surfaces three **headline** outbox KPIs: **queue depth** (`Pending` + `Retrying`), **stuck count** (`Pending` / `Retrying` rows older than the configurable stuck-age threshold), and **parked count** (`Parked`).
|
||||
- The Notification Outbox component computes these on demand from the central `Notifications` table; the health dashboard polls it for the headline tiles.
|
||||
- The fuller KPI set — which also includes **delivered (last interval)** and **oldest pending age** — lives on the Central UI **Notification Outbox** page, not the health dashboard.
|
||||
- Outbox KPIs are **point-in-time**, computed on demand from the `Notifications` table. There is no time-series store — consistent with Health Monitoring's "current status only" philosophy. The outbox's own ~1-year row retention answers historical questions directly.
|
||||
|
||||
These are distinct from the site-reported **Store-and-forward buffer depth** notification metric, which now covers the **site→central leg** — notifications still buffered in a site's Store-and-Forward Engine awaiting forward to central — and remains part of the site health report.
|
||||
|
||||
## Site Call Audit KPIs
|
||||
|
||||
The Site Call Audit is a **central** component, so its KPIs — like the Notification Outbox's — are **central-computed** rather than collected from sites and carried in the site health report:
|
||||
|
||||
- The dashboard surfaces Site Call Audit **headline** KPI tiles alongside the existing Notification Outbox tiles.
|
||||
- The Site Call Audit component computes these on demand from the central `SiteCalls` table, **global and per-source-site**; the health dashboard polls it for the headline tiles.
|
||||
- The KPI set is **buffered count** (`Pending` + `Retrying`), **parked count** (`Parked`), **failed (last interval)**, **delivered (last interval)**, **oldest pending age**, and **stuck count** (`Pending` / `Retrying` rows older than the configurable stuck-age threshold).
|
||||
- **Stuck** is `Pending` / `Retrying` rows older than a configurable threshold (default **10 minutes**) — **display-only** (KPI count plus a row badge), with no escalation or alerting, consistent with the Notification Outbox stuck metric.
|
||||
- Site Call Audit KPIs are **point-in-time**, computed on demand from the `SiteCalls` table. There is no time-series store — consistent with Health Monitoring's "current status only" philosophy.
|
||||
|
||||
Unlike the Notification Outbox, the Site Call Audit is **not a dispatcher** — cached calls are delivered by each site's Store-and-Forward Engine, and the `SiteCalls` table is an eventually-consistent central mirror of site-owned status.
|
||||
|
||||
## Audit Log KPIs
|
||||
|
||||
The Audit Log spans both sites (hot-path append + telemetry forward) and central (direct-write + ingest + redaction). Its operational health surfaces as three new dashboard tiles grouped under **Audit**:
|
||||
|
||||
- **Audit volume** — events/min landing in the central `AuditLog` table, shown global plus per-site sparkline; sourced from the Audit Log component on the active central node.
|
||||
- **Audit error rate** — percent of central `AuditLog` rows with `Status` other than `Success` / `Delivered` / `Enqueued` over a rolling 5-minute window. This is the operational error rate of audited operations (HTTP 5xx, transient failures, parked deliveries, etc.) — NOT the audit writer's own health. Audit-writer issues surface separately via `CentralAuditWriteFailures` and `AuditRedactionFailure`.
|
||||
- **Audit backlog** — global aggregate of `SiteAuditBacklog` across reporting sites (count of `Pending` site-local audit rows, oldest pending age, on-disk bytes); click drills into a per-site breakdown. The per-site tile surfaces a warning badge when its `SiteAuditBacklog` crosses the configurable threshold or when `SiteAuditTelemetryStalled` is set.
|
||||
|
||||
These tiles are **point-in-time** like the Notification Outbox and Site Call Audit KPI tiles — no time-series store; consistent with Health Monitoring's "current status only" philosophy. The site-scoped `SiteAuditBacklog` / `SiteAuditWriteFailures` / `SiteAuditTelemetryStalled` metrics arrive in the existing site health report; the central-scoped `CentralAuditWriteFailures` / `AuditRedactionFailure` metrics are central-computed alongside the existing central KPIs.
|
||||
|
||||
## Central Storage
|
||||
|
||||
- Health metrics are held **in memory** at the central cluster for display in the UI.
|
||||
- No historical health data is persisted — the dashboard shows current/latest status only.
|
||||
- Notification Outbox and Site Call Audit KPIs are not stored by Health Monitoring; they are computed point-in-time from the central `Notifications` and `SiteCalls` tables respectively each time the dashboard refreshes — consistent with the current-status-only philosophy.
|
||||
- Site connectivity history (online/offline transitions) may optionally be logged via the Audit Log or a separate mechanism if needed in the future.
|
||||
|
||||
## No Alerting
|
||||
@@ -66,8 +108,11 @@ Script error rates and alarm evaluation error rates are calculated as **raw coun
|
||||
- **Communication Layer**: Transports health reports from sites to central.
|
||||
- **Data Connection Layer (site)**: Provides connection health metrics.
|
||||
- **Site Runtime (site)**: Provides script error rate and alarm evaluation error rate metrics.
|
||||
- **Store-and-Forward Engine (site)**: Provides buffer depth metrics.
|
||||
- **Store-and-Forward Engine (site)**: Provides buffer depth metrics, including the notification backlog awaiting forward to central.
|
||||
- **Cluster Infrastructure (site)**: Provides node role status.
|
||||
- **Notification Outbox (central)**: Provides central-computed outbox KPIs — queue depth, stuck count, parked count — for the headline dashboard tiles.
|
||||
- **Site Call Audit (central)**: Provides central-computed cached-call KPIs — buffered count, parked count, failed/delivered (last interval), oldest pending age, stuck count — for the headline dashboard tiles.
|
||||
- **Audit Log (#23)**: Provides the site-reported `SiteAuditBacklog` / `SiteAuditWriteFailures` / `SiteAuditTelemetryStalled` metrics (via the site health report) and the central-computed `CentralAuditWriteFailures` / `AuditRedactionFailure` metrics, plus the central audit-row rate feeding the **Audit** dashboard tile group (Audit volume, Audit error rate, Audit backlog).
|
||||
|
||||
## Interactions
|
||||
|
||||
|
||||
@@ -31,8 +31,8 @@ The same compiled binary must be deployable to both central and site nodes. The
|
||||
|
||||
At startup the Host must inspect the configured node role and register only the component services appropriate for that role:
|
||||
|
||||
- **Shared** (both Central and Site): ClusterInfrastructure, Communication, HealthMonitoring, ExternalSystemGateway, NotificationService.
|
||||
- **Central only**: TemplateEngine, DeploymentManager, Security, AuditLogging, CentralUI, InboundAPI, ManagementService.
|
||||
- **Shared** (both Central and Site): ClusterInfrastructure, Communication, HealthMonitoring, ExternalSystemGateway.
|
||||
- **Central only**: TemplateEngine, DeploymentManager, Security, AuditLogging, CentralUI, InboundAPI, ManagementService, NotificationService, NotificationOutbox, SiteCallAudit.
|
||||
- **Site only**: SiteRuntime, DataConnectionLayer, StoreAndForward, SiteEventLogging.
|
||||
|
||||
Components not applicable to the current role must not be registered in the DI container or the Akka.NET actor system.
|
||||
@@ -60,7 +60,9 @@ The Host must bind configuration sections from `appsettings.json` to strongly-ty
|
||||
| `ScadaLink:Communication` | `CommunicationOptions` | Communication | DeploymentTimeout, LifecycleTimeout, QueryTimeout, TransportHeartbeatInterval, TransportFailureThreshold |
|
||||
| `ScadaLink:Security` | `SecurityOptions` | Security & Auth | LdapServer, LdapPort, LdapUseTls, JwtSigningKey, JwtExpiryMinutes, IdleTimeoutMinutes |
|
||||
| `ScadaLink:InboundApi` | `InboundApiOptions` | Inbound API | DefaultMethodTimeout |
|
||||
| `ScadaLink:Notification` | `NotificationOptions` | Notification Service | (SMTP config is stored in config DB and deployed to sites, not in appsettings) |
|
||||
| `ScadaLink:Notification` | `NotificationOptions` | Notification Service | (SMTP config is stored in the central config DB, not in appsettings) |
|
||||
| `ScadaLink:NotificationOutbox` | `NotificationOutboxOptions` | Notification Outbox | Dispatcher poll interval, stuck-age threshold, retention window (delivery retry settings reuse the central SMTP configuration) |
|
||||
| `ScadaLink:SiteCallAudit` | `SiteCallAuditOptions` | Site Call Audit | Reconciliation pull interval, stuck-age threshold, retention window |
|
||||
| `ScadaLink:ManagementService` | `ManagementServiceOptions` | Management Service | (Reserved for future configuration) |
|
||||
| `ScadaLink:Logging` | `LoggingOptions` | Host | Serilog sink configuration, log level overrides |
|
||||
|
||||
@@ -176,7 +178,10 @@ The Host's `Program.cs` calls these extension methods; the component libraries o
|
||||
| Communication | Yes | Yes | Yes | Yes | No |
|
||||
| HealthMonitoring | Yes | Yes | Yes | Yes | No |
|
||||
| ExternalSystemGateway | Yes | Yes | Yes | Yes | No |
|
||||
| NotificationService | Yes | Yes | Yes | Yes | No |
|
||||
| AuditLog | Yes | Yes | Yes | Yes | No |
|
||||
| NotificationService | Yes | No | Yes | Yes | No |
|
||||
| NotificationOutbox | Yes | No | Yes | Yes | No |
|
||||
| SiteCallAudit | Yes | No | Yes | Yes | No |
|
||||
| TemplateEngine | Yes | No | Yes | Yes | No |
|
||||
| DeploymentManager | Yes | No | Yes | Yes | No |
|
||||
| Security | Yes | No | Yes | Yes | No |
|
||||
@@ -193,7 +198,7 @@ The Host's `Program.cs` calls these extension methods; the component libraries o
|
||||
|
||||
## Dependencies
|
||||
|
||||
- **All 17 component libraries**: The Host references every component project to call their extension methods (excludes CLI, which is a separate executable).
|
||||
- **All 19 component libraries**: The Host references every component project to call their extension methods (excludes CLI, which is a separate executable). Audit Log (#23) ships its central+site code in `ScadaLink.AuditLog`; the Host calls `AddAuditLog()` on both roles, and M2+ will add `AddAuditLogActors()`.
|
||||
- **Akka.Hosting**: For `AddAkka()` and the hosting configuration builder.
|
||||
- **Akka.Remote.Hosting, Akka.Cluster.Hosting**: For Akka subsystem configuration. (No Akka.Persistence plugin — see the Persistence note under REQ-HOST-6.)
|
||||
- **Serilog.AspNetCore**: For structured logging integration.
|
||||
|
||||
@@ -116,8 +116,9 @@ API method scripts are compiled at central startup — all method definitions ar
|
||||
|
||||
## API Call Logging
|
||||
|
||||
- **Only failures are logged.** Script execution errors (500 responses) are logged centrally.
|
||||
- Successful API calls are **not** logged — the audit log is reserved for configuration changes, not operational traffic.
|
||||
- **Every request — success or failure — emits one `ApiInbound.Completed` row** to `ICentralAuditWriter` from request middleware before the HTTP response is flushed. The row captures the API key **name** (never the key material), remote IP, user-agent, response status, duration, and truncated request/response bodies per the Audit Log capture policy (see Component-AuditLog.md, Payload Capture Policy). This supersedes the earlier failures-only stance: operational API traffic is now part of the centralized audit log, so configuration changes and call activity share a single retention/query surface.
|
||||
- Script execution errors (500 responses) remain captured on the same `ApiInbound.Completed` row (response status + error fields) rather than emitting a separate failure-only event.
|
||||
- **Fail-soft semantics.** The audit write is synchronous (inline before the response is flushed), but failures are caught: a write that throws is logged and increments `CentralAuditWriteFailures` (see Health Monitoring #11) and the request still returns its normal HTTP response. A failed audit append never turns a successful API call into an error returned to the caller.
|
||||
- No rate limiting — this is a private API in a controlled industrial environment with a known set of callers. Misbehaving callers are handled operationally (disable the API key).
|
||||
|
||||
## Request Flow
|
||||
@@ -197,7 +198,8 @@ Inbound API scripts **cannot** call shared scripts directly — shared scripts a
|
||||
- **Configuration Database (MS SQL)**: Stores API keys and method definitions.
|
||||
- **Communication Layer**: Routes requests to sites when method implementations need site data.
|
||||
- **Security & Auth**: API key validation (separate from LDAP/AD — API uses key-based auth).
|
||||
- **Configuration Database (via IAuditService)**: All API key and method definition changes are audit logged. Optionally, API call activity can be logged.
|
||||
- **Configuration Database (via IAuditService)**: All API key and method definition changes are audit logged.
|
||||
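- **Audit Log (#23)**: Every inbound API request emits an `ApiInbound.Completed` row via `ICentralAuditWriter` from request middleware. The write is synchronous (inline before the response is flushed) but fail-soft: a failed audit write is logged and counted, and never changes the HTTP response returned to the caller. Payload truncation/redaction follows the Audit Log Payload Capture Policy.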
|
||||
- **Cluster Infrastructure**: API is hosted on the active central node and fails over with it.
|
||||
|
||||
## Interactions
|
||||
|
||||
@@ -123,7 +123,7 @@ The endpoint performs LDAP authentication and role resolution server-side, colla
|
||||
### Deployments
|
||||
|
||||
- **DeployInstance**: Deploy configuration to a specific instance (includes pre-deployment validation).
|
||||
- **DeployArtifacts**: Deploy system-wide artifacts (shared scripts, external system definitions, DB connections, data connections, notification lists, SMTP config) to all sites or a specific site.
|
||||
- **DeployArtifacts**: Deploy system-wide artifacts (shared scripts, external system definitions, DB connections, data connections) to all sites or a specific site.
|
||||
- **GetDeploymentStatus**: Query deployment status.
|
||||
|
||||
### External Systems
|
||||
|
||||
@@ -0,0 +1,175 @@
|
||||
# Component: Notification Outbox
|
||||
|
||||
## Purpose
|
||||
|
||||
The Notification Outbox is the central component that receives store-and-forwarded notifications from site clusters, logs every one to the `Notifications` table in the central configuration database, and delivers them through per-type delivery adapters. The `Notifications` table is the single source of audit truth: every notification — successfully delivered, parked, or discarded — has exactly one durable row. The outbox provides delivery retry, parking of failures, per-notification status tracking, and KPIs for delivery health.
|
||||
|
||||
This inverts where notification delivery happens. Sites no longer send notifications directly via SMTP; a site script's notification is store-and-forwarded to central, and the central outbox owns dispatch and delivery.
|
||||
|
||||
## Location
|
||||
|
||||
Central cluster. The `NotificationOutboxActor` is a **singleton on the active central node**. It is the first outbox component to live centrally — the Store-and-Forward Engine remains site-only.
|
||||
|
||||
## Responsibilities
|
||||
|
||||
- Own the durable central queue — the `Notifications` table in the central MS SQL database.
|
||||
- Ingest store-and-forwarded notifications from sites, insert-if-not-exists on `NotificationId`, and ack the site only after the row is persisted.
|
||||
- Run the dispatcher loop: poll due rows, resolve the target notification list, and deliver via the matching adapter.
|
||||
- Schedule retries for transient failures and park notifications on permanent failure or exhausted retries.
|
||||
- Track per-notification status across the delivery lifecycle.
|
||||
- Compute delivery KPIs from the `Notifications` table for the Health Monitoring dashboard and the Central UI.
|
||||
- Purge terminal rows daily after a configurable retention window.
|
||||
|
||||
SMTP and HTTP delivery is blocking I/O. Delivery work runs on a **dedicated blocking-I/O dispatcher**, the same pattern used by Script Execution Actors, so delivery never blocks the actor's dispatcher loop.
|
||||
|
||||
## End-to-End Flow
|
||||
|
||||
```
|
||||
Site script: Notify.To("list").Send(subject, body)
|
||||
│ generate NotificationId (GUID) locally; return it to the script immediately
|
||||
▼
|
||||
Site Store-and-Forward Engine (notification category, target = central)
|
||||
│ durably forwards to central via Central–Site Communication (ClusterClient);
|
||||
│ buffers/retries if central is unreachable
|
||||
▼
|
||||
Central ingest: insert-if-not-exists on NotificationId → Notifications table (Pending)
|
||||
│ ack the site → site S&F clears the message
|
||||
▼
|
||||
Central Notification Outbox actor (singleton, active central node)
|
||||
│ polls due rows; resolves the list; delivers via the matching adapter
|
||||
├── success → Delivered
|
||||
├── transient failure → Retrying (schedule NextAttemptAt)
|
||||
└── permanent failure
|
||||
/ retries exhausted → Parked
|
||||
```
|
||||
|
||||
The site forwards only `(listName, subject, body)` plus provenance — recipient resolution happens at central, at delivery time. This keeps notification-list definitions in one place and removes the deploy-to-sites artifact entirely.
|
||||
|
||||
`Notify.Status(notificationId)` returns a small status record — status, retry count, last error, and key timestamps (enqueued, delivered). While the notification is still in the site S&F buffer the site answers the query **locally** (status `Forwarding`); once forwarded, the query round-trips to central and reads the `Notifications` table.
|
||||
|
||||
## The `Notifications` Table
|
||||
|
||||
The table is type-agnostic so it can record any notification type the system supports — email today, Microsoft Teams and others later. One row per notification.
|
||||
|
||||
| Field | Notes |
|
||||
|---|---|
|
||||
| `NotificationId` | GUID, primary key. Generated at the **site**; used as the idempotency key. |
|
||||
| `Type` | `Email` / `Teams` / … discriminator. |
|
||||
| `ListName` | Target notification list. |
|
||||
| `Subject`, `Body` | Plain-text content. |
|
||||
| `TypeData` | JSON — extensibility hook for future per-type fields. |
|
||||
| `Status` | Lifecycle state — one of `Pending`, `Retrying`, `Delivered`, `Parked`, `Discarded`. See Status Lifecycle below. |
|
||||
| `RetryCount` | Delivery attempts so far. |
|
||||
| `LastError` | Detail of the most recent failure. |
|
||||
| `ResolvedTargets` | Who the notification actually went to — snapshotted by central at delivery time, for audit. |
|
||||
| `SourceSiteId`, `SourceInstanceId`, `SourceScript` | Provenance. |
|
||||
| `SiteEnqueuedAt` | When the script called `Send()` (carried from the site). |
|
||||
| `CreatedAt` | When central ingested the row. |
|
||||
| `LastAttemptAt`, `NextAttemptAt`, `DeliveredAt` | Delivery timestamps. |
|
||||
|
||||
All timestamps are UTC.
|
||||
|
||||
### Status Lifecycle
|
||||
|
||||
- `Forwarding` — in the site S&F buffer, not yet received by central. **Site-local only** — never stored in the central `Notifications` table; reported by `Notify.Status` while the site still holds the notification.
|
||||
- `Pending` — ingested by central, awaiting first dispatch.
|
||||
- `Retrying` — a transient failure occurred; `NextAttemptAt` schedules the next attempt.
|
||||
- `Delivered` — terminal, success.
|
||||
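- `Parked` — terminal-not-delivered: a permanent failure, or retries exhausted. `LastError` distinguishes which. The dispatcher never picks a parked row up again on its own; only an operator can move it, via Retry (→ `Pending`) or Discard (→ `Discarded`).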
|
||||
- `Discarded` — terminal, reached **only by operator action** on a parked notification. The row is kept (not deleted) so the table remains a complete audit record.
|
||||
|
||||
The Notification Outbox and the central [`Site Call Audit`](Component-SiteCallAudit.md) component share the `TrackedOperationId` tracking model and this status lifecycle, but differ in delivery locality: the Notification Outbox **delivers** notifications itself (central SMTP), whereas Site Call Audit only **audits** cached calls delivered site-locally by the site Store-and-Forward Engine — it is not a dispatcher.
|
||||
|
||||
### Retry Policy
|
||||
|
||||
Delivery retry reuses the central SMTP configuration's max-retry-count and fixed retry interval. The interval is fixed (no exponential backoff), consistent with the existing fixed-interval store-and-forward convention.
|
||||
|
||||
### Retention
|
||||
|
||||
Terminal rows (`Delivered`, `Parked`, `Discarded`) are removed by a **daily purge job** after a configurable window (default 365 days). This preserves a strong audit trail while bounding table growth. Non-terminal rows are never purged.
|
||||
|
||||
## Ingest & Idempotency
|
||||
|
||||
The site→central handoff is **at-least-once**. Central ingests an inbound notification submission with an insert-if-not-exists on `NotificationId`, then acks the site; the site S&F engine clears the message only on that ack. Because central acks only after the row is persisted (ack-after-persist), a lost ack causes the site to resend, and the GUID `NotificationId` idempotency key makes the resend harmless — the duplicate insert is a no-op.
|
||||
|
||||
A rare central failover mid-delivery could re-send a notification that had already been sent but whose row had not yet been marked `Delivered`. This is an accepted trade-off, consistent with the duplicate-delivery trade-off the Store-and-Forward Engine already accepts.
|
||||
|
||||
## Dispatcher
|
||||
|
||||
The dispatcher loop runs on a fixed interval. On each tick the `NotificationOutboxActor`:
|
||||
|
||||
1. Polls the `Notifications` table for **due rows** — `Pending` rows, and `Retrying` rows whose `NextAttemptAt` has passed.
|
||||
2. Resolves the target notification list to its recipients/targets at central, at delivery time.
|
||||
3. Hands the notification to the delivery adapter registered for its `Type`, running on the dedicated blocking-I/O dispatcher.
|
||||
4. Applies the result:
|
||||
- **success** → `Delivered`, set `DeliveredAt`, snapshot `ResolvedTargets`.
|
||||
- **transient failure** → `Retrying`, increment `RetryCount`, set `NextAttemptAt`, record `LastError`; once retries are exhausted → `Parked`.
|
||||
- **permanent failure** → `Parked`, record `LastError`.
|
||||
|
||||
Each delivery attempt also writes a `Notification.Attempt` row to the central `AuditLog` via `ICentralAuditWriter`; a transition to a terminal status (`Delivered` / `Parked` / `Discarded`) writes a `Notification.Terminal` row. Audit writes are **direct** (no telemetry — the dispatcher runs at central), insert-if-not-exists on `EventId`. The site-emitted `Notification.Enqueued` row arrives separately via the standard audit telemetry channel from the site's SQLite write-buffer, so the full per-notification audit trail is `Enqueued` (site-originated) → `Attempt` × N (central direct-write) → `Terminal` (central direct-write). See [Component-AuditLog.md](Component-AuditLog.md), Central direct-write (central-originated events).
|
||||
|
||||
The operational `Notifications` table remains the **source of truth** for the dispatcher and for Retry/Discard actions; the `AuditLog` rows are immutable shadows. Operator Retry/Discard still mutates only the `Notifications` row, and each transition emits the corresponding `Notification.Attempt` / `Notification.Terminal` audit row.
|
||||
|
||||
**Audit-write failure never affects delivery.** If the `ICentralAuditWriter` direct-write fails (transient DB error, schema lock, etc.) the dispatcher logs the failure and increments the `CentralAuditWriteFailures` health metric (see Health Monitoring #11), but the delivery attempt's outcome on the `Notifications` row stands. The audit row is recovered by re-emission on the next dispatcher tick or by the on-startup reconciliation sweep; central never aborts a notification because audit failed.
|
||||
|
||||
## Delivery Adapters
|
||||
|
||||
A delivery adapter implementing `INotificationDeliveryAdapter` is registered per `Type`. Each `Deliver(...)` call returns one of `success | transient failure | permanent failure`, mirroring the External System Gateway error-classification pattern.
|
||||
|
||||
- **Email adapter — implemented now.** The existing SMTP composition/send logic, relocated to the central cluster.
|
||||
- **Teams and other adapters — future.** The `Type` discriminator and the adapter interface are the seam; no Teams code exists in this design. Teams auth and targeting (Incoming Webhooks vs Graph API) is a separate design conversation.
|
||||
|
||||
Delivery adapters are provided by the Notification Service, which manages notification-list and SMTP definitions and supplies the stateless per-type "deliver one notification" implementations.
|
||||
|
||||
## Active/Standby Behavior
|
||||
|
||||
The `NotificationOutboxActor` is a singleton on the active central node. All outbox state lives in MS SQL, which is already the central HA store, so **no Akka-level replication is needed** (unlike the site S&F engine). On central failover the new active node resumes dispatch directly from the `Notifications` table — `Pending` rows and due `Retrying` rows are picked up on the next dispatcher tick.
|
||||
|
||||
## Monitoring
|
||||
|
||||
### KPIs
|
||||
|
||||
KPIs are central-computed from the `Notifications` table — global, with a per-source-site breakdown:
|
||||
|
||||
- **Queue depth** — count of `Pending` + `Retrying`.
|
||||
- **Stuck count** — `Pending` / `Retrying` rows older than the configurable stuck-age threshold.
|
||||
- **Parked count** — count of `Parked`.
|
||||
- **Delivered (last interval)** — count of `Delivered` since the previous sample.
|
||||
- **Oldest pending age** — age of the oldest non-terminal notification.
|
||||
|
||||
KPIs are point-in-time, computed on demand from the table. The configurable row retention (default 365 days) answers historical questions directly, so no separate time-series store is added.
|
||||
|
||||
### Stuck Detection
|
||||
|
||||
A notification is **stuck** if it is `Pending` or `Retrying` and older than a configurable age threshold (default 10 minutes). Detection is **display-only** — a count KPI and a row badge. There is no automated escalation or alerting, consistent with the system-wide no-alerting policy.
|
||||
|
||||
### Surfacing
|
||||
|
||||
- **Health Monitoring dashboard** — headline KPI tiles: queue depth, stuck count, parked count. These are central-computed and are not part of the site health report. The site S&F notification backlog remains a separate site health metric covering the site→central leg.
|
||||
- **Central UI "Notification Outbox" page** — KPI tiles plus a queryable notification list: filter by status, type, source site, list, and time range; a stuck-only toggle; keyword search on subject. Parked notifications offer **Retry** (→ `Pending`, reset `RetryCount` / `NextAttemptAt`) and **Discard** (→ `Discarded`) actions. Stuck rows are badged.
|
||||
|
||||
## Configuration
|
||||
|
||||
The component is configured via `NotificationOutboxOptions`, bound from an `appsettings.json` section on the central host (Options pattern):
|
||||
|
||||
- **Dispatch interval** — how often the dispatcher loop polls for due rows.
|
||||
- **Stuck-age threshold** — age beyond which a non-terminal notification is counted as stuck (default 10 minutes).
|
||||
- **Terminal-row retention window** — age after which terminal rows are removed by the daily purge job (default 365 days).
|
||||
|
||||
Delivery max-retry-count and retry interval are not part of `NotificationOutboxOptions` — they are reused from the central SMTP configuration.
|
||||
|
||||
## Dependencies
|
||||
|
||||
- **Notification Service**: Provides notification-list and SMTP definitions, and the per-type delivery adapters the outbox invokes.
|
||||
- **Configuration Database**: Hosts the `Notifications` table; provides the entity POCO, repository, and EF migration for outbox persistence.
|
||||
- **Central–Site Communication**: Carries inbound notification submissions and acks between sites and central.
|
||||
- **Audit Log (#23)**: The dispatcher direct-writes `Notification.Attempt` and `Notification.Terminal` rows to the central `AuditLog` via `ICentralAuditWriter` (insert-if-not-exists on `EventId`); the site-emitted `Notification.Enqueued` row arrives via the standard audit telemetry channel. See [Component-AuditLog.md](Component-AuditLog.md), Central direct-write (central-originated events).
|
||||
- **Health Monitoring**: Consumes the outbox KPIs as central-computed headline metrics.
|
||||
- **Central UI**: Hosts the Notification Outbox page.
|
||||
|
||||
## Interactions
|
||||
|
||||
- **Site Store-and-Forward Engine**: Forwards notifications to central via Central–Site Communication; the outbox ingests them and acks once persisted.
|
||||
- **Notification Service**: Supplies delivery adapters and resolves notification lists at delivery time.
|
||||
- **Central UI**: Queries the `Notifications` table for the Notification Outbox page and issues operator Retry/Discard actions on parked notifications.
|
||||
- **Health Monitoring**: Polls the outbox for KPI tiles on the health dashboard.
|
||||
@@ -2,84 +2,96 @@
|
||||
|
||||
## Purpose
|
||||
|
||||
The Notification Service provides email notification capabilities to scripts running at site clusters. It manages notification lists, handles email delivery, and integrates with the Store-and-Forward Engine for reliable delivery when the email server is unavailable.
|
||||
The Notification Service is the central component that manages notification-list and SMTP definitions and provides the per-type delivery adapters used to send notifications. The adapters are stateless "deliver one notification" implementations that the Notification Outbox invokes at delivery time.
|
||||
|
||||
The Notification Service no longer delivers notifications from sites. Notification delivery has been inverted: a site script's notification is store-and-forwarded to the central cluster, and the central **Notification Outbox** owns dispatch and delivery, calling an `INotificationDeliveryAdapter` supplied by this component. See [`Component-NotificationOutbox.md`](Component-NotificationOutbox.md).
|
||||
|
||||
## Location
|
||||
|
||||
Central cluster (definition management, stores in config DB). Site clusters (email delivery, reads definitions from local SQLite).
|
||||
Central cluster only. The Notification Service manages definitions in the central configuration database and provides the delivery adapters that run on the central cluster. It is no longer present at site clusters, and notification definitions and SMTP configuration are no longer deployed to sites.
|
||||
|
||||
## Responsibilities
|
||||
|
||||
### Definitions (Central)
|
||||
- Store notification lists in the configuration database: list name, recipients (name + email address).
|
||||
- Store notification lists in the configuration database: list name, list **type**, and type-specific targets (e.g. recipients for an `Email` list).
|
||||
- Store email server configuration (SMTP settings).
|
||||
- Deploy notification lists and SMTP configuration uniformly to all sites. Deployment requires **explicit action** by a user with the Deployment role.
|
||||
- Managed by users with the Design role.
|
||||
- Notification lists and SMTP configuration are **not deployed to sites** — they exist centrally only. There is no deploy-to-sites artifact and no local SQLite copy.
|
||||
|
||||
### Delivery (Site)
|
||||
- Resolve notification list names to recipient lists from **local SQLite** (populated by artifact deployment). Sites do not access the central config DB.
|
||||
- Compose and send emails via SMTP using locally stored SMTP configuration.
|
||||
- On delivery failure, submit the notification to the Store-and-Forward Engine for buffered retry.
|
||||
### Delivery Adapters (Central)
|
||||
- Provide a delivery adapter implementing `INotificationDeliveryAdapter` for each notification `Type`.
|
||||
- Each adapter is a stateless "deliver one notification" implementation: it composes and sends a single notification and classifies the outcome.
|
||||
- The **Email adapter** is the relocated SMTP composition and send logic — formerly run at sites, it now runs on the central cluster.
|
||||
- Resolve a notification list name to its concrete targets (e.g. recipient addresses) at delivery time, on behalf of the Notification Outbox.
|
||||
|
||||
## Notification List Definition
|
||||
|
||||
Each notification list includes:
|
||||
- **Name**: Unique identifier (e.g., "Maintenance-Team", "Shift-Supervisors").
|
||||
- **Recipients**: One or more entries, each with:
|
||||
- **Type**: The notification type — `Email` (implemented now); `Teams` and other types are planned for the future. `Notify.To("list")` works transparently for any type — the calling script does not care about the type.
|
||||
- **Type-specific targets**: The targets appropriate to the list type. For an `Email` list, one or more recipient entries, each with:
|
||||
- Recipient name.
|
||||
- Email address.
|
||||
|
||||
Lists are defined and stored centrally only. **Recipient resolution happens at central, at delivery time** — a site forwards only `(listName, subject, body)` plus provenance; the Notification Outbox asks the Notification Service to resolve the list when it dispatches the notification.
|
||||
|
||||
## Email Server Configuration
|
||||
|
||||
The SMTP configuration is defined centrally and deployed to all sites. It includes:
|
||||
The SMTP configuration is defined centrally and used by the central Email delivery adapter. It is not deployed to sites. It includes:
|
||||
|
||||
- **Server hostname**: SMTP server address (e.g., `smtp.office365.com`).
|
||||
- **Port**: SMTP port (e.g., 587 for StartTLS, 465 for SSL).
|
||||
- **Authentication mode**: One of:
|
||||
- **Basic Auth**: Username and password. For on-prem SMTP relays or servers that support basic authentication.
|
||||
- **OAuth2 Client Credentials**: Tenant ID, Client ID, and Client Secret. For Microsoft 365 and other modern SMTP providers that require OAuth2. The Notification Service handles the token lifecycle internally (fetch, cache, refresh on expiry).
|
||||
- **OAuth2 Client Credentials**: Tenant ID, Client ID, and Client Secret. For Microsoft 365 and other modern SMTP providers that require OAuth2. The Email adapter handles the token lifecycle internally (fetch, cache, refresh on expiry).
|
||||
- **TLS mode**: None, StartTLS, or SSL.
|
||||
- **From address**: The sender email address for all notifications (e.g., `scada-notifications@company.com`).
|
||||
- **Connection timeout**: Maximum time to wait for SMTP connection (default: 30 seconds).
|
||||
- **Max concurrent connections**: Maximum simultaneous SMTP connections per site (default: 5).
|
||||
- **Retry settings**: Max retry count, fixed time between retries (used by Store-and-Forward Engine for transient failures).
|
||||
- **Max concurrent connections**: Maximum simultaneous SMTP connections from the central cluster (default: 5).
|
||||
- **Retry settings**: Max retry count, fixed time between retries. The Notification Outbox reuses these for delivery retry of transient failures.
|
||||
|
||||
## Script API
|
||||
|
||||
```csharp
|
||||
Notify.To("listName").Send("subject", "message")
|
||||
NotificationId id = Notify.To("listName").Send("subject", "message");
|
||||
NotificationStatus status = Notify.Status(id);
|
||||
```
|
||||
|
||||
- Available to instance scripts (via Script Execution Actors), alarm on-trigger scripts (via Alarm Execution Actors), and shared scripts (executing inline).
|
||||
- Resolves the list name to recipients, composes the email, and attempts delivery.
|
||||
- `Notify.To("listName").Send(...)` is **asynchronous**: it generates a `NotificationId` (GUID) locally, hands the notification to the site Store-and-Forward Engine for forwarding to central, and returns the `NotificationId` to the script **immediately**. The script does not block waiting for delivery.
|
||||
- The message body is **plain text** only. No HTML content.
|
||||
- `Notify.Status(notificationId)` returns a small **status record** — the current status, retry count, last error, and key timestamps (enqueued, delivered). While the notification is still in the site Store-and-Forward buffer, the site answers the query **locally** with status `Forwarding`; once forwarded to central, the query round-trips to central and reads the `Notifications` table.
|
||||
- The returned `NotificationId` is a `TrackedOperationId` — the shared Commons tracking-handle type used by all store-and-forward producers; `NotificationId` is simply the notification-domain name for it. Likewise, `Notify.Status` is a thin alias of the unified `Tracking.Status` accessor, retained for backward compatibility. This is a naming/type clarification only — notification delivery behavior is unchanged.
|
||||
|
||||
## Email Delivery Behavior
|
||||
## Notification Delivery Behavior
|
||||
|
||||
### Recipient Handling
|
||||
- A single email is sent per `Notify.To().Send()` call, with all list recipients in **BCC**. The from address is placed in the To field.
|
||||
Delivery is performed centrally by the Notification Outbox, which calls the `INotificationDeliveryAdapter` registered for the notification's `Type`. The behavior below describes the Email adapter.
|
||||
|
||||
### Recipient Handling (Email)
|
||||
- A single email is sent per notification, with all list recipients in **BCC**. The from address is placed in the To field.
|
||||
- Recipients do not see each other's email addresses.
|
||||
- No per-recipient deduplication — if the same email address appears in more than one list and a script sends to each of those lists, that recipient receives one email per list.
|
||||
|
||||
### Error Classification
|
||||
Consistent with the External System Gateway pattern:
|
||||
- **Transient failures** (connection refused, timeout, SMTP 4xx temporary errors): The notification is handed to the **Store-and-Forward Engine** for buffered retry per the SMTP configuration's retry settings. The script does **not** block waiting for eventual delivery.
|
||||
- **Permanent failures** (SMTP 5xx permanent errors, e.g., mailbox not found): The error is returned **synchronously** to the calling script for handling. No retry — the notification will never deliver.
|
||||
- This prevents the S&F buffer from accumulating notifications that will never succeed.
|
||||
Each `Deliver(...)` call returns one of `success | transient failure | permanent failure`, consistent with the External System Gateway pattern. There is **no synchronous permanent-failure return to the script** — `Send()` returns immediately, before any delivery is attempted.
|
||||
|
||||
- **Transient failures** (connection refused, timeout, SMTP 4xx temporary errors): The Notification Outbox moves the row to `Retrying` and schedules another attempt per the SMTP configuration's retry settings.
|
||||
- **Permanent failures** (SMTP 5xx permanent errors, e.g., mailbox not found): The Notification Outbox moves the row to `Parked` with the error in `LastError`. The notification will never deliver, and an operator can review or discard it on the Central UI Notification Outbox page.
|
||||
- Retries exhausted on a transient failure also result in a `Parked` row.
|
||||
- A script observes failures only by calling `Notify.Status(id)` and seeing a `Parked` status — not as a synchronous exception.
|
||||
|
||||
### No Rate Limiting
|
||||
- No application-level rate limiting. If the SMTP server enforces sending limits (e.g., Microsoft 365 throttling), those manifest as transient failures and are handled naturally by store-and-forward.
|
||||
- No application-level rate limiting. If the SMTP server enforces sending limits (e.g., Microsoft 365 throttling), those manifest as transient failures and are retried naturally by the Notification Outbox.
|
||||
|
||||
## Dependencies
|
||||
|
||||
- **Configuration Database (MS SQL)**: Stores notification list definitions and SMTP config (central only).
|
||||
- **Local SQLite**: At sites, notification lists, recipients, and SMTP configuration are read from local SQLite (populated by artifact deployment). Sites do not access the central config DB.
|
||||
- **Store-and-Forward Engine**: Handles buffering for failed email deliveries.
|
||||
- **Configuration Database (MS SQL)**: Stores notification list definitions (name, type, type-specific targets) and SMTP config.
|
||||
- **Notification Outbox**: Invokes the delivery adapters supplied by this component and asks it to resolve notification lists at delivery time.
|
||||
- **Security & Auth**: Design role manages notification lists.
|
||||
- **Configuration Database (via IAuditService)**: Notification list changes are audit logged.
|
||||
|
||||
## Interactions
|
||||
|
||||
- **Site Runtime (Script/Alarm Execution Actors)**: Scripts invoke `Notify.To().Send()` through this component.
|
||||
- **Store-and-Forward Engine**: Failed notifications are buffered here.
|
||||
- **Deployment Manager**: Receives updated notification lists and SMTP config as part of system-wide artifact deployment (triggered explicitly by Deployment role).
|
||||
- **Notification Outbox**: Consumes the per-type delivery adapters and the list-resolution service this component provides; the outbox owns dispatch, retry, parking, and status.
|
||||
- **Site Runtime (Script/Alarm Execution Actors)**: Scripts invoke `Notify.To().Send()` and `Notify.Status()`. `Send()` generates a `NotificationId` and hands the notification to the site Store-and-Forward Engine; it does not contact this component synchronously.
|
||||
- **Store-and-Forward Engine (site)**: Forwards a script's notification to central; the central Notification Outbox ingests it for delivery. The Notification Service does not interact with the site Store-and-Forward Engine directly.
|
||||
|
||||
@@ -76,10 +76,11 @@ Central cluster. Sites do not have user-facing interfaces and do not perform ind
|
||||
- Create and manage instances (overrides, connection bindings, area assignment).
|
||||
- Disable, enable, and delete instances.
|
||||
- Deploy configurations to instances.
|
||||
- Deploy system-wide artifacts (shared scripts, external system definitions, DB connections, notification lists) to all sites.
|
||||
- Deploy system-wide artifacts (shared scripts, external system definitions, DB connections, data connections) to all sites.
|
||||
- View deployment diffs and status.
|
||||
- Use debug view.
|
||||
- Manage parked messages.
|
||||
- Monitor and manage the Notification Outbox (retry and discard parked notifications).
|
||||
- View site event logs.
|
||||
- **Site scoping**: A user with site-scoped Deployment role can only perform these actions for instances at their permitted sites.
|
||||
|
||||
|
||||
@@ -0,0 +1,144 @@
|
||||
# Component: Site Call Audit
|
||||
|
||||
## Purpose
|
||||
|
||||
Provides central, queryable audit and operational visibility for cached calls
|
||||
made by site scripts — `ExternalSystem.CachedCall()` and `Database.CachedWrite()`.
|
||||
Each such call carries a `TrackedOperationId`; sites report lifecycle telemetry
|
||||
to this component, which maintains a central audit record, computes KPIs, and
|
||||
relays Retry/Discard actions back to the owning site.
|
||||
|
||||
This is the second centrally-hosted observability component for site
|
||||
store-and-forward activity (the Notification Outbox is the first). Unlike the
|
||||
Notification Outbox, Site Call Audit is **not a dispatcher** — it never delivers
|
||||
anything. Cached calls are delivered by the site's Store-and-Forward Engine
|
||||
against site-local external systems and databases, which central cannot reach.
|
||||
|
||||
## Location
|
||||
|
||||
Central cluster only. A singleton actor (`SiteCallAuditActor`) on the active
|
||||
central node. Registered as component #22 in the Host role configuration.
|
||||
|
||||
## Responsibilities
|
||||
|
||||
- Ingest cached-call lifecycle telemetry from sites into the central `SiteCalls`
|
||||
table.
|
||||
- Run periodic per-site reconciliation pulls so missed telemetry self-heals.
|
||||
- Compute point-in-time KPIs (global and per-site) from the `SiteCalls` table.
|
||||
- Relay operator Retry/Discard actions for parked cached calls to the owning
|
||||
site over the command/control channel.
|
||||
- Purge terminal audit rows after a configurable retention window.
|
||||
|
||||
## The `SiteCalls` Table
|
||||
|
||||
Lives in the central MS SQL configuration database — a sibling of the
|
||||
`Notifications` table. One row per `TrackedOperationId`:
|
||||
|
||||
- **TrackedOperationId** — GUID, primary key. Generated site-side at call time.
|
||||
- **SourceSite** — site that issued the call.
|
||||
- **Kind** — `TrackedOperationKind` enum: `ExternalCall` or `DatabaseWrite`.
|
||||
- **TargetSummary** — external system + method name for an `ExternalCall`; for a
|
||||
`DatabaseWrite`, just the database connection name — intentionally not the SQL
|
||||
statement or table, as a scoping choice.
|
||||
- **Status** — `Pending`, `Retrying`, `Delivered`, `Parked`, `Failed`, `Discarded`.
|
||||
- **RetryCount** — attempts so far.
|
||||
- **LastError** — most recent error detail, if any.
|
||||
- **Provenance** — source instance / script.
|
||||
- **CreatedAtUtc**, **UpdatedAtUtc**, **TerminalAtUtc** — key timestamps.
|
||||
|
||||
## Status Lifecycle
|
||||
|
||||
`Pending → Retrying → Delivered / Parked / Failed / Discarded`
|
||||
|
||||
- **Pending** — non-terminal: buffered after a transient failure, awaiting its
|
||||
first retry.
|
||||
- **Retrying** — non-terminal: undergoing retry attempts.
|
||||
- **Delivered** — terminal, success. A cached call that succeeds on its first
|
||||
immediate attempt is recorded directly as `Delivered`.
|
||||
- **Parked** — non-terminal: transient retries exhausted; awaiting manual action.
|
||||
- **Failed** — terminal: permanent failure (e.g. HTTP 4xx). The error was also
|
||||
returned synchronously to the calling script; the record captures it. `Failed`
|
||||
rows are **not operator-actionable** — see Retry / Discard Relay.
|
||||
- **Discarded** — terminal, reached **only by operator action** on a `Parked`
|
||||
row. The row is kept (not deleted) so the table remains a complete audit
|
||||
record.
|
||||
|
||||
The site is the source of truth. The `SiteCalls` row is an eventually-consistent
|
||||
mirror — never queried by scripts (`Tracking.Status()` is answered site-locally).
|
||||
|
||||
## Ingest & Idempotency
|
||||
|
||||
Telemetry ingestion is **insert-if-not-exists** keyed on `TrackedOperationId`,
|
||||
then **upsert-on-newer-status**. The lifecycle is monotonic, so status only
|
||||
advances and never regresses; at-least-once and out-of-order telemetry are
|
||||
therefore harmless.
|
||||
|
||||
From v1.x onward, the `CachedCallTelemetry` message additively carries the
|
||||
`AuditEvent` content alongside the existing operational fields. Central's
|
||||
`AuditLogIngestActor` (Audit Log #23) performs both the immutable `AuditLog`
|
||||
insert and the `SiteCalls` upsert in a single transaction. Idempotency keys
|
||||
remain `EventId` (for `AuditLog`) and `TrackedOperationId` (for `SiteCalls`).
|
||||
See [Component-AuditLog.md](Component-AuditLog.md), Cached Operations —
|
||||
Combined Telemetry, for the dual-write contract.
|
||||
|
||||
## Reconciliation
|
||||
|
||||
Because telemetry is best-effort, `SiteCallAuditActor` periodically — and on site
|
||||
reconnect — pulls "all tracking rows changed since cursor X" from each site.
|
||||
Gaps left by lost telemetry self-heal. Central converges to the site; the site
|
||||
never depends on central.
|
||||
|
||||
## Retry / Discard Relay
|
||||
|
||||
Parked cached calls live in the owning site's S&F buffer. Operator Retry/Discard
|
||||
from the Central UI is relayed to that site as a `RetryParkedOperation` /
|
||||
`DiscardParkedOperation` command over the command/control channel. The site
|
||||
applies the change and emits telemetry reflecting the new state; central never
|
||||
mutates the `SiteCalls` row directly. If the site is offline, the command fails
|
||||
fast and the UI surfaces a "site unreachable" message.
|
||||
|
||||
Only `Parked` rows are operator-actionable. `Failed` rows offer no Retry or
|
||||
Discard: a permanent failure (e.g. HTTP 4xx) would simply fail again, and the
|
||||
error was already returned synchronously to the calling script — there is
|
||||
nothing for an operator to recover.
|
||||
|
||||
## KPIs
|
||||
|
||||
Point-in-time, computed from the `SiteCalls` table, global and per-source-site,
|
||||
mirroring the Notification Outbox KPI shape:
|
||||
|
||||
- Buffered count (`Pending` + `Retrying`)
|
||||
- Parked count
|
||||
- Failed-last-interval
|
||||
- Delivered-last-interval
|
||||
- Oldest-pending age
|
||||
- Stuck count — `Pending`/`Retrying` older than a configurable threshold
|
||||
(default 10 minutes); display-only, no escalation.
|
||||
|
||||
## Retention
|
||||
|
||||
Daily purge of terminal rows (`Delivered`, `Failed`, `Discarded`) after a
|
||||
configurable window (default 365 days), matching the `Notifications` purge.
|
||||
|
||||
## Dependencies
|
||||
|
||||
- **Configuration Database**: hosts the `SiteCalls` table and its repository.
|
||||
- **Central–Site Communication**: receives cached-call telemetry and reconciliation
|
||||
responses; sends Retry/Discard commands.
|
||||
- **Store-and-Forward Engine**: the site-side origin of cached-call telemetry and
|
||||
the executor of relayed Retry/Discard commands.
|
||||
- **Audit Log (#23)**: shares the `CachedCallTelemetry` packet — each lifecycle
|
||||
transition (`CachedEnqueued`, `CachedAttempt`, `CachedTerminal`) carries an
|
||||
`AuditEvent` alongside the operational fields, and central's
|
||||
`AuditLogIngestActor` performs the `AuditLog` insert and the `SiteCalls`
|
||||
upsert in a single transaction (see [Component-AuditLog.md](Component-AuditLog.md),
|
||||
Cached Operations — Combined Telemetry).
|
||||
- **Commons**: `TrackedOperationId`, status enum, telemetry message contracts.
|
||||
|
||||
## Interactions
|
||||
|
||||
- **Central UI**: the Site Calls page queries this component and issues
|
||||
Retry/Discard actions.
|
||||
- **Health Monitoring**: surfaces Site Call Audit KPI tiles on the dashboard.
|
||||
- **Cluster Infrastructure**: hosts the `SiteCallAuditActor` singleton with
|
||||
active/standby failover.
|
||||
@@ -25,12 +25,13 @@ Site clusters (event recording and storage). Central cluster (remote query acces
|
||||
| Data Connection Status | Connected, disconnected, reconnected (per connection) |
|
||||
| Store-and-Forward | Message queued, delivered, retried, parked |
|
||||
| Instance Lifecycle | Instance enabled, disabled, deleted |
|
||||
| Notification | Site→central forward failure, long-buffered notification (still in the site buffer past a threshold) |
|
||||
|
||||
## Event Entry Schema
|
||||
|
||||
Each event entry contains:
|
||||
- **Timestamp**: When the event occurred.
|
||||
- **Event Type**: Category of the event (script, alarm, deployment, connection, store-and-forward, instance-lifecycle).
|
||||
- **Event Type**: Category of the event (script, alarm, deployment, connection, store-and-forward, instance-lifecycle, notification).
|
||||
- **Severity**: Info, Warning, or Error.
|
||||
- **Instance ID** *(optional)*: The instance associated with the event (if applicable).
|
||||
- **Source**: The subsystem that generated the event (e.g., "ScriptActor:MonitorSpeed", "AlarmActor:OverTemp", "DataConnection:PLC1").
|
||||
@@ -62,11 +63,12 @@ Each event entry contains:
|
||||
- **Communication Layer**: Handles remote query requests from central.
|
||||
- **Site Runtime**: Generates script execution events, alarm events, deployment application events, and instance lifecycle events.
|
||||
- **Data Connection Layer**: Generates connection status events.
|
||||
- **Store-and-Forward Engine**: Generates buffer activity events.
|
||||
- **Store-and-Forward Engine**: Generates buffer activity events, including notification-category forward failures and long-buffered notifications on the site→central notification path.
|
||||
|
||||
## Interactions
|
||||
|
||||
- **All site subsystems**: Event logging is a cross-cutting concern — any subsystem that produces notable events calls the Event Logging service.
|
||||
- **Communication Layer**: Receives remote queries from central and returns results.
|
||||
- **Central UI**: Site Event Log Viewer displays queried events.
|
||||
- **Store-and-Forward Engine**: Its notification path (the site→central forward of script-generated notifications) reports forward failures and long-buffered notifications as Notification-category events. Routine enqueue and forward-success events are deliberately not logged — central's `Notifications` table (owned by the Notification Outbox component) is the authoritative audit record; site-side logging covers only the in-transit blind spot when central is unreachable.
|
||||
- **Health Monitoring**: Script error rates and alarm evaluation error rates can be derived from event log data.
|
||||
|
||||
@@ -66,7 +66,7 @@ Deployment Manager Singleton (Cluster Singleton)
|
||||
- Reports deployment result (success/failure) back to central.
|
||||
|
||||
### System-Wide Artifact Handling
|
||||
- Receives updated shared scripts, external system definitions, database connection definitions, data connection definitions, notification lists, and SMTP configuration from central.
|
||||
- Receives updated shared scripts, external system definitions, database connection definitions, and data connection definitions from central. (Notification lists and SMTP configuration are central-only and are not deployed to sites — see Component-NotificationService.md.)
|
||||
- Stores all artifacts in local SQLite. After artifact deployment, the site is fully self-contained — all runtime configuration is read from local SQLite with no access to the central configuration database.
|
||||
- Recompiles shared scripts and makes updated code available to all Script Actors.
|
||||
|
||||
@@ -254,14 +254,19 @@ Available to all Script Execution Actors and Alarm Execution Actors:
|
||||
|
||||
### External Systems
|
||||
- `ExternalSystem.Call("systemName", "methodName", params)` — Synchronous HTTP call. Blocks until response or timeout. All failures return to script. Use when the script needs the result.
|
||||
- `ExternalSystem.CachedCall("systemName", "methodName", params)` — Fire-and-forget with store-and-forward on transient failure. Use for outbound data pushes where deferred delivery is acceptable.
|
||||
- `ExternalSystem.CachedCall("systemName", "methodName", params)` — Deferred delivery. Returns a `TrackedOperationId` tracking handle immediately rather than the response; the call is attempted immediately and, on transient failure, store-and-forwarded for retry. Use for outbound data pushes where deferred delivery is acceptable.
|
||||
- The returned `TrackedOperationId` can be passed to `Tracking.Status(id)` (see **Operation Tracking** below) to observe delivery progress.
|
||||
|
||||
### Notifications
|
||||
- `Notify.To("listName").Send("subject", "message")` — Send an email notification via a named notification list.
|
||||
- `Notify.To("listName").Send("subject", "message")` — Send a notification via a named notification list. Generates a `TrackedOperationId` (GUID) locally and returns it immediately; the notification is store-and-forwarded to the central cluster, which owns delivery. The script never contacts SMTP. (`NotificationId` is the notification-domain name for this same `TrackedOperationId` type.)
|
||||
- `Notify.Status("trackedOperationId")` — A thin alias of `Tracking.Status(id)` retained for the notification domain. Returns a status record (status, retry count, last error, key timestamps). While the notification is still in the site store-and-forward buffer the site answers locally (status `Forwarding`); once forwarded the query round-trips to central.
|
||||
|
||||
### Database Access
|
||||
- `Database.Connection("connectionName")` — Obtain a raw MS SQL client connection (ADO.NET) for synchronous read/write.
|
||||
- `Database.CachedWrite("connectionName", "sql", parameters)` — Submit a write operation for store-and-forward delivery.
|
||||
- `Database.CachedWrite("connectionName", "sql", parameters)` — Submit a write operation for store-and-forward delivery. Returns a `TrackedOperationId` tracking handle immediately; pass it to `Tracking.Status(id)` to observe delivery progress.
|
||||
|
||||
### Operation Tracking
|
||||
- `Tracking.Status("trackedOperationId")` — Returns a status record (status, retry count, last error, key timestamps) for any tracked operation: a cached external system call, a cached database write, or a notification. For cached calls and writes the answer is always site-local and authoritative — the site owns the operation tracking table. (`Notify.Status(...)` is a thin alias scoped to the notification domain.)
|
||||
|
||||
### Parameter Access
|
||||
- `Parameters["key"]` — Raw dictionary access (returns `object?`, requires manual casting).
|
||||
@@ -282,13 +287,17 @@ Available to all Script Execution Actors and Alarm Execution Actors:
|
||||
|
||||
Scripts execute **in-process** with constrained access. The following restrictions are enforced at compilation and runtime:
|
||||
|
||||
- **Allowed**: Access to the Script Runtime API (GetAttribute, SetAttribute, CallScript, CallShared, ExternalSystem, Notify, Database), standard C# language features, basic .NET types (collections, string manipulation, math, date/time).
|
||||
- **Allowed**: Access to the Script Runtime API (GetAttribute, SetAttribute, CallScript, CallShared, ExternalSystem, Notify, Database, Tracking), standard C# language features, basic .NET types (collections, string manipulation, math, date/time).
|
||||
- **Forbidden**: File system access (`System.IO`), process spawning (`System.Diagnostics.Process`), threading (`System.Threading` — except async/await), reflection (`System.Reflection`), raw network access (`System.Net.Sockets`, `System.Net.Http` — must use `ExternalSystem.Call`), assembly loading, unsafe code.
|
||||
- **Execution timeout**: Configurable per-script maximum execution time. Exceeding the timeout cancels the script and logs an error.
|
||||
- **Memory**: Scripts share the host process memory. No per-script memory limit, but the execution timeout prevents runaway allocations.
|
||||
|
||||
These constraints are enforced by restricting the set of assemblies and namespaces available to the script compilation context.
|
||||
|
||||
### Script Trust Boundary Auditing
|
||||
|
||||
Every script-trust-boundary call (External System Gateway, Database layer, Notify) emits an `AuditEvent` to `IAuditWriter` (site-local SQLite append). This is a hot path: an audit write never fails the calling action, and write failures are reported via the `SiteAuditWriteFailures` health metric (see [Component-HealthMonitoring.md](Component-HealthMonitoring.md)). The central audit mirror and event schema live in [Component-AuditLog.md](Component-AuditLog.md).
|
||||
|
||||
## Script Scoping Rules
|
||||
|
||||
- Scripts can only read/write attributes on **their own instance** (via the parent Instance Actor).
|
||||
@@ -353,13 +362,12 @@ Per Akka.NET best practices, internal actor communication uses **Tell** (fire-an
|
||||
## Dependencies
|
||||
|
||||
- **Data Connection Layer**: Provides tag value updates to Instance Actors. Receives write requests from Instance Actors.
|
||||
- **Store-and-Forward Engine**: Handles reliable delivery for external system calls, notifications, and cached database writes submitted by scripts.
|
||||
- **Store-and-Forward Engine**: Handles reliable delivery for external system calls, cached database writes, and notifications submitted by scripts. For the notification category specifically, it forwards to the central cluster for delivery (not directly to SMTP). Owns the site-local operation tracking table that backs `Tracking.Status(id)`.
|
||||
- **External System Gateway**: Provides external system method invocations for scripts.
|
||||
- **Notification Service**: Handles email delivery for scripts.
|
||||
- **Communication Layer**: Receives deployments and lifecycle commands from central. Handles debug view requests. Reports deployment results.
|
||||
- **Site Event Logging**: Records script executions, alarm events, deployment events, instance lifecycle events.
|
||||
- **Health Monitoring**: Reports script error rates and alarm evaluation error rates.
|
||||
- **Local SQLite**: Persists deployed configurations, system-wide artifacts (external system definitions, database connection definitions, data connection definitions, notification lists, SMTP configuration).
|
||||
- **Local SQLite**: Persists deployed configurations and system-wide artifacts (external system definitions, database connection definitions, data connection definitions). Sites also maintain peer SQLite stores for the Store-and-Forward buffer, the site event log, the operation tracking table, and the site-local `AuditLog` (see [Component-AuditLog.md](Component-AuditLog.md)). The `AuditLog` file is purged on the same daily cadence as the others but respects the hard `ForwardState` invariant — rows whose `ForwardState` is still `Pending` are never purged, regardless of age.
|
||||
|
||||
## Interactions
|
||||
|
||||
|
||||
@@ -13,14 +13,16 @@ Site clusters only. The central cluster does not buffer messages.
|
||||
- Buffer outbound messages when the target system is unavailable.
|
||||
- Manage three categories of buffered messages:
|
||||
- External system API calls.
|
||||
- Email notifications.
|
||||
- Notifications forwarded to the central cluster.
|
||||
- Cached database writes.
|
||||
- Retry delivery per message according to the configured retry policy.
|
||||
- Park messages that exhaust their retry limit (dead-letter).
|
||||
- Persist buffered messages to local SQLite for durability.
|
||||
- Maintain a site-local **operation tracking table** holding one row per `TrackedOperationId` for cached calls (`ExternalCall` and `DatabaseWrite`) — the authoritative status record consulted by `Tracking.Status(id)`.
|
||||
- Emit cached-call lifecycle telemetry to the central Site Call Audit component on every status transition.
|
||||
- Replicate buffered messages to the standby node via application-level replication over Akka.NET remoting.
|
||||
- On failover, the standby node takes over delivery from its replicated copy.
|
||||
- Respond to remote queries from central for parked message management (list, retry, discard).
|
||||
- Respond to remote queries from central for parked message management (list, retry, discard), including central-driven Retry/Discard of parked cached calls.
|
||||
|
||||
## Message Lifecycle
|
||||
|
||||
@@ -42,16 +44,23 @@ Attempt immediate delivery
|
||||
└── Max retries exhausted → Park message
|
||||
```
|
||||
|
||||
For notifications, "delivery" means forwarding the message to the central cluster via Central–Site Communication; "success" is central's ack, on which the message is cleared. Notifications do not park — they are retried at the fixed forward interval until central acks. Parking applies only to the external-system-call and cached-database-write categories.
|
||||
|
||||
For the cached-call categories (`ExternalCall` and `DatabaseWrite`), the operation tracking table is the status record and the S&F buffer is purely the retry mechanism. A cached call that succeeds on its first immediate attempt is written directly as a terminal `Delivered` tracking row and never enters the S&F buffer. When immediate delivery fails transiently, the message is buffered and its tracking row moves to `Pending`/`Retrying`; the buffered message carries its `TrackedOperationId` so the tracking row and the retry record stay linked. When immediate delivery fails **permanently** (e.g. HTTP 4xx), the message is not buffered — the error is returned synchronously to the calling script as before — but the tracking row is written directly as a terminal `Failed` row capturing the error. On every tracking-table status transition the site emits `CachedCallTelemetry` to central.
|
||||
|
||||
Every cached-call outcome maps to a tracking-table state: immediate success → `Delivered`; transient failure → `Pending`/`Retrying`, eventually `Delivered` or `Parked`; permanent failure → terminal `Failed`; operator discard of a parked row → terminal `Discarded`.
|
||||
|
||||
## Retry Policy
|
||||
|
||||
Retry settings are defined on the **source entity** (not per-message):
|
||||
For the external-system-call and cached-database-write categories, retry settings are defined on the **source entity** (not per-message):
|
||||
- **External systems**: Each external system definition includes max retry count and time between retries.
|
||||
- **Notifications**: Email/SMTP configuration includes max retry count and time between retries.
|
||||
- **Cached database writes**: Each database connection definition includes max retry count and time between retries.
|
||||
|
||||
The **notification** category retries differently: it has no source-entity setting. The site→central forward uses a single fixed retry interval configured in the host `appsettings.json`. This interval is infrastructure config for reaching the central cluster, not a per-notification-list setting. It applies uniformly to every buffered notification regardless of its target list. A buffered notification is retried until central acks it; it is not parked on a retry limit (central, once reachable, owns delivery, retry, and parking from that point on).
|
||||
|
||||
The retry interval is **fixed** (not exponential backoff). Fixed interval is sufficient for the expected use cases.
|
||||
|
||||
**Note**: Only **transient failures** are eligible for store-and-forward buffering. For external system calls, transient failures are connection errors, timeouts, and HTTP 5xx responses. Permanent failures (HTTP 4xx) are returned directly to the calling script and are **not** queued for retry. This prevents the buffer from accumulating requests that will never succeed.
|
||||
**Note**: Only **transient failures** are eligible for store-and-forward buffering. For external system calls, transient failures are connection errors, timeouts, and HTTP 5xx responses. Permanent failures (HTTP 4xx) are returned directly to the calling script and are **not** queued for retry. This prevents the buffer from accumulating requests that will never succeed. For the cached-call categories, a permanent failure additionally sets the operation's tracking-table row to terminal `Failed`, capturing the error — so even a never-buffered cached call has an authoritative status record. `Failed` rows are not operator-actionable: a permanent failure would only fail again, and the error was already returned to the script.
|
||||
|
||||
## Buffer Size
|
||||
|
||||
@@ -65,6 +74,22 @@ There is **no maximum buffer size**. Messages accumulate in the buffer until del
|
||||
- On failover, the new active node has a near-complete copy of the buffer. In rare cases, the most recent operations may not have been replicated (e.g., a message added or removed just before failover). This can result in a few **duplicate deliveries** (message delivered but remove not replicated) or a few **missed retries** (message added but not replicated). Both are acceptable trade-offs for the latency benefit.
|
||||
- On failover, the new active node resumes delivery from its local copy.
|
||||
|
||||
### Operation Tracking Table
|
||||
|
||||
Alongside the S&F buffer DB, each site node holds a **site-local operation tracking table** in SQLite. It carries one row per `TrackedOperationId` for cached calls (`ExternalCall` and `DatabaseWrite`), created the moment the script issues the cached call and kept regardless of outcome.
|
||||
|
||||
- This table is the **status record**; the S&F buffer remains purely the **retry mechanism**. A buffered cached-call message carries its `TrackedOperationId`, which links it back to its tracking row.
|
||||
- Each row records the operation kind (`TrackedOperationKind`), a target summary (external system + method, or database connection name), the unified `TrackedOperationStatus`, retry count, last error, source provenance (instance / script), and the created/updated/terminal UTC timestamps.
|
||||
- `Tracking.Status(id)` reads this table. For cached calls the **site is the authoritative source of truth** for status — the query is always answered site-locally, even when central is unreachable. The central Site Call Audit `SiteCalls` table is an eventually-consistent mirror.
|
||||
- A cached call that succeeds on its first immediate attempt writes a terminal `Delivered` row directly here, with nothing placed in the S&F buffer.
|
||||
- Terminal rows are purged after a configurable retention window (default 7 days) — the site holds live operational state; central holds long-term audit.
|
||||
|
||||
Notifications are unaffected: they have no tracking table. Their `NotificationId` and status are owned by the central `Notifications` table, and their lifecycle continues to forward to central exactly as before.
|
||||
|
||||
### Telemetry to Central
|
||||
|
||||
On every tracking-table status transition, the site emits a `CachedCallTelemetry` message to the central Site Call Audit component over the site→central channel. Emission is best-effort, at-least-once, and idempotent on `TrackedOperationId`. Because telemetry is best-effort, the site also responds to `CachedCallReconcileRequest` reconciliation pulls — cursor-based per-site reads of tracking rows changed since a cursor — so any missed telemetry self-heals. The site never depends on central; central converges to the site.
|
||||
|
||||
## Parked Message Management
|
||||
|
||||
- Parked messages remain stored at the site in SQLite.
|
||||
@@ -72,31 +97,36 @@ There is **no maximum buffer size**. Messages accumulate in the buffer until del
|
||||
- Operators can:
|
||||
- **Retry** a parked message (moves it back to the retry queue).
|
||||
- **Discard** a parked message (removes it permanently).
|
||||
- Store-and-forward messages are **not** automatically cleared when an instance is deleted. Pending and parked messages continue to exist and can be managed via the central UI.
|
||||
- For parked cached calls, Retry/Discard can be driven centrally: the Site Call Audit component relays `RetryParkedOperation` / `DiscardParkedOperation` commands (keyed by `TrackedOperationId`) down to the owning site. The site applies the command to its S&F buffer and tracking table, then emits `CachedCallTelemetry` reflecting the new state (`Retrying` or `Discarded`) — central never mutates its mirror row directly.
|
||||
- Store-and-forward messages are **not** automatically cleared when an instance is deleted. Pending and parked messages, and their tracking rows, continue to exist and can be managed via the central UI.
|
||||
|
||||
## Message Format
|
||||
|
||||
Each buffered message stores:
|
||||
- **Message ID**: Unique identifier.
|
||||
- **Category**: External system call, notification, or cached database write.
|
||||
- **Target**: External system name, notification list name, or database connection name.
|
||||
- **Payload**: Serialized message content (API method + parameters, email subject + body, SQL + parameters).
|
||||
- **Tracked Operation ID**: For the cached-call categories, the `TrackedOperationId` linking the buffered message to its row in the operation tracking table. Not used by the notification category, which is tracked centrally via its `NotificationId`.
|
||||
- **Target**: External system name, the central cluster (for notifications), or database connection name.
|
||||
- **Payload**: Serialized message content (API method + parameters; notification list name + subject + body plus the locally generated `NotificationId` and source provenance; SQL + parameters).
|
||||
- **Retry Count**: Number of attempts so far.
|
||||
- **Created At**: Timestamp when the message was first queued.
|
||||
- **Last Attempt At**: Timestamp of the most recent delivery attempt.
|
||||
- **Status**: Pending, retrying, or parked.
|
||||
- **Status**: Pending, retrying, or parked. This is the **buffer message's** retry state, distinct from the operation's `TrackedOperationStatus` lifecycle in the operation tracking table. A buffer message exists only while a cached call is mid-retry, so it never carries the terminal `Delivered`, `Failed`, or `Discarded` states — those live solely on the tracking row.
|
||||
|
||||
## Dependencies
|
||||
|
||||
- **SQLite**: Local persistence on each node.
|
||||
- **Communication Layer**: Application-level replication to standby node; remote query handling from central.
|
||||
- **Communication Layer**: Application-level replication to standby node; remote query handling from central; carries buffered notifications to the central cluster (ClusterClient) and receives central's acks.
|
||||
- **External System Gateway**: Delivers external system API calls.
|
||||
- **Notification Service**: Delivers email notifications.
|
||||
- **Central–Site Communication**: The delivery target for the notification category — a buffered notification is forwarded to the central cluster over Central–Site Communication and cleared on central's ack. Also carries `CachedCallTelemetry` and reconciliation responses to central, and receives `RetryParkedOperation` / `DiscardParkedOperation` commands.
|
||||
- **Site Call Audit**: The central audit mirror for cached calls — receives this engine's cached-call telemetry and reconciliation responses, and relays operator Retry/Discard of parked cached calls back as commands.
|
||||
- **Database Connections**: Delivers cached database writes.
|
||||
- **Site Event Logging**: Logs store-and-forward activity (queued, delivered, retried, parked).
|
||||
|
||||
## Interactions
|
||||
|
||||
- **Site Runtime (Script Actors)**: Scripts submit messages to the buffer (external calls, notifications, cached DB writes).
|
||||
- **Communication Layer**: Handles parked message queries/commands from central.
|
||||
- **Health Monitoring**: Reports buffer depth metrics.
|
||||
- **Communication Layer**: Handles parked message queries/commands from central; carries buffered notifications to the central cluster.
|
||||
- **Notification Outbox**: The central destination for the notification category — central ingests each forwarded notification into the `Notifications` table and acks the site, on which the engine clears the buffered message.
|
||||
- **Site Call Audit**: The central observability sibling for the cached-call categories — this engine emits `CachedCallTelemetry` on every tracking-table transition, answers `CachedCallReconcileRequest` pulls, and executes the `RetryParkedOperation` / `DiscardParkedOperation` commands it relays.
|
||||
- **Health Monitoring**: Reports buffer depth metrics, including the notification backlog covering the site→central forward leg.
|
||||
|
||||
@@ -33,7 +33,7 @@
|
||||
- **Pre-deployment validation**: Before any deployment is sent to a site, the central cluster performs comprehensive validation including flattening the configuration, test-compiling all scripts, verifying alarm trigger references, verifying script trigger references, and checking data connection binding completeness (see Section 3.11).
|
||||
|
||||
### 1.5 System-Wide Artifact Deployment
|
||||
- Changes to shared scripts, external system definitions, database connection definitions, data connection definitions, notification lists, and SMTP configuration are **not automatically propagated** to sites.
|
||||
- Changes to shared scripts, external system definitions, database connection definitions, and data connection definitions are **not automatically propagated** to sites. (Notification lists and SMTP configuration are central-only and are not deployed to sites — see Section 6.)
|
||||
- Deployment of system-wide artifacts requires **explicit action** by a user with the **Deployment** role.
|
||||
- Artifacts can be deployed to **all sites at once** or to an **individual site** (per-site deployment).
|
||||
- The Design role manages the definitions; the Deployment role triggers deployment to sites. A user may hold both roles.
|
||||
@@ -55,7 +55,7 @@
|
||||
|
||||
### 2.3 Site-Level Storage & Interface
|
||||
- Sites have **no user interface** — they are headless collectors, forwarders, and script executors.
|
||||
- Sites require local storage for: the current deployed (flattened) configurations, deployed scripts, shared scripts, external system definitions, database connection definitions, data connection definitions, notification lists, and SMTP configuration.
|
||||
- Sites require local storage for: the current deployed (flattened) configurations, deployed scripts, shared scripts, external system definitions, database connection definitions, and data connection definitions. (Notification lists and SMTP configuration are central-only and are not stored at sites — see Section 6.)
|
||||
- After artifact deployment, sites are **fully self-contained** — all runtime configuration is read from local SQLite. Sites do **not** access the central configuration database at runtime.
|
||||
- Store-and-forward buffers are persisted to a **local SQLite database on each node** and replicated between nodes via application-level replication (see 1.3).
|
||||
|
||||
@@ -231,7 +231,7 @@ Scripts executing on a site for a given instance can:
|
||||
- **Write** attribute values on that instance. For attributes with a data source reference, the write goes to the Data Connection Layer which writes to the physical device; the in-memory value updates when the device confirms the new value via the existing subscription. For static attributes, the write updates the in-memory value and **persists the override to local SQLite** — the value survives restart and failover. Persisted overrides are reset when the instance is redeployed.
|
||||
- **Call other scripts** on that instance via `Instance.CallScript("scriptName", params)`. Calls use the Akka ask pattern and return the called script's return value. Script-to-script calls support concurrent execution.
|
||||
- **Call shared scripts** via `Scripts.CallShared("scriptName", params)`. Shared scripts execute **inline** in the calling Script Actor's context — they are compiled code libraries, not separate actors.
|
||||
- **Call external system API methods** in two modes: `ExternalSystem.Call()` for synchronous request/response, or `ExternalSystem.CachedCall()` for fire-and-forget with store-and-forward on transient failure (see Section 5).
|
||||
- **Call external system API methods** in two modes: `ExternalSystem.Call()` for synchronous request/response, or `ExternalSystem.CachedCall()` for deferred delivery — it returns a `TrackedOperationId` tracking handle immediately and store-and-forwards the call on transient failure (see Section 5).
|
||||
- **Send notifications** (see Section 6).
|
||||
- **Access databases** by requesting an MS SQL client connection by name (see Section 5.5).
|
||||
|
||||
@@ -286,7 +286,7 @@ Scripts **cannot** access other instances' attributes or scripts.
|
||||
- Parked messages are **stored at the site** where they originated.
|
||||
- The **central UI** can **query sites** for parked messages and manage them remotely.
|
||||
- Operators can **retry** or **discard** parked messages from the central UI.
|
||||
- Parked message management covers **external system calls**, **notifications**, and **cached database writes**.
|
||||
- Parked message management covers **external system calls** and **cached database writes**. (Notifications are not parked at sites — they are store-and-forwarded to central, which owns delivery and parking; see Section 6.)
|
||||
|
||||
### 5.5 Database Connections
|
||||
- Database connections are **predefined, named resources** created by users with the **Design** role.
|
||||
@@ -306,23 +306,23 @@ Scripts can interact with databases in two modes:
|
||||
|
||||
### 6.1 Notification Lists
|
||||
- Notification lists are **system-wide**, managed by users with the **Design** role.
|
||||
- Each list has a **name** and contains one or more **recipients**.
|
||||
- Each recipient has a **name** and an **email address**.
|
||||
- Notification lists are deployed to **all sites** (deployment requires explicit action by a user with the Deployment role).
|
||||
- At the site, notification lists and recipients are read from **local SQLite** (populated by artifact deployment), not from the central config DB.
|
||||
- Each list has a **name** and a **`Type`** (`Email` now; `Teams` and other types planned). The type determines the type-specific targets the list carries.
|
||||
- An `Email` list contains one or more **recipients**, each with a **name** and an **email address**.
|
||||
- Notification lists are defined and stored **centrally only** — they are **not deployed to sites** and have no local SQLite copy. Recipient resolution happens at central, at delivery time.
|
||||
|
||||
### 6.2 Email Support
|
||||
- The system has **predefined support for sending email** as the notification delivery mechanism.
|
||||
- Email server configuration (SMTP settings) is defined centrally and deployed to all sites as part of **artifact deployment** (see Section 1.5). Sites read SMTP configuration from **local SQLite**.
|
||||
- Email server configuration (SMTP settings) is defined and stored **centrally only**. Sites never talk to SMTP; all delivery happens on the central cluster.
|
||||
|
||||
### 6.3 Script API
|
||||
- Scripts send notifications using a simplified API: `Notify.To("list name").Send("subject", "message")`
|
||||
- This API is available to instance scripts, alarm on-trigger scripts, and shared scripts.
|
||||
- `Send()` generates a `NotificationId` (GUID) locally and returns it to the script immediately; `Notify.Status(notificationId)` returns a status record (status, retry count, last error, key timestamps).
|
||||
|
||||
### 6.4 Store-and-Forward for Notifications
|
||||
- If the email server is unavailable, notifications are **buffered locally at the site**.
|
||||
- Follows the same retry pattern as external system calls: configurable **max retry count** and **time between retries** (fixed interval).
|
||||
- After max retries are exhausted, the notification is **parked** for manual review (managed via central UI alongside external system parked messages).
|
||||
### 6.4 Notification Delivery and Store-and-Forward
|
||||
- Notification delivery is **inverted to the central cluster**. A site script's notification is **store-and-forwarded to central** (target = central, not SMTP); the central **Notification Outbox** logs every notification to a `Notifications` audit table and owns dispatch, delivery, retry, and parking.
|
||||
- If the central cluster is unreachable, the notification is **buffered locally at the site** by the Store-and-Forward Engine and retried at a fixed forward interval until central acks it. The site→central forward does not park.
|
||||
- Once central holds the notification, delivery retry reuses the central SMTP configuration's **max retry count** and fixed **time between retries**. After retries are exhausted, or on a permanent failure, the notification is **parked** for review on the Central UI Notification Outbox page.
|
||||
- There is **no maximum buffer size** for notification messages.
|
||||
|
||||
## 7. Inbound API (Central)
|
||||
@@ -370,10 +370,11 @@ The central cluster hosts a **configuration and management UI** (no live machine
|
||||
- **Site & Data Connection Management**: Define sites (including optional NodeAAddress and NodeBAddress fields for Akka remoting paths, and optional GrpcNodeAAddress and GrpcNodeBAddress fields for gRPC streaming endpoints), manage data connections and assign them to sites.
|
||||
- **Area Management**: Define hierarchical area structures per site for organizing instances.
|
||||
- **Deployment**: View diffs between deployed and current template-derived configurations, deploy updates to individual instances. Filter instances by area. Pre-deployment validation runs automatically before any deployment is sent.
|
||||
- **System-Wide Artifact Deployment**: Explicitly deploy shared scripts, external system definitions, database connection definitions, data connection definitions, notification lists, and SMTP configuration to all sites or to an individual site (requires Deployment role). Per-site deployment is available via the Sites admin page.
|
||||
- **System-Wide Artifact Deployment**: Explicitly deploy shared scripts, external system definitions, database connection definitions, and data connection definitions to all sites or to an individual site (requires Deployment role). Per-site deployment is available via the Sites admin page. (Notification lists and SMTP configuration are central-only and are not deployed.)
|
||||
- **Deployment Status Monitoring**: Track whether deployments were successfully applied at site level.
|
||||
- **Debug View**: On-demand real-time view of a specific instance's tag values and alarm states for troubleshooting (see 8.1).
|
||||
- **Parked Message Management**: Query sites for parked messages (external system calls, notifications, and cached database writes), retry or discard them.
|
||||
- **Parked Message Management**: Query sites for parked messages (external system calls and cached database writes), retry or discard them.
|
||||
- **Notification Outbox**: Monitor centrally-delivered notifications — KPI tiles and a queryable `Notifications` audit list with Retry/Discard actions on parked notifications (see Section 6).
|
||||
- **Health Monitoring Dashboard**: View site cluster health, node status, data connection health, script error rates, alarm evaluation errors, and store-and-forward buffer depths (see Section 11).
|
||||
- **Site Event Log Viewer**: Query and view operational event logs from site clusters (see Section 12).
|
||||
|
||||
@@ -422,7 +423,7 @@ All system-modifying actions are logged, including:
|
||||
- **Alarm changes**: Create, edit, delete alarm definitions.
|
||||
- **Instance changes**: Create, override values, bind connections, area assignment, disable, enable, delete.
|
||||
- **Deployments**: Who deployed what to which instance, and the result (success/failure).
|
||||
- **System-wide artifact deployments**: Who deployed shared scripts / external system definitions / DB connections / data connections / notification lists / SMTP config, to which site(s), and the result.
|
||||
- **System-wide artifact deployments**: Who deployed shared scripts / external system definitions / DB connections / data connections, to which site(s), and the result.
|
||||
- **External system definition changes**: Create, edit, delete.
|
||||
- **Database connection changes**: Create, edit, delete.
|
||||
- **Notification list changes**: Create, edit, delete lists and recipients.
|
||||
@@ -439,6 +440,25 @@ All system-modifying actions are logged, including:
|
||||
### 10.4 Transactional Guarantee
|
||||
- Audit entries are written **synchronously** within the same database transaction as the change (via the unit-of-work pattern). If the change succeeds, the audit entry is guaranteed to be recorded. If the change rolls back, the audit entry rolls back too.
|
||||
|
||||
### 10.5 Centralized Audit Log (Script Trust Boundary)
|
||||
|
||||
*See [Component-AuditLog.md](Component-AuditLog.md) (#23) for the full component design.*
|
||||
|
||||
Sections 10.1–10.4 cover **configuration-database audit** (config-mutating user actions via `IAuditService`). This subsection defines the separate **runtime Audit Log** that captures every action crossing the **script trust boundary** at sites and central:
|
||||
|
||||
- **AL-1**: The system maintains an **append-only** central Audit Log recording every script-trust-boundary action — outbound external system calls (sync `Call` and `CachedCall`), outbound database operations (sync `Connection` access and `CachedWrite`), notifications, and inbound API method invocations.
|
||||
- **AL-2**: For cached calls and notifications, the Audit Log captures **one row per lifecycle event** (e.g., enqueued, retrying, delivered, parked, discarded), not a single mutable row per operation.
|
||||
- **AL-3**: Site-originated events are appended to a **site-local SQLite hot-path** synchronously with the action, then **forwarded to central via gRPC telemetry**; central ingest is **idempotent on `EventId`** (insert-if-not-exists; the `AuditLog` table is strictly append-only, so rows are never updated after insert).
|
||||
- **AL-4**: A periodic **central→site reconciliation pull** detects and replays any telemetry events that were missed (e.g., during a central outage), making the central Audit Log eventually consistent with sites.
|
||||
- **AL-5**: Each row captures **payload metadata** (target, method, status, timings, correlation IDs) plus a **truncated request/response body** — **8 KB default**, expanded to **64 KB on error** outcomes.
|
||||
- **AL-6**: **HTTP headers are redacted by default**; **SQL parameter values are captured by default**. Per-target **redaction opt-in** is configurable on external systems, database connections, and inbound API methods.
|
||||
- **AL-7**: A failure to write or forward an audit row **never aborts the user-facing action** — the hot-path action proceeds and the audit record is recovered via the local hot-path buffer plus reconciliation.
|
||||
- **AL-8**: Central retention defaults to **365 days**, enforced by a **monthly partition switch-and-drop** purge — no row-by-row delete.
|
||||
- **AL-9**: The site SQLite Audit Log is purged only when `ForwardState ∈ {Forwarded, Reconciled}` — i.e., a row must be either confirmed-forwarded *or* confirmed-reconciled before it can be removed. A central outage therefore **cannot cause audit loss at sites**.
|
||||
- **AL-10**: The Central UI exposes an **Audit Log page** with a cross-channel filter (by site, target, status, time range, correlation ID), plus **drill-ins from existing operational pages** (Site Calls, Notification Outbox, Inbound API).
|
||||
- **AL-11**: Append-only semantics are **enforced via DB roles** (no UPDATE/DELETE granted on the `AuditLog` table to application accounts); a **tamper-evidence hash chain is deferred to v1.x**.
|
||||
- **AL-12**: The CLI provides a `scadalink audit` command group for query, export, and hash-chain verification (verify-chain becomes operational once AL-11's hash chain ships) against the central Audit Log.
|
||||
|
||||
## 11. Health Monitoring
|
||||
|
||||
### 11.1 Monitored Metrics
|
||||
@@ -448,7 +468,8 @@ The central cluster monitors the health of each site cluster, including:
|
||||
- **Data connection health**: Connected/disconnected status per data connection at the site.
|
||||
- **Script error rates**: Frequency of script failures at the site.
|
||||
- **Alarm evaluation errors**: Frequency of alarm evaluation failures at the site.
|
||||
- **Store-and-forward buffer depth**: Number of messages currently queued (broken down by external system calls, notifications, and cached database writes).
|
||||
- **Store-and-forward buffer depth**: Number of messages currently queued (broken down by external system calls, notifications, and cached database writes). The notification backlog covers the site→central forward leg.
|
||||
- **Notification Outbox KPIs**: Central-computed delivery-health metrics — queue depth, stuck count, and parked count — surfaced as headline tiles on the health dashboard. These are computed centrally from the `Notifications` table, not collected from sites.
|
||||
|
||||
### 11.2 Reporting
|
||||
- Site clusters **report health metrics to central** periodically.
|
||||
|
||||
@@ -29,18 +29,30 @@ For `appsettings.Development.json` (Notification Service):
|
||||
"Smtp": {
|
||||
"Server": "localhost",
|
||||
"Port": 1025,
|
||||
"AuthMode": "None",
|
||||
"AuthMode": "Basic",
|
||||
"Credentials": "test:test",
|
||||
"TlsMode": "None",
|
||||
"FromAddress": "scada-notifications@company.com",
|
||||
"ConnectionTimeout": 30
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
Since `MP_SMTP_AUTH_ACCEPT_ANY` is enabled, the Notification Service can use any auth mode:
|
||||
- **No auth**: Connect directly, no credentials needed.
|
||||
- **Basic Auth**: Any username/password will be accepted (useful for testing the auth code path without a real server).
|
||||
> **`Server` host**: use `localhost` only when the Notification Service runs directly on
|
||||
> the host. When it runs inside the docker cluster, set `Server` to the container name
|
||||
> `scadalink-smtp` — the cluster compose stack and the infra compose stack share the
|
||||
> `scadalink-net` network, so the container is reachable by name.
|
||||
|
||||
The delivery service (`MailKitSmtpClientWrapper`) only accepts `Basic` or `OAuth2` —
|
||||
there is no "no auth" mode — so the working config above uses `Basic`:
|
||||
- **Basic Auth**: `MP_SMTP_AUTH_ACCEPT_ANY` makes Mailpit accept any `username:password`,
|
||||
so use a throwaway value such as `test:test`. This exercises the real auth code path
|
||||
without a real server.
|
||||
- **OAuth2**: Not supported by Mailpit. For OAuth2 testing, use a real Microsoft 365 tenant.
|
||||
|
||||
`TlsMode` **must** be `None`: Mailpit on port 1025 is plain SMTP and does not offer
|
||||
STARTTLS. `StartTLS` or `SSL` would fail the connection.
|
||||
|
||||
## Mailpit API
|
||||
|
||||
Mailpit exposes a REST API at `http://localhost:8025/api` for programmatic access:
|
||||
|
||||
@@ -183,8 +183,9 @@ INSERT INTO [DataConnections] ([Id], [Name], [Protocol], [PrimaryConfiguration],
|
||||
SET IDENTITY_INSERT [DataConnections] OFF;
|
||||
|
||||
-- ExternalSystemDefinitions (1 rows)
|
||||
-- NOTE: [AuthConfiguration] is an encrypted secret column — dumped as NULL. Restore via the app (CLI/API) post-seed.
|
||||
SET IDENTITY_INSERT [ExternalSystemDefinitions] ON;
|
||||
INSERT INTO [ExternalSystemDefinitions] ([Id], [Name], [EndpointUrl], [AuthType], [AuthConfiguration], [MaxRetries], [RetryDelay]) VALUES (1, N'Test REST API', N'http://scadalink-restapi:5200', N'ApiKey', N'scadalink-test-key-1', 0, '00:00:00.000000');
|
||||
INSERT INTO [ExternalSystemDefinitions] ([Id], [Name], [EndpointUrl], [AuthType], [AuthConfiguration], [MaxRetries], [RetryDelay]) VALUES (1, N'Test REST API', N'http://scadalink-restapi:5200', N'ApiKey', NULL, 0, '00:00:00.000000');
|
||||
SET IDENTITY_INSERT [ExternalSystemDefinitions] OFF;
|
||||
|
||||
-- ExternalSystemMethods (1 rows)
|
||||
|
||||
@@ -114,6 +114,34 @@ docker exec -i scadalink-mssql /opt/mssql-tools18/bin/sqlcmd \
|
||||
-S localhost -U sa -P 'ScadaLink_Dev1#' -C -d ScadaLinkConfig -b < "$SEED_FILE"
|
||||
echo " Seed replayed."
|
||||
|
||||
echo ""
|
||||
echo "--- Stage 6d/6: restore encrypted secret config (CLI) ---"
|
||||
# Configuration that lives in encrypted secret columns cannot be replayed from
|
||||
# raw SQL: ASP.NET Data Protection ciphertext is non-deterministic and bound to
|
||||
# the source key ring. Create/restore it through the app so the EF value
|
||||
# converter encrypts against this cluster's key ring.
|
||||
CLI="dotnet run --project $PROJECT_ROOT/src/ScadaLink.CLI --"
|
||||
AUTH="--username multi-role --password password"
|
||||
|
||||
# ExternalSystemDefinitions Id 1 ("Test REST API") is inserted by the seed with
|
||||
# a fixed identity but a NULL AuthConfiguration; set the API key here.
|
||||
$CLI --url "$MGMT_URL" $AUTH external-system update \
|
||||
--id 1 \
|
||||
--name "Test REST API" \
|
||||
--endpoint-url "http://scadalink-restapi:5200" \
|
||||
--auth-type ApiKey \
|
||||
--auth-config "scadalink-test-key-1"
|
||||
echo " External-system auth config restored (encrypted)."
|
||||
|
||||
# The "Machine Data DB" database connection is referenced by name from the
|
||||
# seeded TestDatabaseQuery script. It is not in seed-config.sql (its
|
||||
# ConnectionString is an encrypted secret column); create it through the app.
|
||||
$CLI --url "$MGMT_URL" $AUTH db-connection create \
|
||||
--name "Machine Data DB" \
|
||||
--connection-string "Server=scadalink-mssql,1433;Database=ScadaLinkMachineData;User Id=scadalink_app;Password=ScadaLink_Dev1#;TrustServerCertificate=true" \
|
||||
|| echo " (Machine Data DB connection may already exist)"
|
||||
echo " Database connection created (encrypted)."
|
||||
|
||||
echo ""
|
||||
echo "=== Reseed complete ==="
|
||||
echo ""
|
||||
|
||||
@@ -13,6 +13,12 @@ Excluded by design (per-environment, not design-time): Sites (seeded via
|
||||
seed-sites.sh), Instances + InstanceConnectionBindings + InstanceOverrides,
|
||||
NotificationLists/Recipients, SmtpConfigurations, ApiKeys, Areas,
|
||||
SiteScopeRules, LdapGroupMappings, DataProtectionKeys, audit, deployment.
|
||||
|
||||
Encrypted secret columns (see ENCRYPTED_COLUMNS) are emitted as NULL: they
|
||||
hold ASP.NET Data Protection ciphertext, which is non-deterministic and bound
|
||||
to the source key ring, so a raw SQL dump can never replay a valid value.
|
||||
Re-populate them through the application after the seed runs (infra/reseed.sh
|
||||
does this via the ScadaLink CLI).
|
||||
"""
|
||||
|
||||
import argparse
|
||||
@@ -45,6 +51,18 @@ INSERT_ORDER = [
|
||||
# the column list. All listed tables happen to use Id as their identity.
|
||||
IDENTITY_TABLES = set(INSERT_ORDER)
|
||||
|
||||
# (table, column) pairs encrypted at rest via ASP.NET Data Protection
|
||||
# (EncryptedStringConverter in ScadaLink.ConfigurationDatabase). Ciphertext is
|
||||
# non-deterministic and key-ring-bound, so it cannot be replayed from a static
|
||||
# SQL dump — the application would fail to decrypt it on read. These columns
|
||||
# are dumped as NULL; re-seed their values through the app (CLI / API) so the
|
||||
# value converter encrypts them against the target key ring.
|
||||
ENCRYPTED_COLUMNS = {
|
||||
("ExternalSystemDefinitions", "AuthConfiguration"),
|
||||
("SmtpConfigurations", "Credentials"),
|
||||
("DatabaseConnectionDefinitions", "ConnectionString"),
|
||||
}
|
||||
|
||||
# Templates has self-FK Templates.ParentTemplateId; emit a single batch that
|
||||
# inserts shallow rows first then deeper ones. pymssql returns rows in Id order
|
||||
# from our ORDER BY, which matches insertion order for this schema (parent Id
|
||||
@@ -175,6 +193,16 @@ def dump(args):
|
||||
rows = cursor.fetchall()
|
||||
|
||||
out.append("-- " + table + " (" + str(len(rows)) + " rows)")
|
||||
|
||||
# Columns encrypted at rest cannot be dumped verbatim; emit NULL and
|
||||
# note it so the secret value is restored through the app afterwards.
|
||||
nulled = [c for c in columns if (table, c) in ENCRYPTED_COLUMNS]
|
||||
for c in nulled:
|
||||
out.append(
|
||||
"-- NOTE: [{}] is an encrypted secret column — dumped as NULL. "
|
||||
"Restore via the app (CLI/API) post-seed.".format(c)
|
||||
)
|
||||
|
||||
if not rows:
|
||||
continue
|
||||
|
||||
@@ -183,7 +211,10 @@ def dump(args):
|
||||
if identity:
|
||||
out.append("SET IDENTITY_INSERT [{}] ON;".format(table))
|
||||
for row in rows:
|
||||
values = ", ".join(quote(v) for v in row)
|
||||
values = ", ".join(
|
||||
"NULL" if (table, c) in ENCRYPTED_COLUMNS else quote(v)
|
||||
for c, v in zip(columns, row)
|
||||
)
|
||||
out.append(
|
||||
"INSERT INTO [{}] ({}) VALUES ({});".format(table, col_list, values)
|
||||
)
|
||||
|
||||
@@ -0,0 +1,80 @@
|
||||
using System.Collections.Concurrent;
|
||||
using ScadaLink.AuditLog.Payload;
|
||||
|
||||
namespace ScadaLink.AuditLog.Central;
|
||||
|
||||
/// <summary>
/// Central singleton behind <see cref="IAuditCentralHealthSnapshot"/>
/// (Audit Log #23, M6 Bundle E — T8, T9). It holds two failure counters,
/// <c>CentralAuditWriteFailures</c> and <c>AuditRedactionFailure</c>, which
/// are read and bumped through <see cref="System.Threading.Interlocked"/>,
/// plus a per-site map of the latest stalled flag supplied by
/// <see cref="SiteAuditTelemetryStalledTracker"/>. The same object also
/// implements the writer-facing interfaces
/// (<see cref="ICentralAuditWriteFailureCounter"/> and
/// <see cref="IAuditRedactionFailureCounter"/>); the central composition root
/// is expected to bind both of them to this one singleton so there is a
/// single source of truth.
/// </summary>
/// <remarks>
/// <para>
/// <b>One type for reading and writing.</b> Each writer interface is a lone
/// <c>Increment()</c>, and the read side has to see those counts anyway.
/// Putting both on one class means the <c>Interlocked</c>-managed field is
/// itself the snapshot value, with nothing to synchronise in between. This
/// follows the
/// <see cref="ScadaLink.HealthMonitoring.SiteHealthCollector"/> approach,
/// where the collector both receives and exposes a metric.
/// </para>
/// <para>
/// <b>Where the stalled flags live.</b> The per-site flags are stored on this
/// object. <see cref="SiteAuditTelemetryStalledTracker"/> subscribes to the
/// EventStream and hands each
/// <see cref="SiteAuditTelemetryStalledChanged"/> publication to
/// <see cref="ApplyStalled"/>. Owning the dictionary here, instead of asking
/// the tracker on each read, allows the snapshot to be created without an
/// <see cref="Akka.Actor.ActorSystem"/>; the tracker is attached afterwards
/// by the Akka bootstrap once the system exists.
/// </para>
/// </remarks>
public sealed class AuditCentralHealthSnapshot
    : IAuditCentralHealthSnapshot,
      ICentralAuditWriteFailureCounter,
      IAuditRedactionFailureCounter
{
    // Latest stalled flag per site id; written by ApplyStalled, copied out on read.
    private readonly ConcurrentDictionary<string, bool> _stalledBySite = new();

    // Running totals; only ever touched through Interlocked.
    private int _writeFailureCount;
    private int _redactionFailureCount;

    /// <inheritdoc/>
    public int CentralAuditWriteFailures
    {
        get
        {
            // CompareExchange(0, 0) never changes the value; it is used purely
            // as an atomic read of the counter.
            return Interlocked.CompareExchange(ref _writeFailureCount, 0, 0);
        }
    }

    /// <inheritdoc/>
    public int AuditRedactionFailure
    {
        get
        {
            return Interlocked.CompareExchange(ref _redactionFailureCount, 0, 0);
        }
    }

    /// <inheritdoc/>
    public IReadOnlyDictionary<string, bool> SiteAuditTelemetryStalled
    {
        get
        {
            // Hand back a detached copy so callers never observe later updates.
            var copy = new Dictionary<string, bool>(_stalledBySite);
            return copy;
        }
    }

    /// <summary>
    /// Records the stalled flag carried by a
    /// <see cref="SiteAuditTelemetryStalledChanged"/> publication for its
    /// site, replacing any earlier value for that site. A <c>null</c>
    /// argument is ignored. The method is public so that
    /// <see cref="SiteAuditTelemetryStalledTracker"/> — same assembly, but
    /// created later by the Akka host — can push updates without a friend
    /// reference. Consumers should read
    /// <see cref="SiteAuditTelemetryStalled"/> instead of calling this.
    /// </summary>
    /// <param name="evt">The stalled-state change to apply; may be <c>null</c>.</param>
    public void ApplyStalled(SiteAuditTelemetryStalledChanged evt)
    {
        if (evt is not null)
        {
            _stalledBySite[evt.SiteId] = evt.Stalled;
        }
    }

    /// <inheritdoc/>
    void ICentralAuditWriteFailureCounter.Increment()
    {
        Interlocked.Increment(ref _writeFailureCount);
    }

    /// <inheritdoc/>
    void IAuditRedactionFailureCounter.Increment()
    {
        Interlocked.Increment(ref _redactionFailureCount);
    }
}
|
||||
@@ -0,0 +1,295 @@
|
||||
using Akka.Actor;
|
||||
using Microsoft.Extensions.DependencyInjection;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using ScadaLink.AuditLog.Payload;
|
||||
using ScadaLink.Commons.Entities.Audit;
|
||||
using ScadaLink.Commons.Interfaces.Repositories;
|
||||
using ScadaLink.Commons.Messages.Audit;
|
||||
using ScadaLink.ConfigurationDatabase;
|
||||
|
||||
namespace ScadaLink.AuditLog.Central;
|
||||
|
||||
/// <summary>
|
||||
/// Central-side singleton (per Bundle E wiring) that ingests batches of
|
||||
/// <see cref="AuditEvent"/> rows pushed from sites via the
|
||||
/// <c>IngestAuditEvents</c> gRPC RPC. Each row is stamped with the central-side
|
||||
/// <see cref="AuditEvent.IngestedAtUtc"/> and inserted idempotently via
|
||||
/// <see cref="IAuditLogRepository.InsertIfNotExistsAsync"/> — duplicates are
|
||||
/// silently swallowed (first-write-wins per Bundle A's hardening).
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// <para>
|
||||
/// Idempotency is the contract: a row that already exists at central counts
|
||||
/// as "accepted" for the purposes of the reply, because the storage state is
|
||||
/// consistent and the site is free to flip its local row to <c>Forwarded</c>.
|
||||
/// </para>
|
||||
/// <para>
|
||||
/// Per Bundle D's brief, audit-write failures must NEVER abort the user-facing
|
||||
/// action. The actor wraps each repository call in its own try/catch so a
|
||||
/// single bad row cannot cause the rest of the batch to be lost; the actor's
|
||||
/// <see cref="SupervisorStrategy"/> uses <c>Resume</c> so a thrown exception
|
||||
/// inside <c>ReceiveAsync</c> does not restart the actor (which would also
|
||||
/// reset any in-flight state).
|
||||
/// </para>
|
||||
/// <para>
|
||||
/// Two constructors exist for a deliberate reason: Bundle D's tests inject a
|
||||
/// concrete <see cref="IAuditLogRepository"/> against a per-test MSSQL fixture
|
||||
/// (the only way to verify the IngestedAtUtc stamp + duplicate-key idempotency
|
||||
/// end to end), while Bundle E's host wiring registers the actor as a cluster
|
||||
/// singleton and must therefore resolve the repository — which is a scoped EF
|
||||
/// Core service — from a fresh DI scope per message. This mirrors the Notification
|
||||
/// Outbox actor's pattern.
|
||||
/// </para>
|
||||
/// </remarks>
|
||||
public class AuditLogIngestActor : ReceiveActor
|
||||
{
|
||||
private readonly IServiceProvider? _serviceProvider;
|
||||
private readonly IAuditLogRepository? _injectedRepository;
|
||||
private readonly ILogger<AuditLogIngestActor> _logger;
|
||||
|
||||
/// <summary>
/// Test-mode constructor. Accepts an already-constructed repository, intended
/// to outlive the test so the same instance can serve every message. Used by
/// Bundle D's MSSQL-backed TestKit fixture.
/// </summary>
/// <param name="repository">Repository instance kept for the actor's lifetime.</param>
/// <param name="logger">Logger for this actor.</param>
/// <exception cref="ArgumentNullException">
/// <paramref name="repository"/> or <paramref name="logger"/> is <c>null</c>.
/// </exception>
public AuditLogIngestActor(
    IAuditLogRepository repository,
    ILogger<AuditLogIngestActor> logger)
{
    _injectedRepository = repository ?? throw new ArgumentNullException(nameof(repository));
    _logger = logger ?? throw new ArgumentNullException(nameof(logger));

    ReceiveAsync<IngestAuditEventsCommand>(OnIngestAsync);

    // This constructor has only the one repository — no SiteCalls repo and
    // no DbContext — so it cannot perform the M3 dual-write. A handler is
    // still registered for the cached-telemetry command so senders do not
    // dead-letter; the "without dual-write" variant keeps the limits of
    // this constructor explicit.
    ReceiveAsync<IngestCachedTelemetryCommand>(OnCachedTelemetryWithoutDualWriteAsync);
}
|
||||
|
||||
/// <summary>
|
||||
/// Production constructor — resolves <see cref="IAuditLogRepository"/> from
|
||||
/// a fresh DI scope per message because the repository is a scoped EF Core
|
||||
/// service registered by <c>AddConfigurationDatabase</c>. The actor itself
|
||||
/// is a long-lived cluster singleton, so it cannot hold a scope across
|
||||
/// messages.
|
||||
/// </summary>
|
||||
public AuditLogIngestActor(
|
||||
IServiceProvider serviceProvider,
|
||||
ILogger<AuditLogIngestActor> logger)
|
||||
{
|
||||
ArgumentNullException.ThrowIfNull(serviceProvider);
|
||||
ArgumentNullException.ThrowIfNull(logger);
|
||||
|
||||
_serviceProvider = serviceProvider;
|
||||
_logger = logger;
|
||||
|
||||
ReceiveAsync<IngestAuditEventsCommand>(OnIngestAsync);
|
||||
ReceiveAsync<IngestCachedTelemetryCommand>(OnCachedTelemetryAsync);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Audit-write failures are best-effort by design (see alog.md §13): a
|
||||
/// thrown exception in the ingest pipeline must not crash the actor.
|
||||
/// Resume keeps the actor's state intact so the next batch is processed
|
||||
/// against the same repository instance.
|
||||
/// </summary>
|
||||
protected override SupervisorStrategy SupervisorStrategy()
|
||||
{
|
||||
return new OneForOneStrategy(maxNrOfRetries: 0, withinTimeRange: TimeSpan.Zero, decider:
|
||||
Akka.Actor.SupervisorStrategy.DefaultDecider);
|
||||
}
|
||||
|
||||
private async Task OnIngestAsync(IngestAuditEventsCommand cmd)
|
||||
{
|
||||
// Sender is captured before the first await — Akka resets Sender
|
||||
// between message dispatches, so a post-await Tell would go to
|
||||
// DeadLetters.
|
||||
var replyTo = Sender;
|
||||
var nowUtc = DateTime.UtcNow;
|
||||
var accepted = new List<Guid>(cmd.Events.Count);
|
||||
|
||||
// Resolve the repository for the whole batch — one DbContext per
|
||||
// message, mirroring NotificationOutboxActor. The injected-repository
|
||||
// mode (Bundle D tests) skips the scope entirely.
|
||||
// Bundle C (M5-T6): the IAuditPayloadFilter is also resolved from the
|
||||
// per-message scope when one is available so the row is truncated +
|
||||
// redacted before InsertIfNotExistsAsync. The single-repository test
|
||||
// ctor has no service provider — it falls through with no filter,
|
||||
// which preserves the small-payload assumptions baked into the
|
||||
// existing D2 fixtures.
|
||||
IServiceScope? scope = null;
|
||||
IAuditLogRepository repository;
|
||||
IAuditPayloadFilter? filter = null;
|
||||
ICentralAuditWriteFailureCounter? failureCounter = null;
|
||||
if (_injectedRepository is not null)
|
||||
{
|
||||
repository = _injectedRepository;
|
||||
}
|
||||
else
|
||||
{
|
||||
scope = _serviceProvider!.CreateScope();
|
||||
repository = scope.ServiceProvider.GetRequiredService<IAuditLogRepository>();
|
||||
filter = scope.ServiceProvider.GetService<IAuditPayloadFilter>();
|
||||
// M6 Bundle E (T8): central health counter is best-effort —
|
||||
// unregistered (test composition roots) means the per-row catch
|
||||
// simply logs without surfacing on the health dashboard.
|
||||
failureCounter = scope.ServiceProvider.GetService<ICentralAuditWriteFailureCounter>();
|
||||
}
|
||||
|
||||
try
|
||||
{
|
||||
foreach (var evt in cmd.Events)
|
||||
{
|
||||
try
|
||||
{
|
||||
// Stamp IngestedAtUtc here, not at the site. Bundle A's
|
||||
// repository hardening already swallows duplicate-key races,
|
||||
// so the same id arriving twice (site retry, reconciliation)
|
||||
// is a silent no-op.
|
||||
// Filter BEFORE the IngestedAtUtc stamp so the redacted
|
||||
// copy carries the central-side ingest timestamp. Filter
|
||||
// is contract-bound to never throw; null = pass-through.
|
||||
var filtered = filter?.Apply(evt) ?? evt;
|
||||
var ingested = filtered with { IngestedAtUtc = nowUtc };
|
||||
await repository.InsertIfNotExistsAsync(ingested).ConfigureAwait(false);
|
||||
accepted.Add(evt.EventId);
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
// Per-row catch — one bad row never sinks the whole batch.
|
||||
// The row stays Pending at the site; the next drain retries.
|
||||
// M6 Bundle E (T8): bump the central health counter so a
|
||||
// sustained insert-throw failure surfaces on the dashboard.
|
||||
try { failureCounter?.Increment(); }
|
||||
catch { /* counter must never throw — defence in depth */ }
|
||||
_logger.LogError(ex,
|
||||
"Failed to persist audit event {EventId} during batch ingest; row will be retried by the site.",
|
||||
evt.EventId);
|
||||
}
|
||||
}
|
||||
}
|
||||
finally
|
||||
{
|
||||
scope?.Dispose();
|
||||
}
|
||||
|
||||
replyTo.Tell(new IngestAuditEventsReply(accepted));
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// M3 dual-write handler. For every <see cref="CachedTelemetryEntry"/> the
|
||||
/// actor opens a fresh MS SQL transaction, inserts the AuditLog row
|
||||
/// idempotently AND upserts the SiteCalls row monotonically. Both succeed
|
||||
/// or both roll back, so the audit and operational mirrors never drift
|
||||
/// mid-row. The IngestedAtUtc stamp is unified between the two rows so a
|
||||
/// downstream join lines up cleanly.
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// Per-entry isolation — one entry's failed transaction does NOT abort
|
||||
/// other entries in the batch (each gets its own
|
||||
/// <see cref="Microsoft.EntityFrameworkCore.RelationalDatabaseFacadeExtensions.BeginTransactionAsync"/>
|
||||
/// scope and a try/catch around it). Audit-write failure NEVER aborts the
|
||||
/// user-facing action — the site keeps the row Pending and retries on the
|
||||
/// next drain.
|
||||
/// </remarks>
|
||||
private async Task OnCachedTelemetryAsync(IngestCachedTelemetryCommand cmd)
|
||||
{
|
||||
var replyTo = Sender;
|
||||
var accepted = new List<Guid>(cmd.Entries.Count);
|
||||
|
||||
try
|
||||
{
|
||||
await using var scope = _serviceProvider!.CreateAsyncScope();
|
||||
var auditRepo = scope.ServiceProvider.GetRequiredService<IAuditLogRepository>();
|
||||
var siteCallRepo = scope.ServiceProvider.GetRequiredService<ISiteCallAuditRepository>();
|
||||
var dbContext = scope.ServiceProvider.GetRequiredService<ScadaLinkDbContext>();
|
||||
// Bundle C (M5-T6): resolve the filter for the whole batch from
|
||||
// the scope; null = pass-through for test composition roots that
|
||||
// skip the filter registration. The filter is contract-bound to
|
||||
// never throw, so we can apply it inside the per-entry try
|
||||
// without risking an unbounded blast radius.
|
||||
var filter = scope.ServiceProvider.GetService<IAuditPayloadFilter>();
|
||||
// M6 Bundle E (T8): same best-effort central health counter as
|
||||
// the OnIngestAsync path — null on test composition roots that
|
||||
// skip the registration.
|
||||
var failureCounter = scope.ServiceProvider.GetService<ICentralAuditWriteFailureCounter>();
|
||||
|
||||
foreach (var entry in cmd.Entries)
|
||||
{
|
||||
try
|
||||
{
|
||||
await using var tx = await dbContext.Database
|
||||
.BeginTransactionAsync()
|
||||
.ConfigureAwait(false);
|
||||
|
||||
// Stamp IngestedAtUtc on both rows from a single
|
||||
// central-side instant so a join on the two tables sees
|
||||
// matching timestamps (debugging convenience, not a
|
||||
// correctness invariant).
|
||||
var ingestedAt = DateTime.UtcNow;
|
||||
// Filter the audit half BEFORE the dual-write — only the
|
||||
// AuditLog row's payload columns are filterable; SiteCalls
|
||||
// carries operational state only (status, retry count) and
|
||||
// is left untouched.
|
||||
var filteredAudit = filter?.Apply(entry.Audit) ?? entry.Audit;
|
||||
var auditStamped = filteredAudit with { IngestedAtUtc = ingestedAt };
|
||||
var siteCallStamped = entry.SiteCall with { IngestedAtUtc = ingestedAt };
|
||||
|
||||
await auditRepo.InsertIfNotExistsAsync(auditStamped)
|
||||
.ConfigureAwait(false);
|
||||
await siteCallRepo.UpsertAsync(siteCallStamped)
|
||||
.ConfigureAwait(false);
|
||||
|
||||
await tx.CommitAsync().ConfigureAwait(false);
|
||||
accepted.Add(entry.Audit.EventId);
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
// Both rows rolled back via the disposing transaction. The
|
||||
// EventId is NOT added to `accepted` so the site keeps its
|
||||
// row Pending and retries on the next drain. Other entries
|
||||
// in the batch continue with their own transactions.
|
||||
// M6 Bundle E (T8): bump the central health counter so a
|
||||
// sustained dual-write failure surfaces on the dashboard.
|
||||
try { failureCounter?.Increment(); }
|
||||
catch { /* counter must never throw — defence in depth */ }
|
||||
_logger.LogError(
|
||||
ex,
|
||||
"Combined telemetry dual-write failed for AuditEvent {EventId} / TrackedOperationId {TrackedOpId}; rolled back.",
|
||||
entry.Audit.EventId,
|
||||
entry.SiteCall.TrackedOperationId);
|
||||
}
|
||||
}
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
// Resolving the scope itself threw (e.g. DI mis-wiring). Log and
|
||||
// reply with whatever we managed to accept (likely empty) — the
|
||||
// central singleton MUST stay alive.
|
||||
_logger.LogError(
|
||||
ex,
|
||||
"Combined telemetry batch ingest failed before per-entry processing.");
|
||||
}
|
||||
|
||||
replyTo.Tell(new IngestCachedTelemetryReply(accepted));
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Fallback handler installed on the single-repository test ctor — that
|
||||
/// ctor has no DbContext and no <see cref="ISiteCallAuditRepository"/>, so
|
||||
/// it cannot service the dual-write. Logs a warning and replies with an
|
||||
/// empty ack so callers fall through to their retry path.
|
||||
/// </summary>
|
||||
private Task OnCachedTelemetryWithoutDualWriteAsync(IngestCachedTelemetryCommand cmd)
|
||||
{
|
||||
_logger.LogWarning(
|
||||
"AuditLogIngestActor received IngestCachedTelemetryCommand on the single-repository ctor; dual-write requires the IServiceProvider ctor. Replying with empty ack ({Count} entries).",
|
||||
cmd.Entries.Count);
|
||||
Sender.Tell(new IngestCachedTelemetryReply(Array.Empty<Guid>()));
|
||||
return Task.CompletedTask;
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,37 @@
|
||||
namespace ScadaLink.AuditLog.Central;
|
||||
|
||||
/// <summary>
/// Configuration for the central
/// <see cref="AuditLogPartitionMaintenanceService"/> hosted service (M6-T5).
/// Out of the box the service ticks every 24 hours and keeps one future
/// monthly boundary ahead of <see cref="DateTime.UtcNow"/>.
/// </summary>
/// <remarks>
/// <para>
/// On each tick the hosted service rolls <c>pf_AuditLog_Month</c> forward:
/// it reads the current max boundary and SPLITs new monthly boundaries until
/// at least <see cref="LookaheadMonths"/> future months are covered. One
/// month is a deliberately conservative default — less risks an end-of-month
/// race where inserts land in the unbounded tail partition, while more costs
/// nothing but commits to boundaries earlier than needed.
/// </para>
/// <para>
/// A 24-hour cadence is the cheapest interval that still guarantees at most
/// one missed boundary in steady state (even after a hard failover the
/// hosted service recovers on its very next tick). Going below an hour would
/// generate more metadata churn than it saves.
/// </para>
/// </remarks>
public sealed class AuditLogPartitionMaintenanceOptions
{
    // 24 h expressed in seconds; the default tick period.
    private const int SecondsPerDay = 86_400;

    // Default number of future months to keep covered.
    private const int DefaultLookaheadMonths = 1;

    /// <summary>Maintenance tick period, in seconds. Defaults to 86 400 (24 h).</summary>
    public int IntervalSeconds { get; set; } = SecondsPerDay;

    /// <summary>
    /// How many future months <c>pf_AuditLog_Month</c> must cover once a tick
    /// has run. Defaults to 1 — e.g. in mid-May the partition for June (the
    /// next full month) must already exist.
    /// </summary>
    public int LookaheadMonths { get; set; } = DefaultLookaheadMonths;
}
|
||||
@@ -0,0 +1,145 @@
|
||||
using Microsoft.Extensions.DependencyInjection;
|
||||
using Microsoft.Extensions.Hosting;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using Microsoft.Extensions.Options;
|
||||
using ScadaLink.Commons.Interfaces;
|
||||
|
||||
namespace ScadaLink.AuditLog.Central;
|
||||
|
||||
/// <summary>
/// Central <see cref="IHostedService"/> (M6-T5, Bundle D) that rolls
/// <c>pf_AuditLog_Month</c> forward on a timer. Each tick opens a fresh DI
/// scope, resolves <see cref="IPartitionMaintenance"/>, and calls
/// <see cref="IPartitionMaintenance.EnsureLookaheadAsync"/> with
/// <see cref="AuditLogPartitionMaintenanceOptions.LookaheadMonths"/>; any
/// boundaries it reports as added are logged.
/// </summary>
/// <remarks>
/// <para>
/// <b>Why a hosted service, not an actor.</b> Roll-forward is a periodic
/// chore with no cross-actor coordination, so the simple hosted-service
/// pattern is used: <c>Task.Run</c> on start, <c>Task.Delay</c> between
/// ticks, cancellation on stop.
/// </para>
/// <para>
/// <b>Failure containment.</b> The tick body is wrapped in a try/catch, so a
/// failing tick is logged at <c>Error</c> level and the next tick retries.
/// Cancellation caused by shutdown is not reported as a failure.
/// </para>
/// <para>
/// <b>Startup ordering.</b> The first tick runs immediately after
/// <see cref="StartAsync"/>, without waiting
/// <see cref="AuditLogPartitionMaintenanceOptions.IntervalSeconds"/>.
/// </para>
/// <para>
/// <b>DI scope per tick.</b> <see cref="IPartitionMaintenance"/> is resolved
/// from a scope that is created and disposed around each tick, because the
/// hosted service outlives any single scope.
/// </para>
/// </remarks>
public sealed class AuditLogPartitionMaintenanceService : IHostedService, IDisposable
{
    private readonly IServiceScopeFactory _scopeFactory;
    private readonly IOptions<AuditLogPartitionMaintenanceOptions> _options;
    private readonly ILogger<AuditLogPartitionMaintenanceService> _logger;

    // Created in StartAsync; cancelling it stops the loop.
    private CancellationTokenSource? _cts;

    // The background loop task; null until StartAsync has run.
    private Task? _loop;

    /// <summary>Creates the service.</summary>
    /// <param name="scopeFactory">Factory for the per-tick DI scope.</param>
    /// <param name="options">Interval and lookahead settings, read on every tick.</param>
    /// <param name="logger">Logger for added boundaries and tick failures.</param>
    /// <exception cref="ArgumentNullException">Any argument is null.</exception>
    public AuditLogPartitionMaintenanceService(
        IServiceScopeFactory scopeFactory,
        IOptions<AuditLogPartitionMaintenanceOptions> options,
        ILogger<AuditLogPartitionMaintenanceService> logger)
    {
        _scopeFactory = scopeFactory ?? throw new ArgumentNullException(nameof(scopeFactory));
        _options = options ?? throw new ArgumentNullException(nameof(options));
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
    }

    /// <inheritdoc />
    public Task StartAsync(CancellationToken ct)
    {
        // Linked CTS: either StopAsync's Cancel() or the token handed to
        // StartAsync terminates the loop and aborts a pending Task.Delay.
        _cts = CancellationTokenSource.CreateLinkedTokenSource(ct);

        // Capture the token now rather than reading the mutable _cts field
        // from inside the thread-pool lambda.
        var loopToken = _cts.Token;
        _loop = Task.Run(() => RunLoopAsync(loopToken));
        return Task.CompletedTask;
    }

    /// <summary>
    /// Runs one tick immediately, then one tick after every interval until
    /// <paramref name="ct"/> is cancelled.
    /// </summary>
    private async Task RunLoopAsync(CancellationToken ct)
    {
        // Run once on startup so a fresh deployment isn't gated on the
        // IntervalSeconds initial wait.
        await SafeMaintainAsync(ct).ConfigureAwait(false);

        while (!ct.IsCancellationRequested)
        {
            // Clamp to at least one second: a negative value would make
            // Task.Delay throw out of this loop (only cancellation is caught
            // below), and zero would spin without pause.
            var delaySeconds = Math.Max(1, _options.Value.IntervalSeconds);

            try
            {
                await Task.Delay(TimeSpan.FromSeconds(delaySeconds), ct)
                    .ConfigureAwait(false);
            }
            catch (OperationCanceledException)
            {
                break;
            }

            await SafeMaintainAsync(ct).ConfigureAwait(false);
        }
    }

    /// <summary>
    /// Executes a single maintenance tick inside its own DI scope. Never
    /// throws: failures are logged, shutdown cancellation is swallowed.
    /// </summary>
    private async Task SafeMaintainAsync(CancellationToken ct)
    {
        try
        {
            await using var scope = _scopeFactory.CreateAsyncScope();
            var maintenance = scope.ServiceProvider.GetRequiredService<IPartitionMaintenance>();
            var added = await maintenance
                .EnsureLookaheadAsync(_options.Value.LookaheadMonths, ct)
                .ConfigureAwait(false);
            if (added.Count > 0)
            {
                _logger.LogInformation(
                    "AuditLogPartitionMaintenance added {Count} boundaries: {Boundaries}",
                    added.Count,
                    string.Join(", ", added.Select(b => b.ToString("yyyy-MM-dd"))));
            }
        }
        catch (OperationCanceledException) when (ct.IsCancellationRequested)
        {
            // Shutdown interrupted the tick; that is not a tick failure, so
            // it is not logged as an error. The loop exits on its next check.
        }
        catch (Exception ex)
        {
            // Catch-all is deliberate: the hosted service must survive every
            // class of tick failure (transient SQL, DI resolution, etc.) so
            // the next tick gets a chance — exception logged, not propagated.
            _logger.LogError(ex, "AuditLogPartitionMaintenance tick failed");
        }
    }

    /// <inheritdoc />
    public async Task StopAsync(CancellationToken ct)
    {
        if (_loop is null)
        {
            // StartAsync never ran; nothing to stop.
            return;
        }

        _cts?.Cancel();

        // Wait for the loop to finish, but give up when the host's shutdown
        // token fires so a tick that ignores cancellation cannot block
        // shutdown indefinitely.
        await Task.WhenAny(_loop, Task.Delay(Timeout.Infinite, ct)).ConfigureAwait(false);
    }

    /// <inheritdoc />
    public void Dispose()
    {
        _cts?.Dispose();
    }
}
|
||||
@@ -0,0 +1,214 @@
|
||||
using System.Diagnostics;
|
||||
using Akka.Actor;
|
||||
using Microsoft.Extensions.DependencyInjection;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using Microsoft.Extensions.Options;
|
||||
using ScadaLink.AuditLog.Configuration;
|
||||
using ScadaLink.Commons.Interfaces.Repositories;
|
||||
|
||||
namespace ScadaLink.AuditLog.Central;
|
||||
|
||||
/// <summary>
/// Central singleton (M6 Bundle C) that drives the periodic AuditLog
/// partition purge. On a configurable timer (default 24 hours) the actor:
/// <list type="number">
///   <item>Calls <see cref="IAuditLogRepository.GetPartitionBoundariesOlderThanAsync"/>
///   with the threshold <c>DateTime.UtcNow - RetentionDays</c>.</item>
///   <item>For each returned boundary, calls
///   <see cref="IAuditLogRepository.SwitchOutPartitionAsync"/>.</item>
///   <item>Publishes <see cref="AuditLogPurgedEvent"/> on the actor-system
///   EventStream after each successful switch-out, so consumers can
///   subscribe without coupling to this actor.</item>
/// </list>
/// </summary>
/// <remarks>
/// <para>
/// <b>Continue-on-error.</b> Each boundary is processed inside its own
/// try/catch, so one failing boundary is logged and the remaining boundaries
/// of the same tick are still attempted.
/// </para>
/// <para>
/// <b>DI scopes.</b> One DI scope is opened per tick and the repository
/// resolved from it is reused for every boundary in that tick.
/// </para>
/// <para>
/// <b>Configuration.</b> Both option objects are read once in the
/// constructor (<c>IOptions&lt;T&gt;.Value</c>), so a changed interval or
/// retention only takes effect on a newly created actor instance.
/// </para>
/// </remarks>
public class AuditLogPurgeActor : ReceiveActor
{
    private readonly IServiceProvider _services;
    private readonly AuditLogPurgeOptions _purgeOptions;
    private readonly AuditLogOptions _auditOptions;
    private readonly ILogger<AuditLogPurgeActor> _logger;

    // Handle of the repeating self-tick; cancelled in PostStop.
    private ICancelable? _timer;

    /// <summary>Creates the actor and registers the tick handler.</summary>
    /// <param name="services">Root provider used to open one DI scope per tick.</param>
    /// <param name="purgeOptions">Tick cadence; its value is captured here.</param>
    /// <param name="auditOptions">Source of <c>RetentionDays</c>; its value is captured here.</param>
    /// <param name="logger">Logger for purge results and failures.</param>
    /// <exception cref="ArgumentNullException">Any argument is null.</exception>
    public AuditLogPurgeActor(
        IServiceProvider services,
        IOptions<AuditLogPurgeOptions> purgeOptions,
        IOptions<AuditLogOptions> auditOptions,
        ILogger<AuditLogPurgeActor> logger)
    {
        ArgumentNullException.ThrowIfNull(services);
        ArgumentNullException.ThrowIfNull(purgeOptions);
        ArgumentNullException.ThrowIfNull(auditOptions);
        ArgumentNullException.ThrowIfNull(logger);

        _services = services;
        _purgeOptions = purgeOptions.Value;
        _auditOptions = auditOptions.Value;
        _logger = logger;

        ReceiveAsync<PurgeTick>(_ => OnTickAsync());
    }

    /// <summary>
    /// Schedules the repeating <see cref="PurgeTick"/> to self. The first
    /// tick fires after one full interval, not immediately.
    /// </summary>
    protected override void PreStart()
    {
        base.PreStart();
        var interval = _purgeOptions.Interval;
        _timer = Context.System.Scheduler.ScheduleTellRepeatedlyCancelable(
            initialDelay: interval,
            interval: interval,
            receiver: Self,
            message: PurgeTick.Instance,
            sender: Self);
    }

    /// <summary>Cancels the repeating tick so no further ticks are scheduled.</summary>
    protected override void PostStop()
    {
        _timer?.Cancel();
        base.PostStop();
    }

    /// <summary>
    /// Returns a one-for-one strategy whose decider always answers
    /// <c>Resume</c>, matching the documented intent. The previous
    /// <c>DefaultDecider</c> restarts on ordinary exceptions, which
    /// contradicted that intent.
    /// </summary>
    /// <remarks>
    /// In Akka this override governs the children of this actor. Failures of
    /// this actor itself are decided by its parent's strategy, which is why
    /// <see cref="OnTickAsync"/> contains its own exceptions.
    /// </remarks>
    protected override SupervisorStrategy SupervisorStrategy()
    {
        return new OneForOneStrategy(
            maxNrOfRetries: 0,
            withinTimeRange: TimeSpan.Zero,
            decider: Akka.Actor.Decider.From(Akka.Actor.Directive.Resume));
    }

    /// <summary>
    /// Runs one purge pass: enumerates the boundaries older than the
    /// retention threshold and switches each one out, publishing an
    /// <see cref="AuditLogPurgedEvent"/> per success. Every failure is logged
    /// and contained.
    /// </summary>
    private async Task OnTickAsync()
    {
        // Capture EventStream BEFORE the first await. Accessing Context (and
        // therefore Context.System) after an await is unsafe because the
        // continuation may run on a thread that isn't currently dispatching
        // this actor.
        var eventStream = Context.System.EventStream;

        // RetentionDays comes from the options value captured in the
        // constructor. A non-positive value would put the threshold at or
        // past "now" and make every partition eligible, so treat it as a
        // misconfiguration and purge nothing.
        var retentionDays = _auditOptions.RetentionDays;
        if (retentionDays <= 0)
        {
            _logger.LogWarning(
                "AuditLog purge tick skipped: RetentionDays {RetentionDays} is not positive.",
                retentionDays);
            return;
        }

        var threshold = DateTime.UtcNow - TimeSpan.FromDays(retentionDays);

        IServiceScope? scope = null;
        IAuditLogRepository repository;
        try
        {
            scope = _services.CreateScope();
            repository = scope.ServiceProvider.GetRequiredService<IAuditLogRepository>();
        }
        catch (Exception ex)
        {
            _logger.LogError(ex, "Failed to resolve IAuditLogRepository for AuditLog purge tick.");
            // The scope may exist even though resolution failed.
            scope?.Dispose();
            return;
        }

        try
        {
            IReadOnlyList<DateTime> boundaries;
            try
            {
                boundaries = await repository
                    .GetPartitionBoundariesOlderThanAsync(threshold)
                    .ConfigureAwait(false);
            }
            catch (Exception ex)
            {
                _logger.LogError(
                    ex,
                    "Failed to enumerate eligible AuditLog partition boundaries (threshold {ThresholdUtc:o}); skipping purge tick.",
                    threshold);
                return;
            }

            if (boundaries.Count == 0)
            {
                return;
            }

            foreach (var boundary in boundaries)
            {
                // Per-boundary try/catch: one bad partition does NOT abandon
                // the rest of the tick.
                var sw = Stopwatch.StartNew();
                try
                {
                    var rowsDeleted = await repository
                        .SwitchOutPartitionAsync(boundary)
                        .ConfigureAwait(false);
                    sw.Stop();

                    eventStream.Publish(
                        new AuditLogPurgedEvent(boundary, rowsDeleted, sw.ElapsedMilliseconds));

                    _logger.LogInformation(
                        "Purged AuditLog partition {MonthBoundary:yyyy-MM-dd}; {RowsDeleted} rows in {DurationMs} ms.",
                        boundary,
                        rowsDeleted,
                        sw.ElapsedMilliseconds);
                }
                catch (Exception ex)
                {
                    sw.Stop();
                    _logger.LogError(
                        ex,
                        "Failed to purge AuditLog partition {MonthBoundary:yyyy-MM-dd}; other partitions continue. Elapsed {DurationMs} ms.",
                        boundary,
                        sw.ElapsedMilliseconds);
                }
            }
        }
        finally
        {
            scope.Dispose();
        }
    }

    /// <summary>Self-tick triggering a purge pass across all eligible partitions.</summary>
    internal sealed class PurgeTick
    {
        /// <summary>The single shared instance; the message carries no data.</summary>
        public static readonly PurgeTick Instance = new();

        private PurgeTick() { }
    }
}
|
||||
@@ -0,0 +1,43 @@
|
||||
namespace ScadaLink.AuditLog.Central;
|
||||
|
||||
/// <summary>
/// Configuration for the central <see cref="AuditLogPurgeActor"/> singleton.
/// The tick cadence defaults to 24 hours per the M6 plan. The retention
/// window is not configured here — it comes from
/// <see cref="ScadaLink.AuditLog.Configuration.AuditLogOptions.RetentionDays"/>
/// (default 365), so operators tune retention from a single section.
/// </summary>
/// <remarks>
/// <para>
/// Purging runs on a daily cadence rather than in a hot loop: the partition
/// switch itself is metadata-only, but the drop-and-rebuild dance briefly
/// removes the <c>UX_AuditLog_EventId</c> unique index, so running more
/// often than necessary trades index-rebuild outages for marginal freshness
/// gains. Lower the interval only when sub-daily purge granularity is
/// demonstrably needed.
/// </para>
/// <para>
/// <see cref="IntervalOverride"/> lets tests drop the cadence to
/// milliseconds without polluting the production config surface; production
/// binds <see cref="IntervalHours"/> only.
/// </para>
/// </remarks>
public sealed class AuditLogPurgeOptions
{
    // Default purge cadence, in whole hours.
    private const int DefaultIntervalHours = 24;

    /// <summary>Purge tick period, in hours. Defaults to 24.</summary>
    public int IntervalHours { get; set; } = DefaultIntervalHours;

    /// <summary>
    /// Test-only cadence with finer than whole-hour resolution. A non-null
    /// value wins over <see cref="IntervalHours"/>. Not bound from config —
    /// production config exposes <see cref="IntervalHours"/> only.
    /// </summary>
    public TimeSpan? IntervalOverride { get; set; }

    /// <summary>
    /// The effective tick interval: <see cref="IntervalOverride"/> when it is
    /// set, otherwise <see cref="IntervalHours"/> converted to a
    /// <see cref="TimeSpan"/>.
    /// </summary>
    public TimeSpan Interval
    {
        get
        {
            if (IntervalOverride is { } forced)
            {
                return forced;
            }

            return TimeSpan.FromHours(IntervalHours);
        }
    }
}
|
||||
@@ -0,0 +1,29 @@
|
||||
namespace ScadaLink.AuditLog.Central;
|
||||
|
||||
/// <summary>
/// Message that <see cref="AuditLogPurgeActor"/> publishes on the
/// actor-system EventStream once a partition switch-out has succeeded.
/// Consumers (Bundle E central health collector, ops dashboards, audit
/// trails) subscribe to it, which keeps purge activity observable without
/// the actor knowing any specific subscriber.
/// </summary>
/// <param name="MonthBoundary">
/// The pf_AuditLog_Month lower-bound boundary that was switched out — the
/// first instant, in UTC, of the purged month.
/// </param>
/// <param name="RowsDeleted">
/// Approximate number of rows purged, sampled BEFORE the switch. An exact
/// figure would need a post-switch scan of the staging table, which the
/// dance drops immediately, so this is the closest observable proxy. Zero is
/// valid — e.g. when the enumerated partition was subsequently emptied by
/// hand.
/// </param>
/// <param name="DurationMs">
/// Wall-clock milliseconds spent inside <c>SwitchOutPartitionAsync</c> for
/// this boundary. Handy for spotting the rare slow purge without dedicated
/// telemetry.
/// </param>
public sealed record AuditLogPurgedEvent(
    DateTime MonthBoundary,
    long RowsDeleted,
    long DurationMs);
|
||||
@@ -0,0 +1,57 @@
|
||||
using ScadaLink.AuditLog.Payload;
|
||||
|
||||
namespace ScadaLink.AuditLog.Central;
|
||||
|
||||
/// <summary>
|
||||
/// Audit Log (#23) M6 Bundle E (T9) — bridges
|
||||
/// <see cref="IAuditRedactionFailureCounter"/> (incremented by
|
||||
/// <see cref="DefaultAuditPayloadFilter"/> every time a header / body / SQL
|
||||
/// parameter redactor stage throws and the filter has to over-redact the
|
||||
/// offending field) into <see cref="AuditCentralHealthSnapshot"/> so the
|
||||
/// failure surfaces on the central health surface as
|
||||
/// <c>AuditCentralHealthSnapshot.AuditRedactionFailure</c>.
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// <para>
|
||||
/// <b>Site vs central.</b> M5 Bundle C wired the SITE-side bridge
|
||||
/// (<see cref="ScadaLink.AuditLog.Site.HealthMetricsAuditRedactionFailureCounter"/>),
|
||||
/// which routes increments into the site health report payload's
|
||||
/// <c>AuditRedactionFailure</c> field. That handles redactor failures on the
|
||||
/// site SQLite hot-path (FallbackAuditWriter). M6 Bundle E (T9) adds the
|
||||
/// MIRROR bridge here so the same payload filter — when it runs on the
|
||||
/// central <see cref="CentralAuditWriter"/> /
|
||||
/// <see cref="AuditLogIngestActor"/> paths — surfaces its failures on the
|
||||
/// central dashboard rather than disappearing into a NoOp.
|
||||
/// </para>
|
||||
/// <para>
|
||||
/// <b>Registration shape.</b> Site composition roots call
|
||||
/// <see cref="ServiceCollectionExtensions.AddAuditLogHealthMetricsBridge"/>,
|
||||
/// which overrides the binding with the site bridge. Central composition
|
||||
/// roots call <see cref="ServiceCollectionExtensions.AddAuditLogCentralMaintenance"/>,
|
||||
/// which overrides with this central bridge. A node never wears both hats —
|
||||
/// site and central are distinct host roles — so the two bridges never
|
||||
/// fight over the same binding at runtime.
|
||||
/// </para>
|
||||
/// <para>
|
||||
/// <b>Why not a thin wrapper around the snapshot directly?</b> The snapshot
|
||||
/// itself <i>could</i> be the bound implementation (it already implements
|
||||
/// <see cref="IAuditRedactionFailureCounter"/>), but a dedicated class makes
|
||||
/// the central-vs-site asymmetry explicit at the DI boundary — readers of
|
||||
/// <see cref="ServiceCollectionExtensions.AddAuditLogCentralMaintenance"/>
|
||||
/// see "site → site bridge, central → central bridge", matching the
|
||||
/// <see cref="ScadaLink.AuditLog.Site.HealthMetricsAuditRedactionFailureCounter"/>
|
||||
/// shape one-for-one.
|
||||
/// </para>
|
||||
/// </remarks>
|
||||
public sealed class CentralAuditRedactionFailureCounter : IAuditRedactionFailureCounter
{
    // The central health snapshot that receives every redaction-failure increment.
    private readonly AuditCentralHealthSnapshot _snapshot;

    /// <summary>
    /// Creates the bridge over the supplied central health snapshot.
    /// </summary>
    /// <param name="snapshot">Snapshot that receives the increments; must not be null.</param>
    /// <exception cref="ArgumentNullException"><paramref name="snapshot"/> is null.</exception>
    public CentralAuditRedactionFailureCounter(AuditCentralHealthSnapshot snapshot)
    {
        ArgumentNullException.ThrowIfNull(snapshot);
        _snapshot = snapshot;
    }

    /// <inheritdoc/>
    public void Increment()
    {
        // Dispatch through the counter interface rather than through the
        // snapshot's own public surface.
        IAuditRedactionFailureCounter counter = _snapshot;
        counter.Increment();
    }
}
|
||||
@@ -0,0 +1,123 @@
|
||||
using Microsoft.Extensions.DependencyInjection;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using ScadaLink.AuditLog.Payload;
|
||||
using ScadaLink.Commons.Entities.Audit;
|
||||
using ScadaLink.Commons.Interfaces.Repositories;
|
||||
using ScadaLink.Commons.Interfaces.Services;
|
||||
|
||||
namespace ScadaLink.AuditLog.Central;
|
||||
|
||||
/// <summary>
|
||||
/// Central-only direct-write implementation of <see cref="ICentralAuditWriter"/>.
|
||||
/// Wraps <see cref="IAuditLogRepository.InsertIfNotExistsAsync"/> as a best-effort
|
||||
/// audit emission path for components that originate audit events ON the central
|
||||
/// node (Notification Outbox dispatch, Inbound API) — NOT for site telemetry
|
||||
/// ingest (that path is the SiteAudit → AuditLogIngestActor batched flow).
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// <para>
|
||||
/// <b>Best-effort contract.</b> Audit-write failures NEVER abort the user-facing
|
||||
/// action (alog.md §13). The writer catches every exception thrown by repository
|
||||
/// resolution or the insert call, logs at warning, and returns successfully.
|
||||
/// Callers may still wrap the call in their own try/catch (defensive — the writer
|
||||
/// is supposed to swallow).
|
||||
/// </para>
|
||||
/// <para>
|
||||
/// <b>Scope-per-call resolution.</b> <see cref="IAuditLogRepository"/> is a SCOPED
|
||||
/// EF Core service (registered by <c>ScadaLink.ConfigurationDatabase</c>). The
|
||||
/// writer itself is registered as a singleton (so all callers share one instance),
|
||||
/// so it cannot hold a scope across calls — it opens a fresh
|
||||
/// <see cref="IServiceScope"/> per <see cref="WriteAsync"/> invocation, mirroring
|
||||
/// the per-message scope pattern used by <c>AuditLogIngestActor</c> and
|
||||
/// <c>NotificationOutboxActor</c>.
|
||||
/// </para>
|
||||
/// <para>
|
||||
/// <b>Idempotency.</b> Persistence is via <c>InsertIfNotExistsAsync</c>, so a
|
||||
/// double-emitted event (same <see cref="AuditEvent.EventId"/>) is a silent
|
||||
/// no-op — the writer is safe to call from any number of dispatch paths.
|
||||
/// </para>
|
||||
/// </remarks>
|
||||
public sealed class CentralAuditWriter : ICentralAuditWriter
{
    private readonly IServiceProvider _services;
    private readonly ILogger<CentralAuditWriter> _logger;
    private readonly IAuditPayloadFilter? _filter;
    private readonly ICentralAuditWriteFailureCounter _failureCounter;

    /// <summary>
    /// Creates the writer.
    /// </summary>
    /// <param name="services">
    /// Root provider used to open a fresh DI scope for every write.
    /// </param>
    /// <param name="logger">Receives the warning emitted for each swallowed failure.</param>
    /// <param name="filter">
    /// Optional payload filter applied to the event before it is stamped and
    /// persisted. When omitted the event is written unfiltered.
    /// </param>
    /// <param name="failureCounter">
    /// Optional sink bumped once per swallowed failure. When omitted a
    /// <see cref="NoOpCentralAuditWriteFailureCounter"/> is used.
    /// </param>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="services"/> or <paramref name="logger"/> is null.
    /// </exception>
    public CentralAuditWriter(
        IServiceProvider services,
        ILogger<CentralAuditWriter> logger,
        IAuditPayloadFilter? filter = null,
        ICentralAuditWriteFailureCounter? failureCounter = null)
    {
        ArgumentNullException.ThrowIfNull(services);
        ArgumentNullException.ThrowIfNull(logger);

        _services = services;
        _logger = logger;
        _filter = filter;
        _failureCounter = failureCounter ?? new NoOpCentralAuditWriteFailureCounter();
    }

    /// <summary>
    /// Best-effort write of <paramref name="evt"/> to the central audit
    /// repository via <c>InsertIfNotExistsAsync</c>, with
    /// <see cref="AuditEvent.IngestedAtUtc"/> set from <see cref="DateTime.UtcNow"/>.
    /// Any exception raised while filtering, resolving the repository or
    /// inserting is counted, logged at warning level and swallowed.
    /// </summary>
    /// <param name="evt">Event to persist. A null event is logged and ignored.</param>
    /// <param name="ct">Token forwarded to the repository insert.</param>
    public async Task WriteAsync(AuditEvent evt, CancellationToken ct = default)
    {
        if (evt is null)
        {
            // A null event carries nothing to persist; report it and bail out.
            _logger.LogWarning("CentralAuditWriter.WriteAsync received null event; ignoring.");
            return;
        }

        try
        {
            await StoreAsync(evt, ct).ConfigureAwait(false);
        }
        catch (Exception ex)
        {
            // Never let an audit failure reach the caller: count it, log it, move on.
            RecordFailure();
            _logger.LogWarning(
                ex,
                "CentralAuditWriter failed for EventId {EventId} (Kind={Kind}, Status={Status})",
                evt.EventId, evt.Kind, evt.Status);
        }
    }

    // Filters, stamps and inserts one event inside its own DI scope.
    private async Task StoreAsync(AuditEvent evt, CancellationToken ct)
    {
        // Filter first; with no filter configured the event passes through as-is.
        var filtered = _filter?.Apply(evt) ?? evt;

        // The repository is resolved from a scope opened for this call only.
        await using var scope = _services.CreateAsyncScope();
        var repo = scope.ServiceProvider.GetRequiredService<IAuditLogRepository>();

        var stamped = filtered with { IngestedAtUtc = DateTime.UtcNow };
        await repo.InsertIfNotExistsAsync(stamped, ct).ConfigureAwait(false);
    }

    // Bumps the failure counter, shielding the caller from a misbehaving counter.
    private void RecordFailure()
    {
        try
        {
            _failureCounter.Increment();
        }
        catch
        {
            // Deliberately ignored: a throwing counter must not break the
            // best-effort contract of WriteAsync.
        }
    }
}
|
||||
@@ -0,0 +1,62 @@
|
||||
using ScadaLink.AuditLog.Payload;
|
||||
|
||||
namespace ScadaLink.AuditLog.Central;
|
||||
|
||||
/// <summary>
|
||||
/// Audit Log (#23) M6 Bundle E read-side surface exposing the central-side
|
||||
/// audit-health counters: <see cref="CentralAuditWriteFailures"/> (every
|
||||
/// repository insert throw from <see cref="CentralAuditWriter"/> /
|
||||
/// <see cref="AuditLogIngestActor"/>), <see cref="AuditRedactionFailure"/>
|
||||
/// (every payload-filter redactor throw on the central path), and
|
||||
/// <see cref="SiteAuditTelemetryStalled"/> (per-site latched state from the
|
||||
/// <see cref="SiteAuditTelemetryStalledTracker"/>).
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// <para>
|
||||
/// <b>Read-only contract.</b> Implementations expose a point-in-time snapshot
|
||||
/// — increments and tracker updates happen through the dedicated counter /
|
||||
/// tracker interfaces, not through this surface. Consumers (M7+ central
|
||||
/// health pages) read these properties; they never mutate.
|
||||
/// </para>
|
||||
/// <para>
|
||||
/// <b>Why a parallel surface from <see cref="ICentralHealthAggregator"/>.</b>
|
||||
/// <see cref="ICentralHealthAggregator"/> aggregates per-site
|
||||
/// <c>SiteHealthState</c> reports the SITE emits. The central audit-write
|
||||
/// failure / redaction-failure counters originate ON central (no site report
|
||||
/// carries them), so they live on a dedicated snapshot rather than being
|
||||
/// retro-fitted into a per-site state. The two surfaces will be composed at
|
||||
/// the M7 dashboard layer.
|
||||
/// </para>
|
||||
/// </remarks>
|
||||
public interface IAuditCentralHealthSnapshot
{
    /// <summary>
    /// Number of central-side audit-write failures since process start.
    /// Bumped each time a repository insert issued by
    /// <see cref="CentralAuditWriter"/> or <see cref="AuditLogIngestActor"/>
    /// throws.
    /// </summary>
    int CentralAuditWriteFailures { get; }

    /// <summary>
    /// Number of central-side payload-filter over-redactions since process
    /// start. Bumped each time a header / body / SQL-parameter redactor stage
    /// throws; the filter then substitutes the
    /// <c>&lt;redacted: redactor error&gt;</c> marker and the user-facing
    /// action continues. Sites report their own figure through
    /// <c>SiteHealthReport.AuditRedactionFailure</c> (backed by
    /// <see cref="IAuditRedactionFailureCounter"/>); on a central node the
    /// composition root routes every central redactor throw
    /// (CentralAuditWriter and AuditLogIngestActor paths) into this counter.
    /// </summary>
    int AuditRedactionFailure { get; }

    /// <summary>
    /// Latched stalled flag per site: <c>true</c> once
    /// <see cref="SiteAuditReconciliationActor"/> has seen two consecutive
    /// non-draining cycles for the site, back to <c>false</c> after the first
    /// draining cycle. A site missing from the map is treated as healthy
    /// (not stalled). The returned map is a defensive copy; readers must not
    /// mutate it.
    /// </summary>
    IReadOnlyDictionary<string, bool> SiteAuditTelemetryStalled { get; }
}
|
||||
@@ -0,0 +1,23 @@
|
||||
namespace ScadaLink.AuditLog.Central;
|
||||
|
||||
/// <summary>
|
||||
/// Audit Log (#23) M6 Bundle E (T8) counter sink invoked by central-side audit
|
||||
/// writers (<see cref="CentralAuditWriter"/>, <see cref="AuditLogIngestActor"/>)
|
||||
/// every time a repository <c>InsertIfNotExistsAsync</c> throws. Mirrors the
|
||||
/// site-side <see cref="ScadaLink.AuditLog.Site.IAuditWriteFailureCounter"/>
|
||||
/// shape one-for-one — same one-method contract, same NoOp default, same
|
||||
/// must-never-abort-the-user-facing-action invariant.
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// Audit-write failures NEVER abort the user-facing action (alog.md §13) —
|
||||
/// the writer swallows the exception and surfaces the failure via this counter
|
||||
/// instead. A NoOp default is the correct safe fallback while the central
|
||||
/// health surface is being wired in; <see cref="AuditCentralHealthSnapshot"/>
|
||||
/// is the production binding that routes increments into the aggregated
|
||||
/// central health snapshot consumed by future M7+ pages.
|
||||
/// </remarks>
|
||||
public interface ICentralAuditWriteFailureCounter
{
    /// <summary>
    /// Records one central audit-write failure by adding one to the counter.
    /// </summary>
    void Increment();
}
|
||||
@@ -0,0 +1,45 @@
|
||||
using ScadaLink.Commons.Messages.Integration;
|
||||
|
||||
namespace ScadaLink.AuditLog.Central;
|
||||
|
||||
/// <summary>
|
||||
/// Mockable abstraction over the central-side <c>PullAuditEvents</c> gRPC
|
||||
/// client surface that <see cref="SiteAuditReconciliationActor"/> uses to
|
||||
/// fetch the next reconciliation batch from a specific site. Extracted so the
|
||||
/// actor can be unit-tested against an in-memory stub without standing up a
|
||||
/// real <c>GrpcChannel</c> per site.
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// <para>
|
||||
/// The production implementation (host wiring task) wraps the auto-generated
|
||||
/// <c>SiteStreamService.SiteStreamServiceClient</c>, multiplexing one
|
||||
/// <c>GrpcChannel</c> per site keyed on
|
||||
/// <see cref="SiteEntry.GrpcEndpoint"/>. Until that wiring lands the DI
|
||||
/// composition root binds a NoOp default that returns an empty response — the
|
||||
/// reconciliation tick is still scheduled and the cursor logic still runs, so
|
||||
/// regressions in the actor itself are caught even before the real client
|
||||
/// arrives.
|
||||
/// </para>
|
||||
/// <para>
|
||||
/// Implementations MUST NOT throw on transport faults that the actor can
|
||||
/// tolerate (connection refused, deadline exceeded). The actor's contract is
|
||||
/// "one site's failure doesn't sink the rest of the tick"; an exception still
|
||||
/// won't crash the actor (the per-site try/catch catches it), but returning
|
||||
/// an empty response on a known-recoverable error keeps the logs cleaner.
|
||||
/// </para>
|
||||
/// </remarks>
|
||||
public interface IPullAuditEventsClient
{
    /// <summary>
    /// Sends one <c>PullAuditEvents</c> RPC to the site registered under
    /// <paramref name="siteId"/> and returns its reply: the next batch of
    /// <see cref="ScadaLink.Commons.Entities.Audit.AuditEvent"/> rows, ordered
    /// oldest-first, together with a <c>MoreAvailable</c> flag.
    /// </summary>
    /// <param name="siteId">Identifier of the site to pull from.</param>
    /// <param name="sinceUtc">
    /// Lower bound for the pull. <see cref="SiteAuditReconciliationActor"/>
    /// passes its per-site cursor here, or <see cref="DateTime.MinValue"/>
    /// when it has none yet.
    /// </param>
    /// <param name="batchSize">Batch size requested for this pull.</param>
    /// <param name="ct">Cancellation token for the call.</param>
    /// <returns>The site's response for this pull.</returns>
    Task<PullAuditEventsResponse> PullAsync(
        string siteId,
        DateTime sinceUtc,
        int batchSize,
        CancellationToken ct);
}
|
||||
@@ -0,0 +1,34 @@
|
||||
namespace ScadaLink.AuditLog.Central;
|
||||
|
||||
/// <summary>
|
||||
/// Enumeration surface consumed by <see cref="SiteAuditReconciliationActor"/> to
|
||||
/// discover which sites to poll on each reconciliation tick. Extracted so the
|
||||
/// actor can be unit-tested against a static list without depending on the
|
||||
/// production <c>ISiteRepository</c> + EF Core DbContext.
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// The production implementation wraps <c>ISiteRepository.GetAllSitesAsync</c>
|
||||
/// and projects each <c>Site</c> to a <see cref="SiteEntry"/> using the
|
||||
/// site's configured <c>GrpcNodeAAddress</c> (falling back to
|
||||
/// <c>GrpcNodeBAddress</c> when NodeA is unset). Sites with NO gRPC address
|
||||
/// configured are silently skipped — the reconciliation pull cannot reach
|
||||
/// them, but absence of an address is a configuration decision, not a runtime
|
||||
/// error.
|
||||
/// </remarks>
|
||||
public interface ISiteEnumerator
{
    /// <summary>
    /// Lists the sites the reconciliation puller should visit on its next
    /// tick. The actor asks once per tick, so implementations should surface
    /// added and removed sites promptly.
    /// </summary>
    /// <param name="ct">Cancellation token for the enumeration.</param>
    /// <returns>The current reconciliation targets.</returns>
    Task<IReadOnlyList<SiteEntry>> EnumerateAsync(CancellationToken ct = default);
}
|
||||
|
||||
/// <summary>
|
||||
/// One reconciliation target: the site identifier the actor uses as the
|
||||
/// cursor key and the gRPC endpoint <see cref="IPullAuditEventsClient"/> dials
|
||||
/// to issue the pull. Endpoint is the bare authority (e.g. <c>http://siteA:8083</c>);
|
||||
/// transport selection (TLS, keepalive, etc.) is the client's concern.
|
||||
/// </summary>
|
||||
/// <param name="SiteId">Site identifier; the reconciliation actor keys its cursor on it.</param>
/// <param name="GrpcEndpoint">Bare authority of the site's gRPC endpoint, e.g. <c>http://siteA:8083</c>.</param>
public sealed record SiteEntry(
    string SiteId,
    string GrpcEndpoint);
|
||||
@@ -0,0 +1,17 @@
|
||||
namespace ScadaLink.AuditLog.Central;
|
||||
|
||||
/// <summary>
|
||||
/// Default <see cref="ICentralAuditWriteFailureCounter"/> binding used when
|
||||
/// the central health surface (<see cref="AuditCentralHealthSnapshot"/>) has
|
||||
/// not been wired (test composition roots, site-only hosts that incidentally
|
||||
/// resolve a <see cref="CentralAuditWriter"/>). Drops every increment on the
|
||||
/// floor. Mirrors <see cref="ScadaLink.AuditLog.Site.NoOpAuditWriteFailureCounter"/>.
|
||||
/// </summary>
|
||||
public sealed class NoOpCentralAuditWriteFailureCounter : ICentralAuditWriteFailureCounter
{
    /// <summary>
    /// Does nothing: the increment is discarded.
    /// </summary>
    public void Increment() { /* deliberately empty */ }
}
|
||||
@@ -0,0 +1,332 @@
|
||||
using Akka.Actor;
|
||||
using Microsoft.Extensions.DependencyInjection;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using Microsoft.Extensions.Options;
|
||||
using ScadaLink.Commons.Entities.Audit;
|
||||
using ScadaLink.Commons.Interfaces.Repositories;
|
||||
|
||||
namespace ScadaLink.AuditLog.Central;
|
||||
|
||||
/// <summary>
|
||||
/// Central singleton (M6 Bundle B) that drives the audit-log reconciliation
|
||||
/// pull loop. On a configurable timer (default 5 minutes) the actor walks every
|
||||
/// known site, asks the site for any <see cref="AuditEvent"/> rows with
|
||||
/// <see cref="AuditEvent.OccurredAtUtc"/> >= the site's last reconciled
|
||||
/// cursor, ingests them idempotently into the central
|
||||
/// <see cref="IAuditLogRepository"/>, and advances the cursor.
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// <para>
|
||||
/// <b>Self-healing telemetry, not a dispatcher.</b> The push path
|
||||
/// (<see cref="ScadaLink.AuditLog.Site.Telemetry.SiteAuditTelemetryActor"/> +
|
||||
/// <c>IngestAuditEvents</c>) is the primary mechanism. This actor exists so a
|
||||
/// missed push (gRPC blip, central restart, site offline) is eventually
|
||||
/// repaired by central re-pulling whatever the site still has in
|
||||
/// <c>Pending</c>/<c>Forwarded</c> state. Idempotency on
|
||||
/// <see cref="AuditEvent.EventId"/> (M2 Bundle A's race-fix) makes duplicate
|
||||
/// arrivals from both paths a silent no-op.
|
||||
/// </para>
|
||||
/// <para>
|
||||
/// <b>Cursor lifetime.</b> The per-site <c>LastReconciledAt</c> watermark is
|
||||
/// kept in-memory for the actor's lifetime. The cluster singleton normally
|
||||
/// survives the host process; on a deliberate failover OR a singleton restart
|
||||
/// the cursors reset to <see cref="DateTime.MinValue"/>. That is conservative
|
||||
/// but correct — the next tick simply asks for everything the site still has,
|
||||
/// and idempotent ingest swallows the dupes. Persisting cursors to MS SQL was
|
||||
/// considered and rejected for M6: the cost of a write per tick outweighs the
|
||||
/// rare benefit of avoiding one over-broad pull after a restart.
|
||||
/// </para>
|
||||
/// <para>
|
||||
/// <b>Stalled detection.</b> The brief calls a site "stalled" when two
|
||||
/// consecutive pull cycles BOTH return non-empty AND <c>MoreAvailable=true</c>
|
||||
/// — i.e. the backlog isn't draining. The actor publishes
|
||||
/// <see cref="SiteAuditTelemetryStalledChanged"/> on the actor system's
|
||||
/// EventStream so a future <c>ICentralHealthCollector</c> bridge (M6 Bundle E)
|
||||
/// can flip the health metric without coupling this actor to the health
|
||||
/// collection surface today.
|
||||
/// </para>
|
||||
/// <para>
|
||||
/// <b>Failure isolation.</b> A single site that throws (DNS, transport,
|
||||
/// repository write) must NOT prevent other sites from being polled on the
|
||||
/// same tick. The per-site work runs inside its own try/catch; the actor's
|
||||
/// supervisor strategy keeps it alive across any leaked exception with
|
||||
/// <see cref="Akka.Actor.SupervisorStrategy.DefaultDecider"/>'s Restart
|
||||
/// semantics — restart resets the in-memory cursors, but as noted above that's
|
||||
/// a safe (over-pull, idempotent) recovery.
|
||||
/// </para>
|
||||
/// <para>
|
||||
/// <b>DI scopes.</b> <see cref="IAuditLogRepository"/> is a scoped EF Core
|
||||
/// service registered by <c>AddConfigurationDatabase</c>. The singleton actor
|
||||
/// opens one DI scope per tick and reuses the same repository across all
|
||||
/// sites in that tick — one DbContext per tick mirrors the
|
||||
/// <c>AuditLogIngestActor</c> + <c>NotificationOutboxActor</c> pattern.
|
||||
/// </para>
|
||||
/// </remarks>
|
||||
public class SiteAuditReconciliationActor : ReceiveActor
{
    private readonly ISiteEnumerator _sites;
    private readonly IPullAuditEventsClient _client;
    private readonly IServiceProvider _services;
    private readonly SiteAuditReconciliationOptions _options;
    private readonly ILogger<SiteAuditReconciliationActor> _logger;

    /// <summary>
    /// Per-site reconciliation watermark — the highest
    /// <see cref="AuditEvent.OccurredAtUtc"/> seen for that site on a previous
    /// tick. Asking for <c>OccurredAtUtc &gt;= cursor</c> rather than
    /// <c>&gt;</c> is the site contract
    /// (<see cref="ScadaLink.Commons.Interfaces.Services.ISiteAuditQueue.ReadPendingSinceAsync"/>);
    /// duplicate-with-same-timestamp rows are filtered out by the idempotent
    /// repository write.
    /// </summary>
    private readonly Dictionary<string, DateTime> _cursors = new();

    /// <summary>
    /// Per-site count of consecutive non-draining cycles. Resets to zero on the
    /// first draining (or empty) cycle.
    /// </summary>
    private readonly Dictionary<string, int> _nonDrainingCycles = new();

    /// <summary>
    /// Per-site latched stalled state — used so the actor only publishes a
    /// <see cref="SiteAuditTelemetryStalledChanged"/> transition when the
    /// stalled flag actually changes, not on every tick while stalled.
    /// </summary>
    private readonly Dictionary<string, bool> _stalled = new();

    // Handle for the repeating self-tick; created in PreStart, cancelled in PostStop.
    private ICancelable? _timer;

    /// <summary>
    /// Wires the actor's collaborators and registers the tick handler.
    /// </summary>
    /// <param name="sites">Source of the sites to visit on each tick.</param>
    /// <param name="client">Client used to issue the per-site pull.</param>
    /// <param name="services">Root provider; one DI scope is opened per tick.</param>
    /// <param name="options">Tuning knobs (interval, batch size, stalled threshold).</param>
    /// <param name="logger">Logger for tick-, site- and row-level failures.</param>
    /// <exception cref="ArgumentNullException">Any argument is null.</exception>
    public SiteAuditReconciliationActor(
        ISiteEnumerator sites,
        IPullAuditEventsClient client,
        IServiceProvider services,
        IOptions<SiteAuditReconciliationOptions> options,
        ILogger<SiteAuditReconciliationActor> logger)
    {
        ArgumentNullException.ThrowIfNull(sites);
        ArgumentNullException.ThrowIfNull(client);
        ArgumentNullException.ThrowIfNull(services);
        ArgumentNullException.ThrowIfNull(options);
        ArgumentNullException.ThrowIfNull(logger);

        _sites = sites;
        _client = client;
        _services = services;
        _options = options.Value;
        _logger = logger;

        ReceiveAsync<ReconciliationTick>(_ => OnTickAsync());
    }

    /// <summary>
    /// Schedules the repeating <see cref="ReconciliationTick"/> to self, using
    /// the configured interval as both the initial delay and the period.
    /// </summary>
    protected override void PreStart()
    {
        base.PreStart();
        var interval = _options.ReconciliationInterval;
        _timer = Context.System.Scheduler.ScheduleTellRepeatedlyCancelable(
            initialDelay: interval,
            interval: interval,
            receiver: Self,
            message: ReconciliationTick.Instance,
            sender: Self);
    }

    /// <summary>Cancels the repeating tick, if one was scheduled.</summary>
    protected override void PostStop()
    {
        _timer?.Cancel();
        base.PostStop();
    }

    /// <summary>
    /// Handles one tick: enumerates the sites, opens a single DI scope to
    /// resolve <see cref="IAuditLogRepository"/>, and pulls each site in turn.
    /// Enumeration and repository-resolution failures are logged at error
    /// level and end the tick; a failure while pulling one site is logged at
    /// warning level and the loop moves on to the next site.
    /// </summary>
    private async Task OnTickAsync()
    {
        // Capture EventStream BEFORE the first await. Accessing Context (and
        // therefore Context.System) after an await is unsafe because Akka's
        // ActorBase.Context throws "no active ActorContext" once the
        // continuation runs on a thread that isn't currently dispatching this
        // actor.
        var eventStream = Context.System.EventStream;

        IReadOnlyList<SiteEntry> sites;
        try
        {
            sites = await _sites.EnumerateAsync().ConfigureAwait(false);
        }
        catch (Exception ex)
        {
            _logger.LogError(ex, "Site enumeration failed; skipping reconciliation tick.");
            return;
        }

        if (sites.Count == 0)
        {
            // Nothing to poll; avoid opening a scope for an empty tick.
            return;
        }

        IServiceScope? scope = null;
        IAuditLogRepository repository;
        try
        {
            scope = _services.CreateScope();
            repository = scope.ServiceProvider.GetRequiredService<IAuditLogRepository>();
        }
        catch (Exception ex)
        {
            _logger.LogError(ex, "Failed to resolve IAuditLogRepository for reconciliation tick.");
            // The scope may have been created before resolution failed.
            scope?.Dispose();
            return;
        }

        try
        {
            foreach (var site in sites)
            {
                try
                {
                    await PullSiteAsync(site, repository, eventStream).ConfigureAwait(false);
                }
                catch (Exception ex)
                {
                    // Catch-all per the failure-isolation invariant: one site's
                    // fault must not sink the rest of the tick. The cursor for
                    // the failing site is left at its previous value so the
                    // next tick retries the same window.
                    _logger.LogWarning(
                        ex,
                        "Reconciliation pull failed for site {SiteId}; other sites continue.",
                        site.SiteId);
                }
            }
        }
        finally
        {
            scope.Dispose();
        }
    }

    /// <summary>
    /// Issues one <c>PullAuditEvents</c> RPC against the site, writes each
    /// returned row through <c>InsertIfNotExistsAsync</c>, advances the site's
    /// cursor to the maximum <see cref="AuditEvent.OccurredAtUtc"/> observed,
    /// and updates the stalled state. Exactly one pull is issued per call, so
    /// a backed-up site drains across consecutive ticks.
    /// </summary>
    /// <param name="site">Site to pull from.</param>
    /// <param name="repository">Repository resolved for the current tick.</param>
    /// <param name="eventStream">Event stream captured before the tick's first await.</param>
    private async Task PullSiteAsync(SiteEntry site, IAuditLogRepository repository, Akka.Event.EventStream eventStream)
    {
        // No cursor yet means "everything the site still has".
        var since = _cursors.TryGetValue(site.SiteId, out var c) ? c : DateTime.MinValue;
        var response = await _client.PullAsync(
                site.SiteId, since, _options.BatchSize, CancellationToken.None)
            .ConfigureAwait(false);

        var maxOccurred = since;
        // One ingest timestamp is shared by every row in this batch.
        var nowUtc = DateTime.UtcNow;
        foreach (var evt in response.Events)
        {
            try
            {
                // Idempotent repository write: duplicate EventIds (from a
                // concurrent push, or a retry of this very pull) collapse to
                // a no-op inside InsertIfNotExistsAsync.
                var ingested = evt with { IngestedAtUtc = nowUtc };
                await repository.InsertIfNotExistsAsync(ingested).ConfigureAwait(false);
            }
            catch (Exception ex)
            {
                // Per-row catch so one bad event does not abandon the rest of
                // the batch. The cursor still advances past the failed row
                // (see below); a row that permanently fails to persist is an
                // operational concern surfaced by the log, not a hot-loop
                // trigger.
                _logger.LogError(
                    ex,
                    "Reconciliation ingest failed for AuditEvent {EventId} from site {SiteId}.",
                    evt.EventId,
                    site.SiteId);
            }

            // Runs for failed rows too: the watermark tracks what the site
            // returned, not what was persisted.
            if (evt.OccurredAtUtc > maxOccurred)
            {
                maxOccurred = evt.OccurredAtUtc;
            }
        }

        _cursors[site.SiteId] = maxOccurred;

        var nonDraining = response.MoreAvailable && response.Events.Count > 0;
        UpdateStalledState(site.SiteId, draining: !nonDraining, eventStream);
    }

    /// <summary>
    /// Flips the per-site stalled flag based on whether this tick drained the
    /// queue. A "draining" cycle is one where the server reported no more rows
    /// available OR returned zero events. A "non-draining" cycle is the
    /// inverse (events returned AND <c>MoreAvailable=true</c>).
    /// </summary>
    /// <remarks>
    /// The counter increments on each consecutive non-draining tick. On
    /// reaching <see cref="SiteAuditReconciliationOptions.StalledAfterNonDrainingCycles"/>
    /// the actor latches <c>Stalled=true</c> and publishes the transition; on
    /// any draining tick the counter resets to zero and, if the latch is
    /// currently set, the actor publishes <c>Stalled=false</c>. Only
    /// transitions are published — repeated ticks in the same state are
    /// silent.
    /// </remarks>
    /// <param name="siteId">Site whose state is being updated.</param>
    /// <param name="draining">Whether this cycle counted as draining.</param>
    /// <param name="eventStream">Stream the transition is published on.</param>
    private void UpdateStalledState(string siteId, bool draining, Akka.Event.EventStream eventStream)
    {
        var wasStalled = _stalled.TryGetValue(siteId, out var prior) && prior;

        if (draining)
        {
            _nonDrainingCycles[siteId] = 0;
            if (wasStalled)
            {
                _stalled[siteId] = false;
                eventStream.Publish(
                    new SiteAuditTelemetryStalledChanged(siteId, Stalled: false));
            }
            return;
        }

        var consecutive = _nonDrainingCycles.GetValueOrDefault(siteId) + 1;
        _nonDrainingCycles[siteId] = consecutive;

        if (consecutive >= _options.StalledAfterNonDrainingCycles && !wasStalled)
        {
            _stalled[siteId] = true;
            eventStream.Publish(
                new SiteAuditTelemetryStalledChanged(siteId, Stalled: true));
        }
    }

    /// <summary>
    /// Supervision strategy this actor applies to its child actors:
    /// one-for-one, decided by Akka's
    /// <see cref="Akka.Actor.SupervisorStrategy.DefaultDecider"/>.
    /// </summary>
    /// <remarks>
    /// <para>
    /// This does NOT resume the actor after an unhandled exception in its own
    /// receive. No Resume directive is configured here, and in Akka the
    /// strategy returned by this override governs failures of the actor's
    /// children; a failure of this actor itself is decided by its parent's
    /// strategy. The in-memory cursors are therefore not protected by this
    /// method.
    /// </para>
    /// <para>
    /// NOTE(review): no child actors are created in this class, so the
    /// override appears to have no effect today. <c>maxNrOfRetries: 0</c> also
    /// looks like it leaves no restart budget for a failing child — confirm
    /// against the Akka.NET supervision documentation before relying on it.
    /// </para>
    /// </remarks>
    protected override SupervisorStrategy SupervisorStrategy()
    {
        return new OneForOneStrategy(
            maxNrOfRetries: 0,
            withinTimeRange: TimeSpan.Zero,
            decider: Akka.Actor.SupervisorStrategy.DefaultDecider);
    }

    /// <summary>Self-tick triggering a reconciliation pass across all sites.</summary>
    internal sealed class ReconciliationTick
    {
        public static readonly ReconciliationTick Instance = new();
        private ReconciliationTick() { }
    }
}
|
||||
|
||||
/// <summary>
|
||||
/// Published on the actor system EventStream when a site's reconciliation
|
||||
/// puller transitions into or out of the "stalled" state (backlog not
|
||||
/// draining across multiple cycles). The M6 Bundle E central health collector
|
||||
/// will subscribe to this and surface
|
||||
/// <c>SiteAuditTelemetryStalled</c> on the health-report payload.
|
||||
/// </summary>
|
||||
/// <param name="SiteId">Site whose stalled state changed.</param>
/// <param name="Stalled"><c>true</c> when the site entered the stalled state, <c>false</c> when it left it.</param>
public sealed record SiteAuditTelemetryStalledChanged(
    string SiteId,
    bool Stalled);
|
||||
@@ -0,0 +1,60 @@
|
||||
namespace ScadaLink.AuditLog.Central;
|
||||
|
||||
/// <summary>
/// Settings for the central <see cref="SiteAuditReconciliationActor"/>
/// singleton. The defaults follow the M6 Bundle B brief: one pull per site
/// every 5 minutes, 256 rows per batch, and a site is flagged "stalled"
/// after two consecutive pull cycles that return rows AND report
/// <c>MoreAvailable=true</c> (the backlog is not draining).
/// </summary>
/// <remarks>
/// <para>
/// Reconciliation is the fallback path for lost push telemetry and is
/// deliberately low-frequency. Reducing
/// <see cref="ReconciliationIntervalSeconds"/> in production buys fresher
/// self-healing at the cost of additional MS SQL load — keep the default
/// unless the deployment can show the load is acceptable.
/// </para>
/// <para>
/// <see cref="StalledAfterNonDrainingCycles"/> defaults to 2 because a
/// single non-draining cycle is normal during a surge (for example a
/// backed-up site replaying its hot queue); the stalled signal is meant to
/// fire only when the backlog persists across cycles.
/// </para>
/// </remarks>
public sealed class SiteAuditReconciliationOptions
{
    /// <summary>
    /// Tick period in whole seconds. Every tick visits each known site once.
    /// </summary>
    public int ReconciliationIntervalSeconds { get; set; } = 300;

    /// <summary>
    /// Test-only escape hatch for sub-second tick cadences. When it has a
    /// value it wins over <see cref="ReconciliationIntervalSeconds"/>. It is
    /// not bound from configuration; production configuration only exposes
    /// <see cref="ReconciliationIntervalSeconds"/>.
    /// </summary>
    public TimeSpan? ReconciliationIntervalOverride { get; set; }

    /// <summary>
    /// Effective tick interval: the test override when present, otherwise
    /// <see cref="ReconciliationIntervalSeconds"/> converted to a
    /// <see cref="TimeSpan"/>.
    /// </summary>
    public TimeSpan ReconciliationInterval
    {
        get
        {
            if (ReconciliationIntervalOverride is TimeSpan forced)
            {
                return forced;
            }

            return TimeSpan.FromSeconds(ReconciliationIntervalSeconds);
        }
    }

    /// <summary>
    /// Upper bound on the number of
    /// <see cref="ScadaLink.Commons.Entities.Audit.AuditEvent"/> rows asked
    /// for in one <c>PullAuditEvents</c> RPC call.
    /// </summary>
    public int BatchSize { get; set; } = 256;

    /// <summary>
    /// How many consecutive non-draining cycles (rows returned AND
    /// <c>MoreAvailable=true</c>) a site must accumulate before the actor
    /// publishes <c>SiteAuditTelemetryStalledChanged(Stalled: true)</c> on
    /// the EventStream.
    /// </summary>
    public int StalledAfterNonDrainingCycles { get; set; } = 2;
}
|
||||
@@ -0,0 +1,188 @@
|
||||
using System.Collections.Concurrent;
|
||||
using Akka.Actor;
|
||||
using Akka.Event;
|
||||
|
||||
namespace ScadaLink.AuditLog.Central;
|
||||
|
||||
/// <summary>
/// Audit Log (#23) M6 Bundle E (T7) — central singleton that subscribes to the
/// actor system's EventStream for <see cref="SiteAuditTelemetryStalledChanged"/>
/// publications and maintains a per-site latched stalled-state map readable
/// via <see cref="Snapshot"/>. Consumed by the M6 Bundle E
/// <see cref="AuditCentralHealthSnapshot"/> aggregator so the central health
/// surface can report per-site "reconciliation isn't draining" without
/// coupling the publisher (<see cref="SiteAuditReconciliationActor"/>) to the
/// health collection plumbing.
/// </summary>
/// <remarks>
/// <para>
/// <b>Why an internal actor.</b> Akka.NET's <see cref="EventStream"/> only
/// supports <see cref="IActorRef"/> subscribers — there is no callback or
/// channel-based overload. The tracker therefore spawns a small subscriber
/// actor that forwards each event into the shared
/// <see cref="ConcurrentDictionary{TKey,TValue}"/> on the actor's thread, and
/// readers (<see cref="Snapshot"/>) take a copy off that dictionary on any
/// thread. Both the subscription and the unsubscription are performed
/// synchronously by the tracker itself (constructor and
/// <see cref="Dispose"/>) so neither races the actor's asynchronous
/// lifecycle hooks; the actor's <see cref="ActorBase.PostStop"/> additionally
/// unsubscribes as a safety net.
/// </para>
/// <para>
/// <b>Per-site latching.</b> The publisher (<see cref="SiteAuditReconciliationActor"/>)
/// only publishes on stalled-state transitions, so the dictionary is the
/// authoritative latched state. Sites that have never published are absent
/// from the snapshot — the consumer surface treats absence as
/// <c>Stalled=false</c> (default healthy), the same default the reconciliation
/// actor's own internal latch uses.
/// </para>
/// <para>
/// <b>Singleton lifecycle.</b> Registered as a singleton via
/// <see cref="ServiceCollectionExtensions.AddAuditLogCentralMaintenance"/>;
/// <see cref="Dispose"/> tears the internal subscriber down at host shutdown.
/// </para>
/// </remarks>
public sealed class SiteAuditTelemetryStalledTracker : IDisposable
{
    private readonly EventStream _eventStream;
    private readonly ConcurrentDictionary<string, bool> _state = new();
    private readonly IActorRef? _subscriber;
    private readonly AuditCentralHealthSnapshot? _snapshot;
    private bool _disposed;

    /// <summary>
    /// Construct around a bare <see cref="EventStream"/> without a snapshot
    /// sink. In this mode the tracker exposes the <see cref="Snapshot"/>
    /// surface but does not self-subscribe; production callers always go
    /// through <see cref="SiteAuditTelemetryStalledTracker(ActorSystem)"/>.
    /// </summary>
    /// <remarks>
    /// Subscribing to <see cref="EventStream"/> requires an <see cref="IActorRef"/>,
    /// which can only be created from an <see cref="ActorSystem"/>. The bare-
    /// stream ctor therefore can NOT itself wire the subscriber — tests that
    /// want event-driven updates must use the ActorSystem ctor (or push state
    /// directly via <see cref="Apply"/>). The tests in
    /// <c>SiteAuditTelemetryStalledTrackerTests</c> use the ActorSystem ctor
    /// via Akka.TestKit so they exercise the production subscribe path.
    /// </remarks>
    /// <param name="eventStream">Event stream to hold; must not be null.</param>
    /// <exception cref="ArgumentNullException"><paramref name="eventStream"/> is null.</exception>
    public SiteAuditTelemetryStalledTracker(EventStream eventStream)
        : this(eventStream, snapshot: null)
    {
    }

    /// <summary>
    /// Bare-stream ctor with an optional snapshot sink. The tracker still
    /// does not subscribe (no actor system), but callers that drive it via
    /// <see cref="Apply"/> get every update mirrored onto
    /// <paramref name="snapshot"/>.
    /// </summary>
    /// <param name="eventStream">Event stream to hold; must not be null.</param>
    /// <param name="snapshot">Optional health snapshot that receives every applied event.</param>
    /// <exception cref="ArgumentNullException"><paramref name="eventStream"/> is null.</exception>
    public SiteAuditTelemetryStalledTracker(EventStream eventStream, AuditCentralHealthSnapshot? snapshot)
    {
        _eventStream = eventStream ?? throw new ArgumentNullException(nameof(eventStream));
        // No subscriber actor in bare-stream mode — an IActorRef cannot be
        // created without an ActorSystem.
        _subscriber = null;
        _snapshot = snapshot;
    }

    /// <summary>
    /// Production ctor: subscribes a small internal actor to the supplied
    /// system's EventStream so every published
    /// <see cref="SiteAuditTelemetryStalledChanged"/> updates the latched
    /// per-site map. <see cref="Dispose"/> tears the subscriber down.
    /// </summary>
    /// <param name="actorSystem">Actor system that hosts the subscriber; must not be null.</param>
    /// <exception cref="ArgumentNullException"><paramref name="actorSystem"/> is null.</exception>
    public SiteAuditTelemetryStalledTracker(ActorSystem actorSystem)
        : this(actorSystem, snapshot: null)
    {
    }

    /// <summary>
    /// Production ctor with a snapshot sink — every observed
    /// <see cref="SiteAuditTelemetryStalledChanged"/> is mirrored onto the
    /// shared <see cref="AuditCentralHealthSnapshot"/> so the central health
    /// surface sees per-site stalled state without re-reading the tracker.
    /// </summary>
    /// <param name="actorSystem">Actor system that hosts the subscriber; must not be null.</param>
    /// <param name="snapshot">Optional health snapshot that receives every applied event.</param>
    /// <exception cref="ArgumentNullException"><paramref name="actorSystem"/> is null.</exception>
    public SiteAuditTelemetryStalledTracker(ActorSystem actorSystem, AuditCentralHealthSnapshot? snapshot)
    {
        ArgumentNullException.ThrowIfNull(actorSystem);

        _eventStream = actorSystem.EventStream;
        _snapshot = snapshot;

        // Uniquely named subscriber actor; its receive calls back into THIS
        // tracker's Apply method, so every event-driven dictionary write
        // happens on the actor's single-threaded receive.
        _subscriber = actorSystem.ActorOf(
            Props.Create(() => new StalledChangedSubscriber(this)),
            name: $"site-audit-stalled-tracker-{Guid.NewGuid():N}");

        // Subscribe synchronously from the ctor so the subscription is in
        // place before the tracker is returned to the caller — the actor's
        // own PreStart runs asynchronously and would otherwise race the
        // first publish. EventStream.Subscribe is thread-safe.
        _eventStream.Subscribe(_subscriber, typeof(SiteAuditTelemetryStalledChanged));
    }

    /// <summary>
    /// Returns a defensive copy of the per-site latched stalled state.
    /// Absent sites are interpreted as <c>Stalled=false</c> by consumers.
    /// </summary>
    /// <returns>A new dictionary keyed by site id; later updates do not affect it.</returns>
    public IReadOnlyDictionary<string, bool> Snapshot() =>
        new Dictionary<string, bool>(_state);

    /// <summary>
    /// Records one <see cref="SiteAuditTelemetryStalledChanged"/> in the
    /// latched map and, when a snapshot sink is wired, forwards it there.
    /// Called by the internal subscriber actor for every publication;
    /// <c>internal</c> so tests against the bare-stream ctor can drive the
    /// tracker directly.
    /// </summary>
    /// <param name="evt">The event to apply; a null event is ignored.</param>
    internal void Apply(SiteAuditTelemetryStalledChanged evt)
    {
        if (evt is null) return;

        _state[evt.SiteId] = evt.Stalled;

        // Mirror into the central health snapshot if one is wired. The sink
        // is optional (test composition roots may skip it), hence the
        // null-conditional call.
        _snapshot?.ApplyStalled(evt);
    }

    /// <summary>
    /// Stops event delivery and shuts the internal subscriber actor down.
    /// Safe to call more than once; a tracker built with a bare-stream ctor
    /// has no subscriber and nothing to tear down.
    /// </summary>
    public void Dispose()
    {
        if (_disposed) return;
        _disposed = true;

        if (_subscriber is not null)
        {
            // Unsubscribe synchronously, mirroring the synchronous Subscribe
            // in the ctor: the PoisonPill below is processed asynchronously,
            // so relying on PostStop alone would let events published after
            // Dispose returns still reach the tracker. Events already queued
            // in the actor's mailbox ahead of the pill are still applied.
            _eventStream.Unsubscribe(_subscriber, typeof(SiteAuditTelemetryStalledChanged));

            // PoisonPill is fire-and-forget; the actor stops once it reaches
            // the pill in its mailbox, and its PostStop repeats the
            // unsubscribe as a harmless safety net.
            _subscriber.Tell(PoisonPill.Instance);
        }
    }

    /// <summary>
    /// Internal subscriber actor — receives every
    /// <see cref="SiteAuditTelemetryStalledChanged"/> off the EventStream and
    /// forwards it into the parent <see cref="SiteAuditTelemetryStalledTracker"/>.
    /// The subscription itself is registered by the tracker constructor
    /// rather than by this actor, so publishes that arrive between actor
    /// creation and PreStart cannot be missed. <see cref="PostStop"/>
    /// unsubscribes in case the actor is stopped by something other than the
    /// tracker's <c>Dispose</c>.
    /// </summary>
    private sealed class StalledChangedSubscriber : ReceiveActor
    {
        private readonly SiteAuditTelemetryStalledTracker _parent;

        public StalledChangedSubscriber(SiteAuditTelemetryStalledTracker parent)
        {
            _parent = parent;
            Receive<SiteAuditTelemetryStalledChanged>(evt => _parent.Apply(evt));
        }

        protected override void PostStop()
        {
            Context.System.EventStream.Unsubscribe(Self, typeof(SiteAuditTelemetryStalledChanged));
            base.PostStop();
        }
    }
}
|
||||
@@ -0,0 +1,36 @@
|
||||
namespace ScadaLink.AuditLog.Configuration;
|
||||
|
||||
/// <summary>
/// Settings for Audit Log (#23), bound from the <c>AuditLog</c> section of
/// <c>appsettings.json</c>. The defaults follow the design (alog.md §6, §10):
/// 8 KiB payload-summary cap, 64 KiB cap for error rows, and a 365-day
/// central retention window with monthly partition-switch purge. The default
/// header-redact list targets HTTP auth headers; per-target overrides add to
/// (never replace) the global redactor set.
/// </summary>
public sealed class AuditLogOptions
{
    /// <summary>Payload-summary cap in bytes for non-error rows (default 8 KiB).</summary>
    public int DefaultCapBytes { get; set; } = 8 * 1024;

    /// <summary>Payload-summary cap in bytes for error rows (default 64 KiB).</summary>
    public int ErrorCapBytes { get; set; } = 64 * 1024;

    /// <summary>Names of HTTP headers whose values are redacted before persistence.</summary>
    public List<string> HeaderRedactList { get; set; } = new List<string>
    {
        "Authorization",
        "X-Api-Key",
        "Cookie",
        "Set-Cookie",
    };

    /// <summary>Regex patterns applied to body content for every target.</summary>
    public List<string> GlobalBodyRedactors { get; set; } = new List<string>();

    /// <summary>Redaction overrides for individual targets, keyed by target identifier.</summary>
    public Dictionary<string, PerTargetRedactionOverride> PerTargetOverrides { get; set; } =
        new Dictionary<string, PerTargetRedactionOverride>();

    /// <summary>Central retention window in days (default 365, range [30, 3650]).</summary>
    public int RetentionDays { get; set; } = 365;
}
|
||||
@@ -0,0 +1,57 @@
|
||||
using Microsoft.Extensions.Options;
|
||||
|
||||
namespace ScadaLink.AuditLog.Configuration;
|
||||
|
||||
/// <summary>
/// Startup validator for <see cref="AuditLogOptions"/>. The caps control
/// payload truncation in the M2+ writers, so an unset/zero cap would allow
/// arbitrarily large blobs into the central <c>AuditLog</c> table.
/// <see cref="AuditLogOptions.ErrorCapBytes"/> may not be smaller than
/// <see cref="AuditLogOptions.DefaultCapBytes"/> because the error cap exists
/// to keep <em>more</em> detail than the happy-path summary.
/// <see cref="AuditLogOptions.RetentionDays"/> is limited to
/// <c>[30, 3650]</c>: a shorter window would drop in-flight investigations,
/// a longer one would defeat the purpose of the partition-switch purge.
/// </summary>
public sealed class AuditLogOptionsValidator : IValidateOptions<AuditLogOptions>
{
    /// <summary>Smallest accepted <see cref="AuditLogOptions.RetentionDays"/> (inclusive).</summary>
    public const int MinRetentionDays = 30;

    /// <summary>Largest accepted <see cref="AuditLogOptions.RetentionDays"/> (inclusive).</summary>
    public const int MaxRetentionDays = 3650;

    /// <summary>
    /// Checks all rules and reports every violation at once rather than
    /// stopping at the first.
    /// </summary>
    /// <param name="name">Options name; not used by the checks.</param>
    /// <param name="options">Options instance to validate; must not be null.</param>
    /// <returns>
    /// <see cref="ValidateOptionsResult.Success"/> when every rule holds,
    /// otherwise a failure carrying one message per violated rule.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="options"/> is null.</exception>
    public ValidateOptionsResult Validate(string? name, AuditLogOptions options)
    {
        ArgumentNullException.ThrowIfNull(options);

        var defaultCap = options.DefaultCapBytes;
        var errorCap = options.ErrorCapBytes;
        var retention = options.RetentionDays;

        var problems = new List<string>();

        if (defaultCap <= 0)
        {
            problems.Add(
                $"AuditLog:{nameof(AuditLogOptions.DefaultCapBytes)} ({defaultCap}) " +
                "must be > 0; it drives payload-summary truncation in audit writers.");
        }

        if (errorCap < defaultCap)
        {
            problems.Add(
                $"AuditLog:{nameof(AuditLogOptions.ErrorCapBytes)} ({errorCap}) " +
                $"must be >= {nameof(AuditLogOptions.DefaultCapBytes)} ({defaultCap}); " +
                "the error-row cap is intended to capture more detail than the happy-path summary.");
        }

        if (retention is < MinRetentionDays or > MaxRetentionDays)
        {
            problems.Add(
                $"AuditLog:{nameof(AuditLogOptions.RetentionDays)} ({retention}) " +
                $"must be in [{MinRetentionDays}, {MaxRetentionDays}] days.");
        }

        if (problems.Count > 0)
        {
            return ValidateOptionsResult.Fail(problems);
        }

        return ValidateOptionsResult.Success;
    }
}
|
||||
@@ -0,0 +1,28 @@
|
||||
namespace ScadaLink.AuditLog.Configuration;
|
||||
|
||||
/// <summary>
/// Redaction settings for a single target, layered additively over
/// <see cref="AuditLogOptions.GlobalBodyRedactors"/> and the
/// <see cref="AuditLogOptions.DefaultCapBytes"/> / <see cref="AuditLogOptions.ErrorCapBytes"/>
/// caps. A target is identified by its script-facing external-system /
/// database / notification-list / inbound-API-key name.
/// </summary>
public sealed class PerTargetRedactionOverride
{
    /// <summary>Payload cap in bytes for this target; <c>null</c> inherits the global cap.</summary>
    public int? CapBytes { get; set; }

    /// <summary>Extra body redactor regex patterns, appended to the global list.</summary>
    public List<string>? AdditionalBodyRedactors { get; set; }

    /// <summary>
    /// Opt-in SQL parameter redaction. A case-insensitive regex matched
    /// against each SQL parameter NAME in the M4 <c>AuditingDbCommand</c>
    /// RequestSummary JSON
    /// (<c>{"sql":"...","parameters":{"@name":"value", ...}}</c>); the value of
    /// every parameter whose name matches is replaced with
    /// <c>&lt;redacted&gt;</c>. <c>null</c> (the default) leaves parameter
    /// values captured verbatim. Applies only to
    /// <see cref="ScadaLink.Commons.Types.Enums.AuditChannel.DbOutbound"/>
    /// rows.
    /// </summary>
    public string? RedactSqlParamsMatching { get; set; }
}
|
||||
@@ -0,0 +1,573 @@
|
||||
using System.Collections.Concurrent;
|
||||
using System.Text;
|
||||
using System.Text.Encodings.Web;
|
||||
using System.Text.Json;
|
||||
using System.Text.Json.Nodes;
|
||||
using System.Text.RegularExpressions;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using Microsoft.Extensions.Options;
|
||||
using ScadaLink.AuditLog.Configuration;
|
||||
using ScadaLink.Commons.Entities.Audit;
|
||||
using ScadaLink.Commons.Types.Enums;
|
||||
|
||||
namespace ScadaLink.AuditLog.Payload;
|
||||
|
||||
/// <summary>
|
||||
/// Default <see cref="IAuditPayloadFilter"/>. Bundle A established the
|
||||
/// truncation backbone; Bundle B chains HTTP header redaction (M5-T3) BEFORE
|
||||
/// truncation so redactors operate on the full payload and the cap then trims
|
||||
/// the redacted result.
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// <para>
|
||||
/// Uses <see cref="IOptionsMonitor{TOptions}"/> (not <see cref="IOptions{TOptions}"/>)
|
||||
/// so the M5-T8 hot-reload path sees fresh values without re-resolving the
|
||||
/// singleton. <see cref="Apply"/> reads <see cref="IOptionsMonitor{T}.CurrentValue"/>
|
||||
/// on every call, and the regex cache is keyed by pattern string — patterns
|
||||
/// added via a live config change compile on first use of the next event;
|
||||
/// patterns removed simply stop being looked up. No <c>OnChange</c> subscription
|
||||
/// or explicit cache invalidation is required (the
|
||||
/// <c>AuditLogOptionsBindingTests</c> fixture in <c>ScadaLink.AuditLog.Tests</c>
|
||||
/// pins this behaviour).
|
||||
/// </para>
|
||||
/// <para>
|
||||
/// "Error row" = <see cref="AuditEvent.Status"/> NOT IN (<c>Delivered</c>,
|
||||
/// <c>Submitted</c>, <c>Forwarded</c>) — every other status, including the
|
||||
/// non-terminal <c>Attempted</c>, the parked/discarded terminals, and the
|
||||
/// short-circuit <c>Skipped</c>, receives the larger error cap so a verbose
|
||||
/// error body survives.
|
||||
/// </para>
|
||||
/// <para>
|
||||
/// Apply MUST NOT throw — on internal failure the filter over-redacts by
|
||||
/// returning the input with <see cref="AuditEvent.PayloadTruncated"/> set and
|
||||
/// increments the <c>AuditRedactionFailure</c> health metric via the injected
|
||||
/// <see cref="IAuditRedactionFailureCounter"/>. Each redactor stage runs in
|
||||
/// its own try/catch — a failure in (say) the header redactor still lets the
|
||||
/// SQL parameter redactor and the truncator run on the remaining fields.
|
||||
/// </para>
|
||||
/// <para>
|
||||
/// Stage order (each runs on every applicable field):
|
||||
/// header redaction → body regex redaction → truncation. The SQL-parameter
|
||||
/// stage piggybacks on the body-redactor path; both run BEFORE truncation so
|
||||
/// the cap trims the redacted result, never bytes the redactor intended to
|
||||
/// hide.
|
||||
/// </para>
|
||||
/// </remarks>
|
||||
public sealed class DefaultAuditPayloadFilter : IAuditPayloadFilter
|
||||
{
|
||||
private const string RedactedMarker = "<redacted>";
|
||||
private const string RedactorErrorMarker = "<redacted: redactor error>";
|
||||
|
||||
/// <summary>
|
||||
/// Per-match regex timeout. Catastrophic-backtracking patterns trip a
|
||||
/// <see cref="RegexMatchTimeoutException"/> when a single match takes
|
||||
/// longer than this; the offending field is then over-redacted with
|
||||
/// <see cref="RedactorErrorMarker"/> and the failure counter is bumped.
|
||||
/// 50 ms is generous for normal patterns yet short enough that the
|
||||
/// audit hot-path isn't held up by a misconfigured regex.
|
||||
/// </summary>
|
||||
private static readonly TimeSpan RegexMatchTimeout = TimeSpan.FromMilliseconds(50);
|
||||
|
||||
/// <summary>
|
||||
/// JSON serializer options used to re-emit redacted summaries. The
|
||||
/// UnsafeRelaxedJsonEscaping encoder is required so the redaction marker
|
||||
/// (which contains <c>&lt;</c> / <c>&gt;</c>) survives unescaped — the
|
||||
/// header-redaction tests grep for the literal marker, and the downstream
|
||||
/// UI / log readers would rather see <c>&lt;redacted&gt;</c> than
|
||||
/// <c>\u003Credacted\u003E</c>. The summaries are persisted to the audit
|
||||
/// table and rendered in trusted-internal contexts only, so the relaxed
|
||||
/// HTML-escaping rules do not introduce an XSS surface.
|
||||
/// </summary>
|
||||
private static readonly JsonSerializerOptions RedactedSummaryJsonOptions = new()
|
||||
{
|
||||
Encoder = JavaScriptEncoder.UnsafeRelaxedJsonEscaping,
|
||||
};
|
||||
|
||||
private readonly IOptionsMonitor<AuditLogOptions> _options;
|
||||
private readonly ILogger<DefaultAuditPayloadFilter> _logger;
|
||||
private readonly IAuditRedactionFailureCounter _failureCounter;
|
||||
|
||||
/// <summary>
|
||||
/// Compiled-regex cache keyed by pattern string. Lazy population: each
|
||||
/// pattern is compiled on first use and cached forever (the entry's
|
||||
/// <see cref="CompiledRegex"/> carries either the working <see cref="Regex"/>
|
||||
/// or a sentinel marking the pattern as invalid so we don't retry the
|
||||
/// failing compile on every call). ConcurrentDictionary is the right
|
||||
/// thread-safety primitive here because the filter is a DI singleton
|
||||
/// shared across the audit hot-path.
|
||||
/// </summary>
|
||||
private readonly ConcurrentDictionary<string, CompiledRegex> _regexCache = new();
|
||||
|
||||
/// <summary>
/// Constructor resolved by DI. The redaction-failure counter is optional:
/// when none is supplied a <see cref="NoOpAuditRedactionFailureCounter"/> is
/// used (a NoOp default is also registered in
/// <see cref="ServiceCollectionExtensions.AddAuditLog"/>).
/// </summary>
/// <param name="options">Live options monitor; must not be null.</param>
/// <param name="logger">Logger for redaction warnings; must not be null.</param>
/// <param name="failureCounter">Optional sink for redaction-failure counts.</param>
/// <exception cref="ArgumentNullException">
/// <paramref name="options"/> or <paramref name="logger"/> is null.
/// </exception>
public DefaultAuditPayloadFilter(
    IOptionsMonitor<AuditLogOptions> options,
    ILogger<DefaultAuditPayloadFilter> logger,
    IAuditRedactionFailureCounter? failureCounter = null)
{
    ArgumentNullException.ThrowIfNull(options);
    ArgumentNullException.ThrowIfNull(logger);

    _options = options;
    _logger = logger;

    if (failureCounter is null)
    {
        _failureCounter = new NoOpAuditRedactionFailureCounter();
    }
    else
    {
        _failureCounter = failureCounter;
    }
}
|
||||
|
||||
/// <summary>
/// Runs the redaction pipeline over one event and returns a copy with the
/// processed fields. Stages, in order: header redaction (request/response
/// summaries), body-regex redaction (all four text fields), SQL-parameter
/// redaction (request summary of <c>DbOutbound</c> rows only), then
/// truncation to the status-dependent byte cap.
/// </summary>
/// <param name="rawEvent">The event as emitted, before filtering.</param>
/// <returns>
/// A copy of <paramref name="rawEvent"/> with filtered fields and
/// <c>PayloadTruncated</c> set when it was already set or any field was
/// truncated. If the pipeline itself throws, the event is returned with its
/// payload fields unchanged and <c>PayloadTruncated = true</c>.
/// </returns>
public AuditEvent Apply(AuditEvent rawEvent)
{
    try
    {
        var settings = _options.CurrentValue;

        // Error rows get the larger cap so a verbose error body survives.
        int byteCap;
        if (IsErrorStatus(rawEvent.Status))
        {
            byteCap = settings.ErrorCapBytes;
        }
        else
        {
            byteCap = settings.DefaultCapBytes;
        }

        // Stage 1 — header redaction. Must precede truncation so the
        // redactor sees the complete JSON document.
        var requestSummary = RedactHeaders(rawEvent.RequestSummary, settings.HeaderRedactList);
        var responseSummary = RedactHeaders(rawEvent.ResponseSummary, settings.HeaderRedactList);
        var errorText = rawEvent.ErrorDetail;
        var extraText = rawEvent.Extra;

        // Stage 2 — body regexes. The set is resolved per event so
        // overrides keyed by AuditEvent.Target apply; with no global or
        // per-target patterns configured the set is empty and the stage
        // is skipped.
        var activeRedactors = ResolveBodyRegexes(settings, rawEvent.Target);
        if (activeRedactors.Count != 0)
        {
            requestSummary = RedactBody(requestSummary, activeRedactors);
            responseSummary = RedactBody(responseSummary, activeRedactors);
            errorText = RedactBody(errorText, activeRedactors);
            extraText = RedactBody(extraText, activeRedactors);
        }

        // Stage 3 — SQL parameter values (opt-in per target). Guarded by
        // channel so the regex can never touch a non-DbOutbound row such
        // as ApiOutbound.
        var isDbOutbound = rawEvent.Channel == AuditChannel.DbOutbound;
        if (isDbOutbound && TryGetSqlParamRedactor(settings, rawEvent.Target, out var paramNameRegex))
        {
            requestSummary = RedactSqlParameters(requestSummary, paramNameRegex!);
        }

        // Stage 4 — truncation of the already-redacted text.
        var anyTrimmed = false;
        requestSummary = TruncateField(requestSummary, byteCap, ref anyTrimmed);
        responseSummary = TruncateField(responseSummary, byteCap, ref anyTrimmed);
        errorText = TruncateField(errorText, byteCap, ref anyTrimmed);
        extraText = TruncateField(extraText, byteCap, ref anyTrimmed);

        return rawEvent with
        {
            RequestSummary = requestSummary,
            ResponseSummary = responseSummary,
            ErrorDetail = errorText,
            Extra = extraText,
            PayloadTruncated = rawEvent.PayloadTruncated || anyTrimmed,
        };
    }
    catch (Exception ex)
    {
        // Audit is best-effort: never fail the caller. This is the
        // last-resort handler for anything the stages did not absorb
        // themselves. Note that it hands back the event's payload fields
        // as received; only the PayloadTruncated flag is forced on.
        _logger.LogWarning(
            ex,
            "Payload filter failed; returning raw event with PayloadTruncated=true");
        try { _failureCounter.Increment(); } catch { /* swallow per §7 */ }
        return rawEvent with { PayloadTruncated = true };
    }
}
|
||||
|
||||
/// <summary>
/// Treats <paramref name="json"/> as a <c>{"headers": {...}, "body": ...}</c>
/// document and overwrites the value of every header whose NAME appears in
/// <paramref name="redactList"/> (case-insensitive) with
/// <see cref="RedactedMarker"/>, then re-serialises the document.
/// </summary>
/// <remarks>
/// Input is returned untouched when it is null, does not start with
/// <c>{</c> after leading whitespace, fails to parse as JSON, or has no
/// top-level <c>headers</c> object. Any other exception over-redacts: the
/// whole field becomes <see cref="RedactorErrorMarker"/> and the failure
/// counter is incremented.
/// </remarks>
/// <param name="json">Summary text to inspect; may be null.</param>
/// <param name="redactList">Header names whose values must be hidden.</param>
/// <returns>The redacted JSON, the original text, or the error marker.</returns>
private string? RedactHeaders(string? json, IList<string> redactList)
{
    if (json is null)
    {
        return null;
    }

    // Cheap structural pre-check before paying for a JSON parse: anything
    // that does not open with '{' cannot be the documented object shape.
    var leading = json.AsSpan().TrimStart();
    if (leading.IsEmpty || leading[0] != '{')
    {
        return json;
    }

    try
    {
        JsonNode? document;
        try
        {
            document = JsonNode.Parse(json);
        }
        catch (JsonException)
        {
            // Unparseable text is passed through as-is — no error is
            // recorded and nothing is redacted.
            return json;
        }

        if (document is JsonObject envelope && envelope["headers"] is JsonObject headerBag)
        {
            // Case-insensitive set gives one O(1) membership test per header.
            var sensitive = new HashSet<string>(redactList, StringComparer.OrdinalIgnoreCase);

            // Collect the matching names first; the JsonObject cannot be
            // mutated while it is being enumerated.
            var toMask = new List<string>(headerBag.Count);
            foreach (var header in headerBag)
            {
                if (sensitive.Contains(header.Key))
                {
                    toMask.Add(header.Key);
                }
            }

            foreach (var headerName in toMask)
            {
                headerBag[headerName] = JsonValue.Create(RedactedMarker);
            }

            return envelope.ToJsonString(RedactedSummaryJsonOptions);
        }

        // No top-level "headers" object — nothing to redact.
        return json;
    }
    catch (Exception ex)
    {
        _logger.LogWarning(
            ex,
            "Header redactor faulted; over-redacting field with '{Marker}'",
            RedactorErrorMarker);
        try { _failureCounter.Increment(); } catch { /* swallow per §7 */ }
        return RedactorErrorMarker;
    }
}
|
||||
|
||||
/// <summary>
/// Builds the compiled body-redactor set for one event: the global patterns
/// followed by the additions configured for <paramref name="target"/>, if
/// any. Patterns that do not yield a usable regex (see
/// <see cref="TryGetCompiledRegex"/>) are skipped so one bad pattern never
/// starves the rest.
/// </summary>
/// <remarks>
/// Tolerates a null <c>PerTargetOverrides</c> dictionary, a null override
/// entry and null pattern strings. Those can only come from a hand-edited or
/// partially bound configuration, but if they threw here the exception would
/// escape to the caller's catch-all and the whole event would skip redaction.
/// </remarks>
/// <param name="opts">Current options snapshot.</param>
/// <param name="target">Target name of the event; null means no per-target lookup.</param>
/// <returns>
/// The regexes to apply, global ones first; an empty list when nothing is
/// configured.
/// </returns>
private IReadOnlyList<Regex> ResolveBodyRegexes(AuditLogOptions opts, string? target)
{
    var globalPatterns = opts.GlobalBodyRedactors;

    List<string>? targetPatterns = null;
    if (target is not null
        && opts.PerTargetOverrides is { } overrides
        && overrides.TryGetValue(target, out var targetOverride)
        && targetOverride is not null)
    {
        targetPatterns = targetOverride.AdditionalBodyRedactors;
    }

    var hasGlobal = globalPatterns is { Count: > 0 };
    var hasTarget = targetPatterns is { Count: > 0 };

    // Common case: no redactors configured at all — avoid allocating.
    if (!hasGlobal && !hasTarget)
    {
        return Array.Empty<Regex>();
    }

    var result = new List<Regex>();
    if (globalPatterns is not null && hasGlobal)
    {
        AppendCompiled(globalPatterns);
    }

    if (targetPatterns is not null && hasTarget)
    {
        AppendCompiled(targetPatterns);
    }

    return result;

    // Adds every pattern that resolves to a usable compiled regex.
    void AppendCompiled(List<string> patterns)
    {
        foreach (var pattern in patterns)
        {
            // A null entry cannot be a cache key; skip it instead of
            // letting the lookup throw.
            if (pattern is not null && TryGetCompiledRegex(pattern, out var rx))
            {
                result.Add(rx!);
            }
        }
    }
}
|
||||
|
||||
/// <summary>
/// Looks up the compiled regex for <paramref name="pattern"/> in the cache,
/// compiling it via <see cref="CompileRegex"/> on first use. Rejected
/// patterns are cached as a sentinel entry, so a failing compile is
/// attempted (and logged) only once.
/// </summary>
/// <param name="pattern">
/// Regex pattern text. A null pattern is treated as unusable rather than
/// passed to the cache, because <see cref="ConcurrentDictionary{TKey,TValue}"/>
/// throws on a null key.
/// </param>
/// <param name="regex">
/// The compiled regex when the method returns <c>true</c>; otherwise null.
/// </param>
/// <returns>
/// <c>true</c> when a usable regex is available; <c>false</c> when the
/// pattern is null or was rejected at compile time.
/// </returns>
private bool TryGetCompiledRegex(
    string pattern,
    [System.Diagnostics.CodeAnalysis.NotNullWhen(true)] out Regex? regex)
{
    if (pattern is null)
    {
        regex = null;
        return false;
    }

    var entry = _regexCache.GetOrAdd(pattern, CompileRegex);
    if (entry.Regex is { } compiled)
    {
        regex = compiled;
        return true;
    }

    regex = null;
    return false;
}
|
||||
|
||||
/// <summary>
/// Cache-miss factory: compiles <paramref name="pattern"/> with
/// <see cref="RegexOptions.Compiled"/> and the shared match timeout, timing
/// the construction. A pattern that throws during construction, or whose
/// construction takes more than 100 ms, is logged as a warning and mapped to
/// <see cref="CompiledRegex.Invalid"/>.
/// </summary>
/// <param name="pattern">Regex pattern text to compile.</param>
/// <returns>A cache entry wrapping the regex, or the invalid sentinel.</returns>
private CompiledRegex CompileRegex(string pattern)
{
    try
    {
        var startTicks = System.Diagnostics.Stopwatch.GetTimestamp();
        var compiled = new Regex(pattern, RegexOptions.Compiled, RegexMatchTimeout);
        var tickDelta = System.Diagnostics.Stopwatch.GetTimestamp() - startTicks;

        // Stopwatch ticks -> milliseconds.
        var elapsedMs = tickDelta * 1000d / System.Diagnostics.Stopwatch.Frequency;
        var overBudget = elapsedMs > 100;
        if (!overBudget)
        {
            return new CompiledRegex(compiled);
        }

        _logger.LogWarning(
            "Body redactor pattern compiled in {Elapsed}ms (> 100ms cap); rejecting '{Pattern}'",
            elapsedMs, pattern);
        return CompiledRegex.Invalid;
    }
    catch (Exception ex)
    {
        _logger.LogWarning(
            ex,
            "Body redactor pattern '{Pattern}' failed to compile; skipping",
            pattern);
        return CompiledRegex.Invalid;
    }
}
|
||||
|
||||
/// <summary>
/// Runs <paramref name="value"/> through each regex in order, substituting
/// <see cref="RedactedMarker"/> for every match; each regex sees the output
/// of the previous one. If a replacement throws (for example a
/// <see cref="RegexMatchTimeoutException"/>), the fault is logged, the
/// failure counter is bumped on a best-effort basis, and the whole field is
/// replaced by <see cref="RedactorErrorMarker"/>.
/// </summary>
/// <param name="value">Field text to redact; <c>null</c> is passed through.</param>
/// <param name="regexes">Compiled redactors to apply, in order.</param>
/// <returns>The redacted text, the error marker after a fault, or <c>null</c> for <c>null</c> input.</returns>
private string? RedactBody(string? value, IReadOnlyList<Regex> regexes)
{
    if (value is null)
    {
        return null;
    }

    var redacted = value;
    for (var i = 0; i < regexes.Count; i++)
    {
        var redactor = regexes[i];
        try
        {
            redacted = redactor.Replace(redacted, RedactedMarker);
        }
        catch (Exception ex)
        {
            _logger.LogWarning(
                ex,
                "Body redactor '{Pattern}' faulted; over-redacting field with '{Marker}'",
                redactor.ToString(), RedactorErrorMarker);

            // The counter is telemetry only; a failing counter must not escape.
            try { _failureCounter.Increment(); } catch { /* swallow per §7 */ }
            return RedactorErrorMarker;
        }
    }

    return redacted;
}
|
||||
|
||||
/// <summary>
/// Finds the SQL parameter-name redaction regex configured for the connection
/// behind <paramref name="target"/>. The target is the connection name,
/// optionally followed by <c>.&lt;sql-snippet&gt;</c>; the per-target
/// overrides are keyed by connection name only, so everything from the first
/// <c>'.'</c> onwards is dropped before the lookup. The configured pattern is
/// prefixed with <c>(?i)</c> so matching is always case-insensitive.
/// </summary>
/// <param name="opts">Options snapshot holding the per-target overrides.</param>
/// <param name="target">Event target; <c>null</c> or empty yields <c>false</c>.</param>
/// <param name="regex">The compiled redactor on success; <c>null</c> otherwise.</param>
/// <returns>
/// <c>true</c> when an override exists for the connection, it specifies a
/// non-empty pattern, and that pattern resolves to a usable regex.
/// </returns>
private bool TryGetSqlParamRedactor(AuditLogOptions opts, string? target, out Regex? regex)
{
    regex = null;
    if (string.IsNullOrEmpty(target))
    {
        return false;
    }

    var separatorIndex = target.IndexOf('.');
    var connectionKey = separatorIndex >= 0 ? target.Substring(0, separatorIndex) : target;

    if (!opts.PerTargetOverrides.TryGetValue(connectionKey, out var connectionOverride))
    {
        return false;
    }

    var namePattern = connectionOverride.RedactSqlParamsMatching;
    if (string.IsNullOrEmpty(namePattern))
    {
        return false;
    }

    // The inline (?i) option is part of the cache key, so the case-insensitive
    // variant gets its own cache entry, distinct from the bare pattern text.
    return TryGetCompiledRegex("(?i)" + namePattern, out regex);
}
|
||||
|
||||
/// <summary>
/// Redacts SQL parameter values inside a
/// <c>{"sql":"...","parameters":{...}}</c> summary: every entry of the
/// top-level <c>"parameters"</c> object whose NAME matches
/// <paramref name="paramNameRegex"/> has its value replaced with
/// <see cref="RedactedMarker"/>, and the document is serialised again.
/// </summary>
/// <remarks>
/// The input is returned untouched when it is <c>null</c>, does not start
/// with <c>'{'</c> after leading whitespace, is not parseable JSON, is not a
/// JSON object, has no top-level <c>"parameters"</c> object, or when no
/// parameter name matched (which avoids a re-serialisation that could alter
/// whitespace or ordering). Any other fault is logged, counted on a
/// best-effort basis, and answered with <see cref="RedactorErrorMarker"/>.
/// </remarks>
/// <param name="json">Serialised request summary, possibly <c>null</c>.</param>
/// <param name="paramNameRegex">Regex tested against each parameter name.</param>
/// <returns>The redacted JSON, the original text, or the error marker.</returns>
private string? RedactSqlParameters(string? json, Regex paramNameRegex)
{
    if (json is null)
    {
        return null;
    }

    // Cheap pre-check: only text that looks like a JSON object is parsed.
    var leading = json.AsSpan().TrimStart();
    if (leading.IsEmpty || leading[0] != '{')
    {
        return json;
    }

    try
    {
        JsonNode? document;
        try
        {
            document = JsonNode.Parse(json);
        }
        catch (JsonException)
        {
            return json;
        }

        if (document is not JsonObject summary || summary["parameters"] is not JsonObject parameters)
        {
            return json;
        }

        // Copy the keys first: entries are replaced while walking them.
        var parameterNames = new List<string>(parameters.Count);
        foreach (var (parameterName, _) in parameters)
        {
            parameterNames.Add(parameterName);
        }

        var redactedAny = false;
        foreach (var parameterName in parameterNames)
        {
            bool isSensitive;
            try
            {
                isSensitive = paramNameRegex.IsMatch(parameterName);
            }
            catch (Exception ex)
            {
                return OverRedact(ex);
            }

            if (!isSensitive)
            {
                continue;
            }

            parameters[parameterName] = JsonValue.Create(RedactedMarker);
            redactedAny = true;
        }

        if (!redactedAny)
        {
            return json;
        }

        return summary.ToJsonString(RedactedSummaryJsonOptions);
    }
    catch (Exception ex)
    {
        return OverRedact(ex);
    }

    // Shared fault path: log, bump the counter (never letting it throw), and
    // replace the whole field with the error marker.
    string OverRedact(Exception fault)
    {
        _logger.LogWarning(
            fault,
            "SQL parameter redactor faulted; over-redacting field with '{Marker}'",
            RedactorErrorMarker);
        try { _failureCounter.Increment(); } catch { /* swallow per §7 */ }
        return RedactorErrorMarker;
    }
}
|
||||
|
||||
/// <summary>
/// Caps <paramref name="value"/> via <see cref="TruncateUtf8"/> using
/// <paramref name="cap"/> as the byte budget, and raises
/// <paramref name="truncated"/> when the result is shorter than the input.
/// The flag is only ever set, never cleared, so it can accumulate across
/// several fields.
/// </summary>
/// <param name="value">Field text; <c>null</c> is passed through unchanged.</param>
/// <param name="cap">Byte budget forwarded to <see cref="TruncateUtf8"/>.</param>
/// <param name="truncated">Set to <c>true</c> when this field was shortened.</param>
/// <returns>The (possibly shortened) text, or <c>null</c> for <c>null</c> input.</returns>
private static string? TruncateField(string? value, int cap, ref bool truncated)
{
    if (value is null)
    {
        return null;
    }

    var capped = TruncateUtf8(value, cap);
    truncated |= capped.Length != value.Length;
    return capped;
}
|
||||
|
||||
/// <summary>
/// Truncates <paramref name="value"/> so that its UTF-8 encoding occupies at
/// most <paramref name="capBytes"/> bytes without ever cutting a multi-byte
/// sequence in half. The cut position is moved back while the byte at that
/// position is a continuation byte (<c>(b &amp; 0xC0) == 0x80</c>), so the
/// retained prefix always ends on a character boundary.
/// </summary>
/// <param name="value">Text to cap; <c>null</c> or empty is returned as-is.</param>
/// <param name="capBytes">
/// Maximum size of the result in UTF-8 bytes. Zero or a negative value yields
/// an empty string for non-empty input.
/// </param>
/// <returns>
/// The original instance when it already fits; otherwise the longest prefix
/// that fits within the cap and ends on a character boundary.
/// </returns>
private static string TruncateUtf8(string value, int capBytes)
{
    if (string.IsNullOrEmpty(value))
    {
        return value;
    }

    // A negative cap used to reach Encoding.GetString with a negative count
    // and throw; nothing fits in a non-positive budget, so return empty.
    if (capBytes <= 0)
    {
        return string.Empty;
    }

    // Common case: the text fits. Counting first avoids allocating the byte
    // array at all for fields that need no truncation.
    if (Encoding.UTF8.GetByteCount(value) <= capBytes)
    {
        return value;
    }

    var bytes = Encoding.UTF8.GetBytes(value);

    // bytes.Length > capBytes here, so bytes[capBytes] is a valid index. It is
    // the first byte that would be dropped: if it is a continuation byte the
    // cut would land inside a sequence, so back up to that sequence's lead byte.
    var boundary = capBytes;
    while (boundary > 0 && (bytes[boundary] & 0xC0) == 0x80)
    {
        boundary--;
    }

    return Encoding.UTF8.GetString(bytes, 0, boundary);
}
|
||||
|
||||
/// <summary>
/// Classifies an audit status: <see cref="AuditStatus.Delivered"/>,
/// <see cref="AuditStatus.Submitted"/> and <see cref="AuditStatus.Forwarded"/>
/// are the non-error outcomes; every other value counts as an error.
/// </summary>
/// <param name="status">Status to classify.</param>
/// <returns><c>true</c> for any status other than the three listed above.</returns>
private static bool IsErrorStatus(AuditStatus status) =>
    status is not (AuditStatus.Delivered or AuditStatus.Submitted or AuditStatus.Forwarded);
|
||||
|
||||
/// <summary>
/// Value stored in the regex cache for one pattern. It either wraps the
/// working <see cref="Regex"/> or is the <see cref="Invalid"/> sentinel, whose
/// <see cref="Regex"/> is <c>null</c>. Caching the sentinel means a pattern
/// that was rejected once is not compiled again on later lookups.
/// </summary>
private readonly struct CompiledRegex
{
    /// <summary>Sentinel entry for a rejected pattern; carries no regex.</summary>
    public static readonly CompiledRegex Invalid = new CompiledRegex(null);

    /// <summary>Wraps <paramref name="regex"/>; pass <c>null</c> for an invalid entry.</summary>
    public CompiledRegex(Regex? regex)
    {
        Regex = regex;
    }

    /// <summary>The usable regex, or <c>null</c> when the pattern was rejected.</summary>
    public Regex? Regex { get; }
}
|
||||
}
|
||||
@@ -0,0 +1,30 @@
|
||||
using ScadaLink.Commons.Entities.Audit;
|
||||
|
||||
namespace ScadaLink.AuditLog.Payload;
|
||||
|
||||
/// <summary>
/// Sits between construction and persistence of an <see cref="AuditEvent"/>:
/// caps oversized payload fields, redacts headers, bodies and SQL parameters,
/// and records truncation in <see cref="AuditEvent.PayloadTruncated"/>.
/// </summary>
/// <remarks>
/// <para>
/// Implementations behave as a pure function — the input is left alone and a
/// filtered COPY (built with <c>with</c> expressions) is returned. They never
/// throw: an internal failure results in over-redaction plus an increment of
/// the <c>AuditRedactionFailure</c> health metric.
/// </para>
/// <para>
/// From M5 onwards the filter runs ahead of the writer chain
/// (<c>FallbackAuditWriter.WriteAsync</c>, <c>CentralAuditWriter.WriteAsync</c>)
/// and in the <c>AuditLogIngestActor</c> handlers.
/// </para>
/// </remarks>
public interface IAuditPayloadFilter
{
    /// <summary>
    /// Produces a filtered copy of <paramref name="rawEvent"/> with the
    /// configured truncation and redaction policy applied. MUST NOT throw —
    /// an internal failure is handled by over-redacting and reporting through
    /// the audit-redaction-failure health metric.
    /// </summary>
    /// <param name="rawEvent">The event as constructed by the emitter.</param>
    /// <returns>The filtered copy to persist.</returns>
    AuditEvent Apply(AuditEvent rawEvent);
}
|
||||
@@ -0,0 +1,20 @@
|
||||
namespace ScadaLink.AuditLog.Payload;
|
||||
|
||||
/// <summary>
/// Sink that <see cref="DefaultAuditPayloadFilter"/> notifies each time one of
/// its redactors (header, body regex or SQL parameter) throws and the affected
/// field has to be over-redacted with the
/// <c>&lt;redacted: redactor error&gt;</c> marker. Bundle C forwards the count
/// into the Site Health Monitoring report payload as
/// <c>AuditRedactionFailure</c>.
/// </summary>
/// <remarks>
/// Per alog.md §7 a redaction failure must NEVER abort the user-facing
/// action; the filter over-redacts and reports through this counter instead.
/// Until the health metric is wired in, a NoOp implementation is the safe
/// default binding.
/// </remarks>
public interface IAuditRedactionFailureCounter
{
    /// <summary>Adds one to the audit-redaction failure count.</summary>
    void Increment();
}
|
||||
@@ -0,0 +1,17 @@
|
||||
namespace ScadaLink.AuditLog.Payload;
|
||||
|
||||
/// <summary>
/// Do-nothing <see cref="IAuditRedactionFailureCounter"/> used as the default
/// binding while the Site Health Monitoring bridge is not wired. Bundle C
/// swaps this registration for the real counter, which shows up in the site
/// health report payload as <c>AuditRedactionFailure</c>.
/// </summary>
public sealed class NoOpAuditRedactionFailureCounter : IAuditRedactionFailureCounter
{
    /// <inheritdoc/>
    public void Increment()
    {
        // Deliberately a no-op: the real health-metric counter replaces this
        // binding in Bundle C.
    }
}
|
||||
@@ -0,0 +1,35 @@
|
||||
<Project Sdk="Microsoft.NET.Sdk">

  <PropertyGroup>
    <TargetFramework>net10.0</TargetFramework>
    <ImplicitUsings>enable</ImplicitUsings>
    <Nullable>enable</Nullable>
    <TreatWarningsAsErrors>true</TreatWarningsAsErrors>
  </PropertyGroup>

  <ItemGroup>
    <!-- Akka is referenced explicitly because SiteAuditTelemetryActor (Bundle D D1)
         and AuditLogIngestActor (D2) are hosted in this project. -->
    <PackageReference Include="Akka" />
    <PackageReference Include="Microsoft.Data.Sqlite" />
    <PackageReference Include="Microsoft.Extensions.DependencyInjection.Abstractions" />
    <PackageReference Include="Microsoft.Extensions.Logging.Abstractions" />
    <PackageReference Include="Microsoft.Extensions.Options" />
    <PackageReference Include="Microsoft.Extensions.Options.ConfigurationExtensions" />
  </ItemGroup>

  <ItemGroup>
    <ProjectReference Include="../ScadaLink.Commons/ScadaLink.Commons.csproj" />
    <!-- Audit Log (#23) is a sibling of Notification Outbox (#21) and Site Call Audit (#22).
         ScadaLink.ConfigurationDatabase registers IAuditLogRepository; the reference is
         recorded here so the M2 writers and telemetry actors can depend on it. -->
    <ProjectReference Include="../ScadaLink.ConfigurationDatabase/ScadaLink.ConfigurationDatabase.csproj" />
    <!-- ScadaLink.Communication supplies the IngestAuditEvents proto and DTOs (#23 M2 site sync). -->
    <ProjectReference Include="../ScadaLink.Communication/ScadaLink.Communication.csproj" />
  </ItemGroup>

  <ItemGroup>
    <InternalsVisibleTo Include="ScadaLink.AuditLog.Tests" />
  </ItemGroup>

</Project>
|
||||
@@ -0,0 +1,310 @@
|
||||
using Microsoft.Extensions.Configuration;
|
||||
using Microsoft.Extensions.DependencyInjection;
|
||||
using Microsoft.Extensions.DependencyInjection.Extensions;
|
||||
using Microsoft.Extensions.Hosting;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using Microsoft.Extensions.Options;
|
||||
using ScadaLink.AuditLog.Central;
|
||||
using ScadaLink.AuditLog.Configuration;
|
||||
using ScadaLink.AuditLog.Payload;
|
||||
using ScadaLink.AuditLog.Site;
|
||||
using ScadaLink.AuditLog.Site.Telemetry;
|
||||
using ScadaLink.Commons.Interfaces.Services;
|
||||
|
||||
namespace ScadaLink.AuditLog;
|
||||
|
||||
/// <summary>
|
||||
/// Composition root for the Audit Log (#23) component.
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// <para>
|
||||
/// M1 registered <see cref="AuditLogOptions"/> + the validator. M2 Bundle E
|
||||
/// extends the surface with the site-side writer chain
|
||||
/// (<see cref="SqliteAuditWriter"/> + <see cref="RingBufferFallback"/> +
|
||||
/// <see cref="FallbackAuditWriter"/>) and the telemetry collaborators
|
||||
/// (<see cref="ISiteAuditQueue"/>, <see cref="ISiteStreamAuditClient"/>,
|
||||
/// <see cref="IAuditWriteFailureCounter"/>, <see cref="SiteAuditTelemetryOptions"/>,
|
||||
/// <see cref="SqliteAuditWriterOptions"/>).
|
||||
/// </para>
|
||||
/// <para>
|
||||
/// Audit Log (#23) sits alongside Notification Outbox (#21) and Site Call
|
||||
/// Audit (#22). <c>IAuditLogRepository</c> is registered by
|
||||
/// <c>ScadaLink.ConfigurationDatabase.ServiceCollectionExtensions.AddConfigurationDatabase</c>,
|
||||
/// so the caller (the Host on the central node) must also call that.
|
||||
/// </para>
|
||||
/// </remarks>
|
||||
public static class ServiceCollectionExtensions
|
||||
{
|
||||
/// <summary>Root configuration section; bound to <see cref="AuditLogOptions"/>.</summary>
public const string ConfigSectionName = "AuditLog";

/// <summary>Sub-section bound to <see cref="SqliteAuditWriterOptions"/> (<c>AuditLog:SiteWriter</c>).</summary>
public const string SiteWriterSectionName = ConfigSectionName + ":SiteWriter";

/// <summary>Sub-section bound to <see cref="SiteAuditTelemetryOptions"/> (<c>AuditLog:SiteTelemetry</c>).</summary>
public const string SiteTelemetrySectionName = ConfigSectionName + ":SiteTelemetry";

/// <summary>Sub-section bound to <see cref="AuditLogPartitionMaintenanceOptions"/> (<c>AuditLog:PartitionMaintenance</c>).</summary>
public const string PartitionMaintenanceSectionName = ConfigSectionName + ":PartitionMaintenance";
|
||||
|
||||
/// <summary>
/// Adds the Audit Log (#23) services to <paramref name="services"/>: the
/// options bindings, the payload filter, the site SQLite writer chain
/// (primary writer, ring fallback, failure-counter sink), the site-to-central
/// telemetry collaborators and the central direct-write writer. Repeated
/// registration is not supported — call this exactly once per
/// <see cref="IServiceCollection"/>.
/// </summary>
/// <param name="services">Collection that receives the registrations.</param>
/// <param name="config">Configuration containing the <c>AuditLog</c> section and its sub-sections.</param>
/// <returns>The same <paramref name="services"/> instance, for chaining.</returns>
/// <exception cref="ArgumentNullException">Either argument is <c>null</c>.</exception>
public static IServiceCollection AddAuditLog(this IServiceCollection services, IConfiguration config)
{
    ArgumentNullException.ThrowIfNull(services);
    ArgumentNullException.ThrowIfNull(config);

    // M1 — top-level options (redaction policy, payload caps, ...) validated
    // at startup by the dedicated validator.
    services.AddOptions<AuditLogOptions>()
        .Bind(config.GetSection(ConfigSectionName))
        .ValidateOnStart();
    services.AddSingleton<IValidateOptions<AuditLogOptions>, AuditLogOptionsValidator>();

    // M5 Bundle A/B — payload filter: truncation of RequestSummary /
    // ResponseSummary / ErrorDetail / Extra plus header, body and
    // SQL-parameter redaction, applied between event construction and
    // persistence. One shared instance; option hot reloads (M5-T8) reach it
    // through its IOptionsMonitor dependency.
    services.AddSingleton<IAuditPayloadFilter, DefaultAuditPayloadFilter>();

    // M5 Bundle B — redactor-failure counter. Only registered when nothing
    // else has claimed the service; Bundle C swaps in the Site Health
    // Monitoring bridge (AuditRedactionFailure on the site health report).
    services.TryAddSingleton<IAuditRedactionFailureCounter, NoOpAuditRedactionFailureCounter>();

    // M2 Bundle E — site writer and telemetry options. Sections are resolved
    // explicitly rather than via BindConfiguration because the supplied
    // IConfiguration need not be the application root (e.g. a test stub
    // anchored on the parent of the AuditLog section).
    services.AddOptions<SqliteAuditWriterOptions>()
        .Bind(config.GetSection(SiteWriterSectionName));
    services.AddOptions<SiteAuditTelemetryOptions>()
        .Bind(config.GetSection(SiteTelemetrySectionName));

    // The SQLite writer owns one connection and one background writer task,
    // so a second instance would race on the same file. It is registered by
    // concrete type and ISiteAuditQueue forwards to that same instance, which
    // lets the telemetry actor observe rows written through the hot path.
    services
        .AddSingleton<SqliteAuditWriter>()
        .AddSingleton<ISiteAuditQueue>(static provider => provider.GetRequiredService<SqliteAuditWriter>());

    // Drop-oldest in-memory ring that FallbackAuditWriter falls back to when
    // the primary SQLite writer throws; the default capacity (1024) suffices
    // for M2.
    services.AddSingleton<RingBufferFallback>();

    // Write-failure counter: NoOp here, replaced by the real Site Health
    // Monitoring counter in Bundle G. Declared ahead of the writer factory
    // that consumes it.
    services.AddSingleton<IAuditWriteFailureCounter, NoOpAuditWriteFailureCounter>();

    // Script threads write through FallbackAuditWriter (primary + ring +
    // counter) rather than the raw SQLite writer, because a primary failure
    // must NEVER abort the user-facing action. Since Bundle C (M5-T6) the
    // payload filter is passed in too, so events are truncated and redacted
    // before reaching SQLite or, on failure, the ring.
    services.AddSingleton<IAuditWriter>(BuildFallbackWriter);

    // NoOp stream client: correct for central and test composition roots,
    // which have no SiteCommunicationActor. On site roles the Host hands the
    // real ClusterClientSiteAuditClient straight to the
    // SiteAuditTelemetryActor's Props.Create call — it needs the
    // SiteCommunicationActor IActorRef, which only exists after Akka
    // bootstrap and therefore cannot be a DI singleton.
    services.AddSingleton<ISiteStreamAuditClient, NoOpSiteStreamAuditClient>();

    // M3 Bundle F — dual emitter for cached-call lifecycle telemetry. The
    // audit half goes through IAuditWriter (always registered above); the
    // operational half goes through IOperationTrackingStore, which only
    // ScadaLink.SiteRuntime registers. On central the store is absent, so
    // the factory resolves it optionally and the forwarder degrades to
    // audit-only emission.
    services.AddSingleton<ICachedCallTelemetryForwarder>(BuildTelemetryForwarder);

    // M3 Bundle F — bridge from the store-and-forward retry-loop observer
    // hook to the forwarder, so per-attempt and terminal telemetry from the
    // retry sweep lands on the same SQLite hot path as the script-thread
    // CachedSubmit row. Also exposed as ICachedCallLifecycleObserver so
    // AddStoreAndForward can resolve it.
    services
        .AddSingleton<CachedCallLifecycleBridge>()
        .AddSingleton<ICachedCallLifecycleObserver>(
            static provider => provider.GetRequiredService<CachedCallLifecycleBridge>());

    // M6 Bundle E (T8) — central write-failure counter. NoOp unless already
    // registered; AddAuditLogCentralMaintenance replaces it with the
    // AuditCentralHealthSnapshot so increments reach the central dashboard.
    services.TryAddSingleton<ICentralAuditWriteFailureCounter, NoOpCentralAuditWriteFailureCounter>();

    // M4 Bundle B — central direct-write writer for rows that originate ON
    // central (NotificationOutboxActor, Inbound API) rather than arriving via
    // site telemetry. The writer receives the IServiceProvider so it can open
    // a scope per call for the SCOPED IAuditLogRepository registered by
    // ScadaLink.ConfigurationDatabase. ICentralAuditWriter is deliberately a
    // different interface from IAuditWriter so site roots cannot bind it by
    // accident. It also receives the payload filter (M5-T6) and the central
    // failure counter (M6 T8).
    services.AddSingleton<ICentralAuditWriter>(BuildCentralWriter);

    return services;

    static IAuditWriter BuildFallbackWriter(IServiceProvider provider) =>
        new FallbackAuditWriter(
            primary: provider.GetRequiredService<SqliteAuditWriter>(),
            ring: provider.GetRequiredService<RingBufferFallback>(),
            failureCounter: provider.GetRequiredService<IAuditWriteFailureCounter>(),
            logger: provider.GetRequiredService<ILogger<FallbackAuditWriter>>(),
            filter: provider.GetRequiredService<IAuditPayloadFilter>());

    static ICachedCallTelemetryForwarder BuildTelemetryForwarder(IServiceProvider provider) =>
        new CachedCallTelemetryForwarder(
            provider.GetRequiredService<IAuditWriter>(),
            provider.GetService<ScadaLink.Commons.Interfaces.IOperationTrackingStore>(),
            provider.GetRequiredService<ILogger<CachedCallTelemetryForwarder>>());

    static ICentralAuditWriter BuildCentralWriter(IServiceProvider provider) =>
        new CentralAuditWriter(
            provider,
            provider.GetRequiredService<ILogger<CentralAuditWriter>>(),
            provider.GetRequiredService<IAuditPayloadFilter>(),
            provider.GetRequiredService<ICentralAuditWriteFailureCounter>());
}
|
||||
|
||||
/// <summary>
/// Audit Log (#23) M2 Bundle G + M5 Bundle C — replaces the
/// <see cref="NoOpAuditWriteFailureCounter"/> and
/// <see cref="NoOpAuditRedactionFailureCounter"/> defaults with the
/// <see cref="HealthMetricsAuditWriteFailureCounter"/> and
/// <see cref="HealthMetricsAuditRedactionFailureCounter"/> bridges, so both
/// the FallbackAuditWriter primary-failure count and the
/// DefaultAuditPayloadFilter redactor-failure count appear in the site health
/// report payload (<c>SiteHealthReport.SiteAuditWriteFailures</c> and
/// <c>SiteHealthReport.AuditRedactionFailure</c>). Also registers the
/// <see cref="SiteAuditBacklogReporter"/> hosted service.
/// </summary>
/// <remarks>
/// <para>
/// Call it AFTER <see cref="AddAuditLog"/> (which registers the NoOp defaults
/// replaced here) and after
/// <c>ScadaLink.HealthMonitoring.ServiceCollectionExtensions.AddHealthMonitoring</c>
/// or <c>AddSiteHealthMonitoring</c> (which register the
/// <see cref="ISiteHealthCollector"/> the bridges depend on). Without the
/// collector, resolving <see cref="IAuditWriteFailureCounter"/> or
/// <see cref="IAuditRedactionFailureCounter"/> throws
/// <see cref="InvalidOperationException"/> from <c>GetRequiredService</c> —
/// intentionally, because a silent NoOp would hide the misconfiguration.
/// </para>
/// <para>
/// Idempotent: a second call replaces each descriptor again instead of
/// stacking registrations.
/// </para>
/// <para>
/// For M5 this is site-side only. The central composition root keeps the
/// NoOp defaults; the central health-metric surface exposing
/// <c>AuditRedactionFailure</c> arrives in M6.
/// </para>
/// </remarks>
/// <param name="services">Collection whose counter registrations are replaced.</param>
/// <returns>The same <paramref name="services"/> instance, for chaining.</returns>
/// <exception cref="ArgumentNullException"><paramref name="services"/> is <c>null</c>.</exception>
public static IServiceCollection AddAuditLogHealthMetricsBridge(this IServiceCollection services)
{
    ArgumentNullException.ThrowIfNull(services);

    var writeFailureBridge =
        ServiceDescriptor.Singleton<IAuditWriteFailureCounter, HealthMetricsAuditWriteFailureCounter>();
    var redactionFailureBridge =
        ServiceDescriptor.Singleton<IAuditRedactionFailureCounter, HealthMetricsAuditRedactionFailureCounter>();

    // M6 Bundle E (T6): the backlog reporter polls SqliteAuditWriter every
    // 30 s and pushes the snapshot into the collector so the next
    // SiteHealthReport carries a fresh SiteAuditBacklog value. It lives here,
    // with the other site-only bridges, so that AddAuditLog — which also runs
    // on central — registers no hosted service that would need the
    // ISiteHealthCollector missing on central.
    return services
        .Replace(writeFailureBridge)
        .Replace(redactionFailureBridge)
        .AddHostedService<SiteAuditBacklogReporter>();
}
|
||||
|
||||
/// <summary>
/// Audit Log (#23) M6-T5 Bundle D — central-only registrations: the
/// <see cref="AuditLogPartitionMaintenanceService"/> hosted service with its
/// <see cref="AuditLogPartitionMaintenanceOptions"/> binding, and the central
/// health snapshot with the counters that feed it. Call it from the Central
/// role's composition root only; the <c>IPartitionMaintenance</c>
/// implementation it relies on is registered by
/// <c>AddConfigurationDatabase</c> and exists only on the central node.
/// </summary>
/// <remarks>
/// <para>
/// Kept apart from <see cref="AddAuditLog"/> because that method also runs in
/// site composition roots, where a hosted service resolving an unregistered
/// dependency would fail on every tick. A dedicated helper preserves the
/// invariant that every <c>Add*</c> call is safe from any composition root.
/// </para>
/// </remarks>
/// <param name="services">Collection that receives the registrations.</param>
/// <param name="config">Configuration containing the <c>AuditLog:PartitionMaintenance</c> section.</param>
/// <returns>The same <paramref name="services"/> instance, for chaining.</returns>
/// <exception cref="ArgumentNullException">Either argument is <c>null</c>.</exception>
public static IServiceCollection AddAuditLogCentralMaintenance(
    this IServiceCollection services,
    IConfiguration config)
{
    ArgumentNullException.ThrowIfNull(services);
    ArgumentNullException.ThrowIfNull(config);

    var maintenanceSection = config.GetSection(PartitionMaintenanceSectionName);
    services.AddOptions<AuditLogPartitionMaintenanceOptions>().Bind(maintenanceSection);
    services.AddHostedService<AuditLogPartitionMaintenanceService>();

    // M6 Bundle E (T8 + T9) — the central health snapshot is one object that
    // owns the CentralAuditWriteFailures and AuditRedactionFailure counters
    // and exposes them through IAuditCentralHealthSnapshot. It is registered
    // by concrete type; the read-side interface and the write-failure counter
    // both forward to that single instance. It has no actor-system
    // dependency: the per-site stalled latch is fed by
    // SiteAuditTelemetryStalledTracker, which the Akka bootstrap creates
    // after ActorSystem.Create returns (it needs the ActorSystem, so it is
    // not registered here).
    services.AddSingleton<AuditCentralHealthSnapshot>();
    services.AddSingleton<IAuditCentralHealthSnapshot>(
        static provider => provider.GetRequiredService<AuditCentralHealthSnapshot>());

    var centralWriteFailureCounter = ServiceDescriptor.Singleton<ICentralAuditWriteFailureCounter>(
        static provider => provider.GetRequiredService<AuditCentralHealthSnapshot>());
    services.Replace(centralWriteFailureCounter);

    // M6 Bundle E (T9) — swap the NoOp IAuditRedactionFailureCounter from
    // AddAuditLog for CentralAuditRedactionFailureCounter, a thin wrapper over
    // the snapshot singleton, so payload-filter faults on the
    // CentralAuditWriter / AuditLogIngestActor paths reach the central
    // dashboard. Site composition roots override this binding again through
    // AddAuditLogHealthMetricsBridge; central nodes never call that bridge, so
    // on a central host this binding is final. The shape mirrors the M5
    // Bundle C HealthMetricsAuditRedactionFailureCounter.
    var centralRedactionFailureCounter =
        ServiceDescriptor.Singleton<IAuditRedactionFailureCounter, CentralAuditRedactionFailureCounter>();
    services.Replace(centralRedactionFailureCounter);

    return services;
}
|
||||
}
|
||||
@@ -0,0 +1,151 @@
|
||||
using Microsoft.Extensions.Logging;
|
||||
using ScadaLink.AuditLog.Payload;
|
||||
using ScadaLink.Commons.Entities.Audit;
|
||||
using ScadaLink.Commons.Interfaces.Services;
|
||||
|
||||
namespace ScadaLink.AuditLog.Site;
|
||||
|
||||
/// <summary>
/// Composes the primary <see cref="SqliteAuditWriter"/> with a drop-oldest
/// <see cref="RingBufferFallback"/>. Audit writes are best-effort by contract
/// (see <see cref="IAuditWriter"/>) — a primary failure must NEVER bubble out
/// to the calling script. Failed events are stashed in the ring; on the next
/// successful primary write the ring is drained back through the primary in
/// FIFO order.
/// </summary>
/// <remarks>
/// <para>
/// Each primary failure increments <see cref="IAuditWriteFailureCounter"/> so
/// Site Health Monitoring can surface a sustained outage as
/// <c>SiteAuditWriteFailures</c> (Bundle G).
/// </para>
/// <para>
/// Errors raised by the ring drain on recovery are logged and swallowed so we
/// don't loop the failure mode — the trigger event itself succeeded, and
/// retrying the drain on the NEXT successful write is the recovery path. The
/// same applies to cancellation: a cancelled token skips the drain instead of
/// throwing, because the triggering write has already been persisted.
/// </para>
/// </remarks>
public sealed class FallbackAuditWriter : IAuditWriter
{
    private readonly IAuditWriter _primary;
    private readonly RingBufferFallback _ring;
    private readonly IAuditWriteFailureCounter _failureCounter;
    private readonly ILogger<FallbackAuditWriter> _logger;
    private readonly IAuditPayloadFilter? _filter;

    // Only one drain may run at a time; contenders skip rather than queue.
    private readonly SemaphoreSlim _drainGate = new(1, 1);

    /// <summary>
    /// Bundle C (M5-T6) wires the singleton <see cref="IAuditPayloadFilter"/>
    /// here so every event written via the site hot path is truncated +
    /// header/body/SQL-param redacted before it hits both the primary SQLite
    /// writer AND the ring fallback. The parameter is optional (defaults to
    /// no filtering) so the long tail of test composition roots that don't
    /// care about the filter need no change — the production
    /// <see cref="ServiceCollectionExtensions.AddAuditLog"/> registration
    /// always passes the real filter through.
    /// </summary>
    /// <param name="primary">Writer tried first for every event.</param>
    /// <param name="ring">In-memory buffer that holds events the primary rejected.</param>
    /// <param name="failureCounter">Incremented once per primary write that throws.</param>
    /// <param name="logger">Receives a warning for every primary failure.</param>
    /// <param name="filter">Optional payload filter; <see langword="null"/> means pass-through.</param>
    /// <exception cref="ArgumentNullException">
    /// Any of <paramref name="primary"/>, <paramref name="ring"/>,
    /// <paramref name="failureCounter"/> or <paramref name="logger"/> is <see langword="null"/>.
    /// </exception>
    public FallbackAuditWriter(
        IAuditWriter primary,
        RingBufferFallback ring,
        IAuditWriteFailureCounter failureCounter,
        ILogger<FallbackAuditWriter> logger,
        IAuditPayloadFilter? filter = null)
    {
        _primary = primary ?? throw new ArgumentNullException(nameof(primary));
        _ring = ring ?? throw new ArgumentNullException(nameof(ring));
        _failureCounter = failureCounter ?? throw new ArgumentNullException(nameof(failureCounter));
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
        _filter = filter; // null = no-op pass-through; see WriteAsync.
    }

    /// <summary>
    /// Filters <paramref name="evt"/>, writes it to the primary, and on
    /// failure parks the filtered copy in the ring. After a successful
    /// primary write any backlog held in the ring is replayed.
    /// </summary>
    /// <param name="evt">Event to record; must not be <see langword="null"/>.</param>
    /// <param name="ct">Token forwarded to the primary writer.</param>
    /// <exception cref="ArgumentNullException"><paramref name="evt"/> is <see langword="null"/>.</exception>
    public async Task WriteAsync(AuditEvent evt, CancellationToken ct = default)
    {
        ArgumentNullException.ThrowIfNull(evt);

        // Filter once, up-front. The filtered event flows BOTH to the primary
        // and (on failure) to the ring buffer — so a primary outage that
        // drains later still hands the primary a row that has already been
        // truncated and redacted. The null-coalesce keeps composition roots
        // that don't wire a filter working unchanged.
        var filtered = _filter?.Apply(evt) ?? evt;

        try
        {
            await _primary.WriteAsync(filtered, ct).ConfigureAwait(false);
        }
        catch (Exception ex)
        {
            // Primary down: record the failure, stash in the ring, return
            // success to the caller. Audit-write failures NEVER abort the
            // user-facing action (alog.md §7). DO NOT attempt the ring drain
            // here — primary is throwing, draining would just scramble FIFO
            // order across re-enqueues.
            _failureCounter.Increment();
            _logger.LogWarning(ex,
                "Primary audit writer threw; routing EventId {EventId} to drop-oldest ring.",
                filtered.EventId);

            // Ring stores the filtered copy so the eventual drain replays a
            // payload that has already been capped/redacted — no second
            // filter pass needed on recovery, and no risk of the ring
            // holding the raw oversized blob in memory.
            _ring.TryEnqueue(filtered);
            return;
        }

        // Primary succeeded — opportunistically drain anything that piled up
        // in the ring during the outage. Drain order in the audit log is
        // therefore: <triggering event>, <backlog FIFO>.
        if (_ring.Count > 0)
        {
            await TryDrainRingAsync(ct).ConfigureAwait(false);
        }
    }

    /// <summary>
    /// Replays the events currently buffered in the ring through the primary.
    /// Never throws: a failing replay is counted, logged, re-enqueued and
    /// ends the drain until the next successful write.
    /// </summary>
    private async Task TryDrainRingAsync(CancellationToken ct)
    {
        // The triggering write already succeeded, so nothing on this path may
        // surface to the caller. SemaphoreSlim.WaitAsync(0, ct) throws
        // OperationCanceledException when ct is already cancelled, which would
        // leak out of WriteAsync; instead skip the drain on cancellation and
        // use the token-free, non-blocking Wait(0) try-acquire.
        if (ct.IsCancellationRequested || !_drainGate.Wait(0))
        {
            // Either the caller is shutting down or another drain is already
            // running (serialised so two recoveries don't double-replay).
            return;
        }

        try
        {
            // Pull only what is currently buffered; do NOT wait for new events.
            // Iterating over a snapshot of Count bounds the loop even while
            // other callers keep enqueuing.
            var pending = _ring.Count;
            for (var i = 0; i < pending; i++)
            {
                if (!_ring.TryDequeue(out var queued))
                {
                    break;
                }

                try
                {
                    await _primary.WriteAsync(queued, ct).ConfigureAwait(false);
                }
                catch (Exception ex)
                {
                    // Primary fell over again. The ring has no way to put an
                    // event back at its head, so the event is routed to the
                    // tail (drop-oldest preserves the most-recent picture).
                    _failureCounter.Increment();
                    _logger.LogWarning(ex,
                        "Ring drain re-throw on EventId {EventId}; re-enqueuing.",
                        queued.EventId);
                    _ring.TryEnqueue(queued);
                    break;
                }
            }
        }
        finally
        {
            _drainGate.Release();
        }
    }
}
|
||||
@@ -0,0 +1,48 @@
|
||||
using ScadaLink.AuditLog.Payload;
|
||||
using ScadaLink.HealthMonitoring;
|
||||
|
||||
namespace ScadaLink.AuditLog.Site;
|
||||
|
||||
/// <summary>
/// Audit Log (#23) M5 Bundle C — adapter that forwards every
/// <see cref="IAuditRedactionFailureCounter.Increment"/> call to
/// <see cref="ISiteHealthCollector.IncrementAuditRedactionFailure"/>.
/// <see cref="DefaultAuditPayloadFilter"/> bumps the counter whenever a
/// header / body / SQL parameter redactor stage throws and the filter has to
/// over-redact the offending field; routing it through the collector makes the
/// count appear in the site health report payload as
/// <c>SiteHealthReport.AuditRedactionFailure</c>.
/// </summary>
/// <remarks>
/// <para>
/// Registered by <see cref="ServiceCollectionExtensions.AddAuditLogHealthMetricsBridge"/>;
/// <c>AddHealthMonitoring()</c> must be registered first so that
/// <see cref="ISiteHealthCollector"/> can be resolved. Nodes that do not wire
/// Site Health Monitoring keep the <see cref="NoOpAuditRedactionFailureCounter"/>
/// installed by <see cref="ServiceCollectionExtensions.AddAuditLog"/> (the
/// silent-sink contract — redaction failures must NEVER abort the user-facing
/// action, alog.md §7).
/// </para>
/// <para>
/// Deliberately shaped like the M2 Bundle G
/// <see cref="HealthMetricsAuditWriteFailureCounter"/> so the two
/// health-metric bridges age together.
/// </para>
/// <para>
/// This bridge is site-side only. Central hosts do not call
/// <c>AddAuditLogHealthMetricsBridge</c>; their central registration binds
/// <see cref="IAuditRedactionFailureCounter"/> to the M6
/// <c>CentralAuditRedactionFailureCounter</c> instead.
/// </para>
/// </remarks>
public sealed class HealthMetricsAuditRedactionFailureCounter : IAuditRedactionFailureCounter
{
    private readonly ISiteHealthCollector _collector;

    /// <summary>Creates the bridge over the given health collector.</summary>
    /// <param name="collector">Collector that receives each increment.</param>
    /// <exception cref="ArgumentNullException"><paramref name="collector"/> is <see langword="null"/>.</exception>
    public HealthMetricsAuditRedactionFailureCounter(ISiteHealthCollector collector)
    {
        ArgumentNullException.ThrowIfNull(collector);
        _collector = collector;
    }

    /// <inheritdoc/>
    public void Increment()
    {
        _collector.IncrementAuditRedactionFailure();
    }
}
|
||||
@@ -0,0 +1,33 @@
|
||||
using ScadaLink.HealthMonitoring;
|
||||
|
||||
namespace ScadaLink.AuditLog.Site;
|
||||
|
||||
/// <summary>
/// Audit Log (#23) M2 Bundle G — adapter that forwards every
/// <see cref="IAuditWriteFailureCounter.Increment"/> call to
/// <see cref="ISiteHealthCollector.IncrementSiteAuditWriteFailures"/>.
/// <see cref="FallbackAuditWriter"/> bumps the counter each time the primary
/// SQLite writer throws; routing it through the collector makes the count
/// appear in the site health report payload as
/// <c>SiteHealthReport.SiteAuditWriteFailures</c>.
/// </summary>
/// <remarks>
/// <para>
/// Registered by <see cref="ServiceCollectionExtensions.AddAuditLogHealthMetricsBridge"/>;
/// <c>AddHealthMonitoring()</c> must be registered first so that
/// <see cref="ISiteHealthCollector"/> can be resolved. Nodes that do not wire
/// Site Health Monitoring keep the <see cref="NoOpAuditWriteFailureCounter"/>
/// installed by <see cref="ServiceCollectionExtensions.AddAuditLog"/> (the
/// silent-sink contract — audit write failures must NEVER abort the
/// user-facing action, alog.md §7).
/// </para>
/// </remarks>
public sealed class HealthMetricsAuditWriteFailureCounter : IAuditWriteFailureCounter
{
    private readonly ISiteHealthCollector _collector;

    /// <summary>Creates the bridge over the given health collector.</summary>
    /// <param name="collector">Collector that receives each increment.</param>
    /// <exception cref="ArgumentNullException"><paramref name="collector"/> is <see langword="null"/>.</exception>
    public HealthMetricsAuditWriteFailureCounter(ISiteHealthCollector collector)
    {
        ArgumentNullException.ThrowIfNull(collector);
        _collector = collector;
    }

    /// <inheritdoc/>
    public void Increment()
    {
        _collector.IncrementSiteAuditWriteFailures();
    }
}
|
||||
@@ -0,0 +1,14 @@
|
||||
namespace ScadaLink.AuditLog.Site;
|
||||
|
||||
/// <summary>
/// Minimal sink for "an audit write failed" notifications.
/// <see cref="FallbackAuditWriter"/> calls <see cref="Increment"/> once for
/// every write its primary <see cref="SqliteAuditWriter"/> rejects with an
/// exception.
/// </summary>
/// <remarks>
/// Bundle G (M2-T11) supplies the real implementation: a thread-safe
/// Interlocked counter bridged into the Site Health Monitoring report payload
/// as <c>SiteAuditWriteFailures</c>.
/// </remarks>
public interface IAuditWriteFailureCounter
{
    /// <summary>Records one audit-write failure (adds one to the counter).</summary>
    void Increment();
}
|
||||
@@ -0,0 +1,25 @@
|
||||
namespace ScadaLink.AuditLog.Site;
|
||||
|
||||
/// <summary>
/// Do-nothing <see cref="IAuditWriteFailureCounter"/>. This is the binding
/// <see cref="ScadaLink.AuditLog.ServiceCollectionExtensions.AddAuditLog"/>
/// installs on every node, so <see cref="FallbackAuditWriter"/> always has a
/// non-null collaborator to report into instead of NRE-ing. Bundle G swaps it
/// for a real counter that bridges into the Site Health Monitoring report
/// payload as <c>SiteAuditWriteFailures</c>.
/// </summary>
/// <remarks>
/// Audit-write failures must NEVER abort the user-facing action (alog.md §7),
/// which makes the counter best-effort by contract; a silent sink is therefore
/// the correct safe fallback wherever the health metric is not wired.
/// </remarks>
public sealed class NoOpAuditWriteFailureCounter : IAuditWriteFailureCounter
{
    /// <inheritdoc/>
    public void Increment()
    {
        // Deliberately a no-op: the Bundle G binding replaces this class with
        // the real counter wherever Site Health Monitoring is wired.
    }
}
|
||||
@@ -0,0 +1,115 @@
|
||||
using System.Runtime.CompilerServices;
|
||||
using System.Threading.Channels;
|
||||
using ScadaLink.Commons.Entities.Audit;
|
||||
|
||||
namespace ScadaLink.AuditLog.Site;
|
||||
|
||||
/// <summary>
/// Drop-oldest in-memory ring buffer used by <see cref="FallbackAuditWriter"/>
/// when the primary SQLite writer is throwing. Capacity is fixed at construction
/// (default 1024). When full, the oldest event is silently dropped to make room
/// for the newest — preserving the most recent picture of activity in the face
/// of an extended SQLite outage — and <see cref="RingBufferOverflowed"/> is
/// raised so a health counter can record the loss.
/// </summary>
/// <remarks>
/// <para>
/// Backed by a <see cref="Channel{T}"/> with
/// <see cref="BoundedChannelFullMode.DropOldest"/>. Overflow is reported through
/// the channel's own item-dropped callback, so the event fires once per evicted
/// item even when several writers and a draining reader race. (Comparing
/// <c>Reader.Count</c> before and after the write cannot guarantee that: the
/// count can change between the read and the write.)
/// </para>
/// <para>
/// Per the M2 plan: the ring is the absolute-last-resort buffer for the
/// hot-path; it is NOT a substitute for the bounded
/// <see cref="SqliteAuditWriter"/> write queue.
/// </para>
/// </remarks>
public sealed class RingBufferFallback
{
    private readonly Channel<AuditEvent> _channel;

    /// <summary>
    /// Raised once each time a drop-oldest overflow evicts an event. Handlers
    /// run synchronously on the thread that called <see cref="TryEnqueue"/>.
    /// Hooked by <see cref="FallbackAuditWriter"/>'s health counter wiring.
    /// </summary>
    public event Action? RingBufferOverflowed;

    /// <summary>Creates a ring holding at most <paramref name="capacity"/> events.</summary>
    /// <param name="capacity">Maximum number of buffered events; must be positive.</param>
    /// <exception cref="ArgumentOutOfRangeException"><paramref name="capacity"/> is zero or negative.</exception>
    public RingBufferFallback(int capacity = 1024)
    {
        if (capacity <= 0)
        {
            throw new ArgumentOutOfRangeException(nameof(capacity), "capacity must be > 0.");
        }

        _channel = Channel.CreateBounded<AuditEvent>(
            new BoundedChannelOptions(capacity)
            {
                FullMode = BoundedChannelFullMode.DropOldest,
                SingleReader = true,
                SingleWriter = false,
            },
            // The channel invokes this for every item it evicts. The event is
            // read at call time, so handlers subscribed after construction
            // are still notified.
            itemDropped: _ => RingBufferOverflowed?.Invoke());
    }

    /// <summary>Current event count in the ring (for diagnostics/tests).</summary>
    public int Count => _channel.Reader.Count;

    /// <summary>
    /// Try to enqueue an event. Returns <see langword="true"/> on success (even
    /// when an overflow caused an older event to be dropped); returns
    /// <see langword="false"/> only when the ring has been
    /// <see cref="Complete"/>-d.
    /// </summary>
    /// <param name="evt">Event to buffer; must not be <see langword="null"/>.</param>
    /// <exception cref="ArgumentNullException"><paramref name="evt"/> is <see langword="null"/>.</exception>
    public bool TryEnqueue(AuditEvent evt)
    {
        ArgumentNullException.ThrowIfNull(evt);

        // With DropOldest, TryWrite succeeds unless the channel is completed.
        // Any eviction it causes is surfaced via the item-dropped callback
        // registered in the constructor.
        return _channel.Writer.TryWrite(evt);
    }

    /// <summary>
    /// Drain the ring in FIFO order. Yields available events immediately and
    /// then completes when the channel is empty AND <see cref="Complete"/> has
    /// been called. Callers that only want to drain what's currently buffered
    /// must call <see cref="Complete"/> first.
    /// </summary>
    /// <param name="cancellationToken">Cancels the wait for further events.</param>
    public async IAsyncEnumerable<AuditEvent> DrainAsync(
        [EnumeratorCancellation] CancellationToken cancellationToken)
    {
        await foreach (var evt in _channel.Reader.ReadAllAsync(cancellationToken).ConfigureAwait(false))
        {
            yield return evt;
        }
    }

    /// <summary>
    /// Non-blocking single-item dequeue used by the
    /// <see cref="FallbackAuditWriter"/> recovery path. Returns
    /// <see langword="false"/> when the ring is empty.
    /// </summary>
    public bool TryDequeue(out AuditEvent evt) => _channel.Reader.TryRead(out evt!);

    /// <summary>
    /// Mark the ring as no-more-writes. <see cref="DrainAsync"/> will yield the
    /// remaining events and then complete.
    /// </summary>
    public void Complete() => _channel.Writer.TryComplete();
}
|
||||
@@ -0,0 +1,133 @@
|
||||
using Microsoft.Extensions.Hosting;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using ScadaLink.Commons.Interfaces.Services;
|
||||
using ScadaLink.HealthMonitoring;
|
||||
|
||||
namespace ScadaLink.AuditLog.Site;
|
||||
|
||||
/// <summary>
/// Audit Log (#23) M6 Bundle E (T6) — site-side hosted service that
/// periodically pulls a backlog snapshot from <see cref="ISiteAuditQueue"/>
/// and pushes it into <see cref="ISiteHealthCollector"/> so the next
/// <see cref="ISiteHealthCollector.CollectReport"/> emits a fresh
/// <c>SiteAuditBacklog</c> field on the site health report.
/// </summary>
/// <remarks>
/// <para>
/// <b>Why a hosted service, not the report sender.</b> Querying SQLite for the
/// backlog requires the queue's write lock; doing it inline in
/// <see cref="ISiteHealthCollector.CollectReport"/> would couple the collector
/// to <see cref="ISiteAuditQueue"/> and turn an in-memory snapshot read into
/// a synchronous I/O call on the report path. The hosted-service pattern keeps
/// the report path pure and the SQL probe off the report timing budget.
/// </para>
/// <para>
/// <b>Cadence.</b> 30 s by default — coarse enough to amortise the SQL probe
/// across many reports, fine enough that the central dashboard never lags by
/// more than one health-report interval. Tunable via
/// <see cref="ScadaLink.AuditLog.Site.SqliteAuditWriterOptions"/> in a follow-up
/// if ops needs a different cadence; for M6 we hard-code the value because the
/// brief calls it out explicitly.
/// </para>
/// <para>
/// <b>Failure containment.</b> The probe call is wrapped in a try/catch so a
/// transient SQLite error never tears down the hosted service — the next tick
/// retries. Mirrors <see cref="ScadaLink.AuditLog.Central.AuditLogPartitionMaintenanceService"/>'s
/// "exception logged, not propagated" contract.
/// </para>
/// <para>
/// <b>Shutdown.</b> Cancellation observed by the loop ends it normally rather
/// than as a cancelled task, so <see cref="StopAsync"/> does not throw merely
/// because a probe or delay was interrupted.
/// </para>
/// </remarks>
public sealed class SiteAuditBacklogReporter : IHostedService, IDisposable
{
    /// <summary>
    /// Default poll cadence. Half a typical 60 s health-report interval keeps
    /// the snapshot fresh without spinning the SQL probe more often than
    /// necessary.
    /// </summary>
    internal static readonly TimeSpan DefaultRefreshInterval = TimeSpan.FromSeconds(30);

    private readonly ISiteAuditQueue _queue;
    private readonly ISiteHealthCollector _collector;
    private readonly ILogger<SiteAuditBacklogReporter> _logger;
    private readonly TimeSpan _refreshInterval;
    private CancellationTokenSource? _cts;
    private Task? _loop;

    /// <summary>Creates the reporter; polling does not begin until <see cref="StartAsync"/>.</summary>
    /// <param name="queue">Source of the backlog snapshot.</param>
    /// <param name="collector">Destination that receives each snapshot.</param>
    /// <param name="logger">Receives a warning for every failed probe.</param>
    /// <param name="refreshInterval">
    /// Delay between probes; <see langword="null"/> selects <see cref="DefaultRefreshInterval"/>.
    /// </param>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="queue"/>, <paramref name="collector"/> or <paramref name="logger"/> is <see langword="null"/>.
    /// </exception>
    public SiteAuditBacklogReporter(
        ISiteAuditQueue queue,
        ISiteHealthCollector collector,
        ILogger<SiteAuditBacklogReporter> logger,
        TimeSpan? refreshInterval = null)
    {
        _queue = queue ?? throw new ArgumentNullException(nameof(queue));
        _collector = collector ?? throw new ArgumentNullException(nameof(collector));
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
        _refreshInterval = refreshInterval ?? DefaultRefreshInterval;
    }

    /// <inheritdoc />
    public Task StartAsync(CancellationToken ct)
    {
        // The loop is cancelled by StopAsync/Dispose, or by the token handed
        // to StartAsync, whichever fires first.
        _cts = CancellationTokenSource.CreateLinkedTokenSource(ct);

        // Capture the token now: the lambda runs later on a pool thread and
        // must not read the mutable _cts field, which Dispose may clear.
        var loopToken = _cts.Token;
        _loop = Task.Run(() => RunLoopAsync(loopToken));
        return Task.CompletedTask;
    }

    /// <summary>
    /// Probe immediately, then once per refresh interval until cancelled.
    /// Completes normally (never as a cancelled task) on shutdown.
    /// </summary>
    private async Task RunLoopAsync(CancellationToken ct)
    {
        try
        {
            // First tick runs immediately so the very first health report
            // after process start carries a real backlog snapshot — without
            // this the dashboard would show null for the first 30 s after a
            // deploy.
            await SafeProbeAsync(ct).ConfigureAwait(false);

            while (!ct.IsCancellationRequested)
            {
                await Task.Delay(_refreshInterval, ct).ConfigureAwait(false);
                await SafeProbeAsync(ct).ConfigureAwait(false);
            }
        }
        catch (OperationCanceledException) when (ct.IsCancellationRequested)
        {
            // Shutdown requested while delaying or probing: exit cleanly so
            // the task StopAsync waits on ends in the RanToCompletion state.
        }
    }

    /// <summary>
    /// Runs one probe and publishes the result. Only shutdown cancellation
    /// escapes; every other failure is logged and swallowed.
    /// </summary>
    private async Task SafeProbeAsync(CancellationToken ct)
    {
        try
        {
            var snapshot = await _queue.GetBacklogStatsAsync(ct).ConfigureAwait(false);
            _collector.UpdateSiteAuditBacklog(snapshot);
        }
        catch (OperationCanceledException) when (ct.IsCancellationRequested)
        {
            // Shutdown — let the outer loop exit cleanly.
            throw;
        }
        catch (Exception ex)
        {
            // Catch-all is deliberate: the hosted service must survive every
            // class of probe failure (transient SQLite lock contention, disk
            // I/O hiccup, a cancellation that is not ours, …) so the next
            // tick gets a chance.
            _logger.LogWarning(ex, "SiteAuditBacklogReporter probe failed; next tick will retry.");
        }
    }

    /// <summary>
    /// Signals the loop to stop and waits until it has finished or until
    /// <paramref name="ct"/> fires, whichever happens first.
    /// </summary>
    /// <param name="ct">Bounds how long the caller is willing to wait for the loop.</param>
    public async Task StopAsync(CancellationToken ct)
    {
        var loop = _loop;
        if (loop is null)
        {
            // Never started: nothing to wait for.
            _cts?.Cancel();
            return;
        }

        try
        {
            _cts?.Cancel();
        }
        finally
        {
            // Honour the stop token so a probe that ignores cancellation
            // cannot stall shutdown indefinitely.
            await Task.WhenAny(loop, Task.Delay(Timeout.Infinite, ct)).ConfigureAwait(false);
        }
    }

    /// <summary>
    /// Cancels the loop if it is still running and releases the token source.
    /// Safe to call more than once.
    /// </summary>
    public void Dispose()
    {
        // Swap the field out first so a repeated Dispose (or a later
        // StopAsync) never touches an already-disposed token source.
        var cts = Interlocked.Exchange(ref _cts, null);
        if (cts is null)
        {
            return;
        }

        cts.Cancel();
        cts.Dispose();
    }
}
|
||||
@@ -0,0 +1,754 @@
|
||||
using System.Threading.Channels;
|
||||
using Microsoft.Data.Sqlite;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using Microsoft.Extensions.Options;
|
||||
using ScadaLink.Commons.Entities.Audit;
|
||||
using ScadaLink.Commons.Interfaces.Services;
|
||||
using ScadaLink.Commons.Types;
|
||||
using ScadaLink.Commons.Types.Enums;
|
||||
|
||||
namespace ScadaLink.AuditLog.Site;
|
||||
|
||||
/// <summary>
|
||||
/// Site-side SQLite hot-path writer for Audit Log (#23) events. Mirrors the
|
||||
/// <see cref="ScadaLink.SiteEventLogging.SiteEventLogger"/> design — a single
|
||||
/// owned <see cref="SqliteConnection"/> serialised behind a write lock, fed by a
|
||||
/// bounded <see cref="Channel{T}"/> drained on a dedicated background writer
|
||||
/// task — so script-thread callers never block on disk I/O.
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// <para>
|
||||
/// The schema is bootstrapped in the constructor (Bundle B-T1). The
|
||||
/// Channel-based <see cref="WriteAsync"/> hot-path + Bundle D
|
||||
/// <see cref="ReadPendingAsync"/> / <see cref="MarkForwardedAsync"/> support
|
||||
/// surface are wired in Bundle B-T2.
|
||||
/// </para>
|
||||
/// <para>
|
||||
/// Site rows always carry <see cref="AuditForwardState.Pending"/> on first
|
||||
/// insert; the central row-shape's <c>IngestedAtUtc</c> column does NOT live in
|
||||
/// the site SQLite schema — central stamps it on ingest.
|
||||
/// </para>
|
||||
/// </remarks>
|
||||
public class SqliteAuditWriter : IAuditWriter, ISiteAuditQueue, IAsyncDisposable, IDisposable
|
||||
{
|
||||
// Microsoft.Data.Sqlite reports a generic SQLITE_CONSTRAINT (error code 19)
|
||||
// on a PRIMARY KEY violation; the extended subcode 1555 (SQLITE_CONSTRAINT_PRIMARYKEY)
|
||||
// is exposed via SqliteException.SqliteExtendedErrorCode but isn't reliably
|
||||
// surfaced across all SQLite builds. We treat any constraint error on insert
|
||||
// as a duplicate-eventid race and swallow it (first-write-wins) — the index
|
||||
// on EventId is the only constraint on this table, so this scope is precise.
|
||||
private const int SqliteErrorConstraint = 19;
|
||||
|
||||
private readonly SqliteConnection _connection;
|
||||
private readonly SqliteAuditWriterOptions _options;
|
||||
private readonly ILogger<SqliteAuditWriter> _logger;
|
||||
private readonly object _writeLock = new();
|
||||
private readonly Channel<PendingAuditEvent> _writeQueue;
|
||||
private readonly Task _writerLoop;
|
||||
private bool _disposed;
|
||||
|
||||
/// <summary>
/// Opens the SQLite connection, bootstraps the schema, creates the bounded
/// write queue and starts the background writer loop.
/// </summary>
/// <param name="options">Writer options; <c>DatabasePath</c>, <c>ChannelCapacity</c> are read here.</param>
/// <param name="logger">Logger stored for use by the writer.</param>
/// <param name="connectionStringOverride">
/// When non-null, used verbatim instead of the connection string derived from
/// <c>DatabasePath</c>.
/// </param>
/// <exception cref="ArgumentNullException">
/// <paramref name="options"/> or <paramref name="logger"/> is <see langword="null"/>.
/// </exception>
public SqliteAuditWriter(
    IOptions<SqliteAuditWriterOptions> options,
    ILogger<SqliteAuditWriter> logger,
    string? connectionStringOverride = null)
{
    ArgumentNullException.ThrowIfNull(options);
    ArgumentNullException.ThrowIfNull(logger);

    _options = options.Value;
    _logger = logger;

    var connectionString = connectionStringOverride
        ?? $"Data Source={_options.DatabasePath};Cache=Shared";
    _connection = new SqliteConnection(connectionString);

    try
    {
        _connection.Open();

        InitializeSchema();

        _writeQueue = Channel.CreateBounded<PendingAuditEvent>(
            new BoundedChannelOptions(_options.ChannelCapacity)
            {
                // The hot-path enqueue must back-pressure if the background
                // writer falls behind; a higher-level fallback (Bundle B-T4)
                // handles truly catastrophic primary failure with a drop-oldest
                // ring buffer.
                FullMode = BoundedChannelFullMode.Wait,
                SingleReader = true,
                SingleWriter = false,
            });
    }
    catch
    {
        // A constructor that throws never hands the instance to a caller, so
        // nobody could dispose it later — release the connection here instead
        // of leaking the handle when open/bootstrap/queue creation fails.
        _connection.Dispose();
        throw;
    }

    // Started last so the loop only ever sees a fully initialised writer.
    _writerLoop = Task.Run(ProcessWriteQueueAsync);
}
|
||||
|
||||
/// <summary>
/// Bootstraps the site audit database: sets the vacuum mode, creates the
/// <c>AuditLog</c> table and its forward-state index when absent, and
/// back-fills the <c>ExecutionId</c> column on databases that predate it.
/// </summary>
private void InitializeSchema()
{
    // auto_vacuum only takes effect on a fresh database if it is set before
    // the first table exists. INCREMENTAL lets a future
    // `PRAGMA incremental_vacuum` shrink the file after the 7-day retention
    // purge — see alog.md §10.
    RunStatement("PRAGMA auto_vacuum = INCREMENTAL");

    RunStatement(
        """
        CREATE TABLE IF NOT EXISTS AuditLog (
        EventId TEXT NOT NULL,
        OccurredAtUtc TEXT NOT NULL,
        Channel TEXT NOT NULL,
        Kind TEXT NOT NULL,
        CorrelationId TEXT NULL,
        SourceSiteId TEXT NULL,
        SourceInstanceId TEXT NULL,
        SourceScript TEXT NULL,
        Actor TEXT NULL,
        Target TEXT NULL,
        Status TEXT NOT NULL,
        HttpStatus INTEGER NULL,
        DurationMs INTEGER NULL,
        ErrorMessage TEXT NULL,
        ErrorDetail TEXT NULL,
        RequestSummary TEXT NULL,
        ResponseSummary TEXT NULL,
        PayloadTruncated INTEGER NOT NULL,
        Extra TEXT NULL,
        ForwardState TEXT NOT NULL,
        ExecutionId TEXT NULL,
        PRIMARY KEY (EventId)
        );
        CREATE INDEX IF NOT EXISTS IX_SiteAuditLog_ForwardState_Occurred
        ON AuditLog (ForwardState, OccurredAtUtc);
        """);

    // Audit Log #23 (ExecutionId): CREATE TABLE IF NOT EXISTS leaves an
    // existing AuditLog table untouched, so an auditlog.db produced by a
    // pre-ExecutionId build would still lack the column. The file is durable
    // across restart/failover by design (7-day retention); without this step
    // every insert on an upgraded deployment would reference a missing column
    // and the best-effort write path would silently drop every site audit
    // row. The column is nullable with no default, so rows written before
    // the migration read back ExecutionId = null (back-compat).
    AddColumnIfMissing("ExecutionId", "TEXT NULL");

    // Executes one parameterless SQL batch on the owned connection.
    void RunStatement(string sql)
    {
        using var command = _connection.CreateCommand();
        command.CommandText = sql;
        command.ExecuteNonQuery();
    }
}
|
||||
|
||||
/// <summary>
/// Audit Log #23 (ExecutionId): idempotent "add column" for the
/// <c>AuditLog</c> table. SQLite has no <c>ADD COLUMN IF NOT EXISTS</c>, so
/// <c>pragma_table_info</c> is queried first and the <c>ALTER TABLE</c> is
/// issued only when the column is absent — which makes the call safe on every
/// <see cref="InitializeSchema"/>. Mirrors
/// <c>StoreAndForwardStorage.AddColumnIfMissingAsync</c>; kept synchronous to
/// match the rest of this writer's bootstrap DDL.
/// </summary>
/// <param name="columnName">Name of the column to ensure exists.</param>
/// <param name="columnDefinition">Type/constraint clause appended after the name in the ALTER statement.</param>
private void AddColumnIfMissing(string columnName, string columnDefinition)
{
    if (ColumnAlreadyPresent())
    {
        return;
    }

    using var alter = _connection.CreateCommand();
    // DDL cannot take parameters. Both values are constants supplied by this
    // class, never user input, so interpolating them is safe.
    alter.CommandText = $"ALTER TABLE AuditLog ADD COLUMN {columnName} {columnDefinition}";
    alter.ExecuteNonQuery();

    // True when pragma_table_info lists a column with the requested name.
    bool ColumnAlreadyPresent()
    {
        using var probe = _connection.CreateCommand();
        probe.CommandText = "SELECT COUNT(*) FROM pragma_table_info('AuditLog') WHERE name = $name";
        probe.Parameters.AddWithValue("$name", columnName);
        var matches = Convert.ToInt32(probe.ExecuteScalar());
        return matches > 0;
    }
}
|
||||
|
||||
/// <summary>
/// Queues an event for durable persistence. The returned <see cref="Task"/>
/// completes once the event has been INSERTed (or, for a duplicate EventId,
/// recognised as already present); it faults only if the writer loop itself
/// collapses. Enqueuing never blocks on disk I/O — at most it awaits the
/// bounded channel's back-pressure while the writer is briefly behind.
/// </summary>
/// <param name="evt">Event to persist; must not be <see langword="null"/>.</param>
/// <param name="ct">Only consulted when the enqueue has to take the slow (waiting) path.</param>
/// <exception cref="ArgumentNullException"><paramref name="evt"/> is <see langword="null"/>.</exception>
public Task WriteAsync(AuditEvent evt, CancellationToken ct = default)
{
    ArgumentNullException.ThrowIfNull(evt);

    // Site rows always carry a ForwardState while central rows leave it null.
    // Defaulting to Pending here lets callers hand over a bare AuditEvent
    // without caring about site-vs-central provenance.
    var pending = new PendingAuditEvent(
        evt.ForwardState is not null
            ? evt
            : evt with { ForwardState = AuditForwardState.Pending });

    // Fast path: room in the queue, so the caller just observes completion.
    // TryWrite fails when the queue is full or already completed (writer
    // disposed); the slow path then awaits room per FullMode=Wait, or reports
    // the disposal.
    return _writeQueue.Writer.TryWrite(pending)
        ? pending.Completion.Task
        : WriteSlowPathAsync(pending, ct);
}
|
||||
|
||||
/// <summary>
/// Enqueue path taken when the non-blocking write was refused: waits for
/// queue capacity, then for the event's completion. A closed queue is turned
/// into an <see cref="ObjectDisposedException"/> on the event's completion
/// source, which the final await then surfaces to the caller.
/// </summary>
/// <param name="pending">Queue entry whose completion the caller observes.</param>
/// <param name="ct">Passed to the channel write that waits for capacity.</param>
private async Task WriteSlowPathAsync(PendingAuditEvent pending, CancellationToken ct)
{
    var completion = pending.Completion;

    try
    {
        await _writeQueue.Writer.WriteAsync(pending, ct).ConfigureAwait(false);
    }
    catch (ChannelClosedException)
    {
        var disposedError = new ObjectDisposedException(
            nameof(SqliteAuditWriter),
            "Event could not be recorded: the audit writer has been disposed.");
        completion.TrySetException(disposedError);
    }

    await completion.Task.ConfigureAwait(false);
}
|
||||
|
||||
/// <summary>
/// Background writer loop: takes events off the write queue and hands them to
/// <c>FlushBatch</c> in groups of at most <c>BatchSize</c>. Ends once the queue
/// has been completed and emptied.
/// </summary>
private async Task ProcessWriteQueueAsync()
{
    var reader = _writeQueue.Reader;
    var batch = new List<PendingAuditEvent>(_options.BatchSize);

    // WaitToReadAsync yields false once the channel is marked complete
    // (Dispose) and has no items left.
    while (await reader.WaitToReadAsync().ConfigureAwait(false))
    {
        while (reader.TryRead(out var head))
        {
            batch.Clear();
            batch.Add(head);

            // Top the batch up with whatever else is already queued. TryRead
            // never blocks, so a burst of concurrent enqueues shares a single
            // flush instead of paying the overhead once per event.
            while (batch.Count < _options.BatchSize && reader.TryRead(out var extra))
            {
                batch.Add(extra);
            }

            FlushBatch(batch);
        }
    }
}
|
||||
|
||||
/// <summary>
/// Persists a batch of pending audit events inside a single SQLite transaction
/// and completes each event's <c>Completion</c> source with the outcome.
/// </summary>
/// <remarks>
/// <para>
/// Success is signalled only AFTER the transaction has committed. Completing a
/// row as soon as its INSERT ran would tell the caller the event is recorded
/// even if a later row, or the commit itself, failed and rolled the whole batch
/// back.
/// </para>
/// <para>
/// <c>BeginTransaction</c> runs inside the guarded region and a failing
/// <c>Rollback</c> is logged rather than rethrown, so every failure path faults
/// the whole batch instead of escaping to the writer loop.
/// </para>
/// <para>
/// A constraint violation on INSERT is treated as a duplicate EventId
/// (first-write-wins) and counts as success for that event.
/// </para>
/// </remarks>
/// <param name="batch">Events to insert; each one is completed or faulted exactly once.</param>
private void FlushBatch(IReadOnlyList<PendingAuditEvent> batch)
{
    lock (_writeLock)
    {
        if (_disposed)
        {
            foreach (var pending in batch)
            {
                pending.Completion.TrySetException(
                    new ObjectDisposedException(nameof(SqliteAuditWriter),
                        "Event could not be recorded: the audit writer was disposed before the write completed."));
            }
            return;
        }

        try
        {
            using var transaction = _connection.BeginTransaction();
            try
            {
                using var cmd = _connection.CreateCommand();
                cmd.Transaction = transaction;
                cmd.CommandText = """
                    INSERT INTO AuditLog (
                        EventId, OccurredAtUtc, Channel, Kind, CorrelationId,
                        SourceSiteId, SourceInstanceId, SourceScript, Actor, Target,
                        Status, HttpStatus, DurationMs, ErrorMessage, ErrorDetail,
                        RequestSummary, ResponseSummary, PayloadTruncated, Extra, ForwardState,
                        ExecutionId
                    ) VALUES (
                        $EventId, $OccurredAtUtc, $Channel, $Kind, $CorrelationId,
                        $SourceSiteId, $SourceInstanceId, $SourceScript, $Actor, $Target,
                        $Status, $HttpStatus, $DurationMs, $ErrorMessage, $ErrorDetail,
                        $RequestSummary, $ResponseSummary, $PayloadTruncated, $Extra, $ForwardState,
                        $ExecutionId
                    );
                    """;

                // Parameters are created once and re-bound per row so the same
                // command object is reused for the whole batch.
                var pEventId = cmd.Parameters.Add("$EventId", SqliteType.Text);
                var pOccurredAt = cmd.Parameters.Add("$OccurredAtUtc", SqliteType.Text);
                var pChannel = cmd.Parameters.Add("$Channel", SqliteType.Text);
                var pKind = cmd.Parameters.Add("$Kind", SqliteType.Text);
                var pCorrelationId = cmd.Parameters.Add("$CorrelationId", SqliteType.Text);
                var pSourceSiteId = cmd.Parameters.Add("$SourceSiteId", SqliteType.Text);
                var pSourceInstanceId = cmd.Parameters.Add("$SourceInstanceId", SqliteType.Text);
                var pSourceScript = cmd.Parameters.Add("$SourceScript", SqliteType.Text);
                var pActor = cmd.Parameters.Add("$Actor", SqliteType.Text);
                var pTarget = cmd.Parameters.Add("$Target", SqliteType.Text);
                var pStatus = cmd.Parameters.Add("$Status", SqliteType.Text);
                var pHttpStatus = cmd.Parameters.Add("$HttpStatus", SqliteType.Integer);
                var pDurationMs = cmd.Parameters.Add("$DurationMs", SqliteType.Integer);
                var pErrorMessage = cmd.Parameters.Add("$ErrorMessage", SqliteType.Text);
                var pErrorDetail = cmd.Parameters.Add("$ErrorDetail", SqliteType.Text);
                var pRequestSummary = cmd.Parameters.Add("$RequestSummary", SqliteType.Text);
                var pResponseSummary = cmd.Parameters.Add("$ResponseSummary", SqliteType.Text);
                var pPayloadTruncated = cmd.Parameters.Add("$PayloadTruncated", SqliteType.Integer);
                var pExtra = cmd.Parameters.Add("$Extra", SqliteType.Text);
                var pForwardState = cmd.Parameters.Add("$ForwardState", SqliteType.Text);
                var pExecutionId = cmd.Parameters.Add("$ExecutionId", SqliteType.Text);

                foreach (var pending in batch)
                {
                    var e = pending.Event;
                    pEventId.Value = e.EventId.ToString();
                    pOccurredAt.Value = e.OccurredAtUtc.ToString(
                        "o", System.Globalization.CultureInfo.InvariantCulture);
                    pChannel.Value = e.Channel.ToString();
                    pKind.Value = e.Kind.ToString();
                    pCorrelationId.Value = (object?)e.CorrelationId?.ToString() ?? DBNull.Value;
                    pSourceSiteId.Value = (object?)e.SourceSiteId ?? DBNull.Value;
                    pSourceInstanceId.Value = (object?)e.SourceInstanceId ?? DBNull.Value;
                    pSourceScript.Value = (object?)e.SourceScript ?? DBNull.Value;
                    pActor.Value = (object?)e.Actor ?? DBNull.Value;
                    pTarget.Value = (object?)e.Target ?? DBNull.Value;
                    pStatus.Value = e.Status.ToString();
                    pHttpStatus.Value = (object?)e.HttpStatus ?? DBNull.Value;
                    pDurationMs.Value = (object?)e.DurationMs ?? DBNull.Value;
                    pErrorMessage.Value = (object?)e.ErrorMessage ?? DBNull.Value;
                    pErrorDetail.Value = (object?)e.ErrorDetail ?? DBNull.Value;
                    pRequestSummary.Value = (object?)e.RequestSummary ?? DBNull.Value;
                    pResponseSummary.Value = (object?)e.ResponseSummary ?? DBNull.Value;
                    pPayloadTruncated.Value = e.PayloadTruncated ? 1 : 0;
                    pExtra.Value = (object?)e.Extra ?? DBNull.Value;
                    pForwardState.Value = (e.ForwardState ?? AuditForwardState.Pending).ToString();
                    pExecutionId.Value = (object?)e.ExecutionId?.ToString() ?? DBNull.Value;

                    try
                    {
                        cmd.ExecuteNonQuery();
                    }
                    catch (SqliteException ex) when (ex.SqliteErrorCode == SqliteErrorConstraint)
                    {
                        // Duplicate EventId — first-write-wins (alog.md §11).
                        // Treated as success: the event is already recorded under
                        // the first writer's payload. Completion is still deferred
                        // until the commit below.
                        _logger.LogDebug(ex,
                            "Duplicate EventId {EventId} swallowed by SqliteAuditWriter",
                            e.EventId);
                    }
                }

                transaction.Commit();
            }
            catch
            {
                // Best-effort rollback: a failure here must not mask the original
                // error or prevent the batch from being faulted below.
                try
                {
                    transaction.Rollback();
                }
                catch (Exception rollbackEx)
                {
                    _logger.LogWarning(rollbackEx,
                        "SqliteAuditWriter rollback failed after a batch insert error.");
                }

                throw;
            }
        }
        catch (Exception ex)
        {
            _logger.LogError(ex, "SqliteAuditWriter batch insert failed; faulting {Count} pending events", batch.Count);
            foreach (var pending in batch)
            {
                pending.Completion.TrySetException(ex);
            }
            return;
        }

        // The transaction is committed: only now is it safe to report success.
        foreach (var pending in batch)
        {
            pending.Completion.TrySetResult();
        }
    }
}
|
||||
|
||||
/// <summary>
/// Reads at most <paramref name="limit"/> rows whose state is
/// <see cref="AuditForwardState.Pending"/>, ordered by
/// <see cref="AuditEvent.OccurredAtUtc"/> ascending and then by
/// <see cref="AuditEvent.EventId"/> ascending.
/// </summary>
/// <param name="limit">Maximum number of rows to return; must be positive.</param>
/// <param name="ct">Accepted for signature symmetry; the query runs synchronously and does not observe it.</param>
/// <returns>An already-completed task holding the matching rows.</returns>
/// <exception cref="ArgumentOutOfRangeException"><paramref name="limit"/> is zero or negative.</exception>
/// <exception cref="ObjectDisposedException">The writer has been disposed.</exception>
public Task<IReadOnlyList<AuditEvent>> ReadPendingAsync(int limit, CancellationToken ct = default)
{
    if (limit <= 0)
    {
        throw new ArgumentOutOfRangeException(nameof(limit), "limit must be > 0.");
    }

    IReadOnlyList<AuditEvent> result;

    // The connection is shared with the batch inserter, so all access to it is
    // serialised through the write lock.
    lock (_writeLock)
    {
        ObjectDisposedException.ThrowIf(_disposed, this);

        using var query = _connection.CreateCommand();
        query.CommandText = """
            SELECT EventId, OccurredAtUtc, Channel, Kind, CorrelationId,
                   SourceSiteId, SourceInstanceId, SourceScript, Actor, Target,
                   Status, HttpStatus, DurationMs, ErrorMessage, ErrorDetail,
                   RequestSummary, ResponseSummary, PayloadTruncated, Extra, ForwardState,
                   ExecutionId
            FROM AuditLog
            WHERE ForwardState = $pending
            ORDER BY OccurredAtUtc ASC, EventId ASC
            LIMIT $limit;
            """;
        query.Parameters.AddWithValue("$pending", AuditForwardState.Pending.ToString());
        query.Parameters.AddWithValue("$limit", limit);

        // Cap the initial capacity so a huge limit does not pre-allocate a huge list.
        var events = new List<AuditEvent>(Math.Min(limit, 256));
        using (var cursor = query.ExecuteReader())
        {
            while (cursor.Read())
            {
                events.Add(MapRow(cursor));
            }
        }

        result = events;
    }

    return Task.FromResult(result);
}
|
||||
|
||||
/// <summary>
/// Reads at most <paramref name="limit"/> rows whose state is
/// <see cref="AuditForwardState.Forwarded"/>, ordered by
/// <see cref="AuditEvent.OccurredAtUtc"/> ascending and then by
/// <see cref="AuditEvent.EventId"/> ascending. This is the
/// <see cref="AuditForwardState.Forwarded"/> counterpart of
/// <see cref="ReadPendingAsync"/>; unlike <see cref="ReadPendingSinceAsync"/> it
/// never returns <see cref="AuditForwardState.Pending"/> rows.
/// </summary>
/// <param name="limit">Maximum number of rows to return; must be positive.</param>
/// <param name="ct">Accepted for signature symmetry; the query runs synchronously and does not observe it.</param>
/// <returns>An already-completed task holding the matching rows.</returns>
/// <exception cref="ArgumentOutOfRangeException"><paramref name="limit"/> is zero or negative.</exception>
/// <exception cref="ObjectDisposedException">The writer has been disposed.</exception>
public Task<IReadOnlyList<AuditEvent>> ReadForwardedAsync(int limit, CancellationToken ct = default)
{
    if (limit <= 0)
    {
        throw new ArgumentOutOfRangeException(nameof(limit), "limit must be > 0.");
    }

    IReadOnlyList<AuditEvent> result;

    // Same discipline as ReadPendingAsync: the write lock guards the single connection.
    lock (_writeLock)
    {
        ObjectDisposedException.ThrowIf(_disposed, this);

        using var query = _connection.CreateCommand();
        query.CommandText = """
            SELECT EventId, OccurredAtUtc, Channel, Kind, CorrelationId,
                   SourceSiteId, SourceInstanceId, SourceScript, Actor, Target,
                   Status, HttpStatus, DurationMs, ErrorMessage, ErrorDetail,
                   RequestSummary, ResponseSummary, PayloadTruncated, Extra, ForwardState,
                   ExecutionId
            FROM AuditLog
            WHERE ForwardState = $forwarded
            ORDER BY OccurredAtUtc ASC, EventId ASC
            LIMIT $limit;
            """;
        query.Parameters.AddWithValue("$forwarded", AuditForwardState.Forwarded.ToString());
        query.Parameters.AddWithValue("$limit", limit);

        var events = new List<AuditEvent>(Math.Min(limit, 256));
        using (var cursor = query.ExecuteReader())
        {
            while (cursor.Read())
            {
                events.Add(MapRow(cursor));
            }
        }

        result = events;
    }

    return Task.FromResult(result);
}
|
||||
|
||||
/// <summary>
/// Flips the supplied EventIds from <see cref="AuditForwardState.Pending"/> to
/// <see cref="AuditForwardState.Forwarded"/> in a single UPDATE.
/// </summary>
/// <remarks>
/// The UPDATE is restricted to rows currently in
/// <see cref="AuditForwardState.Pending"/>. Without that guard a late forward
/// acknowledgement would move a row that had already reached
/// <see cref="AuditForwardState.Reconciled"/> back to
/// <see cref="AuditForwardState.Forwarded"/>. Non-existent ids and rows in any
/// other state are no-ops. Each id is bound as its own parameter, so no caller
/// data is concatenated into the SQL text.
/// </remarks>
/// <param name="eventIds">Ids to mark; an empty list returns immediately without touching the database.</param>
/// <param name="ct">Accepted for signature symmetry; the update runs synchronously and does not observe it.</param>
/// <returns>An already-completed task.</returns>
/// <exception cref="ArgumentNullException"><paramref name="eventIds"/> is null.</exception>
/// <exception cref="ObjectDisposedException">The writer has been disposed.</exception>
public Task MarkForwardedAsync(IReadOnlyList<Guid> eventIds, CancellationToken ct = default)
{
    ArgumentNullException.ThrowIfNull(eventIds);
    if (eventIds.Count == 0)
    {
        return Task.CompletedTask;
    }

    lock (_writeLock)
    {
        ObjectDisposedException.ThrowIf(_disposed, this);

        using var cmd = _connection.CreateCommand();

        // One UPDATE per call regardless of batch size: the ids become an
        // IN (...) list of individually bound parameters.
        var sb = new System.Text.StringBuilder();
        sb.Append("UPDATE AuditLog SET ForwardState = $forwarded ")
          .Append("WHERE ForwardState = $pending AND EventId IN (");
        for (int i = 0; i < eventIds.Count; i++)
        {
            if (i > 0)
            {
                sb.Append(',');
            }

            var p = $"$id{i}";
            sb.Append(p);
            cmd.Parameters.AddWithValue(p, eventIds[i].ToString());
        }
        sb.Append(");");

        cmd.CommandText = sb.ToString();
        cmd.Parameters.AddWithValue("$forwarded", AuditForwardState.Forwarded.ToString());
        cmd.Parameters.AddWithValue("$pending", AuditForwardState.Pending.ToString());

        cmd.ExecuteNonQuery();
        return Task.CompletedTask;
    }
}
|
||||
|
||||
/// <summary>
/// Reconciliation-pull read: returns at most <paramref name="batchSize"/> rows
/// with <c>OccurredAtUtc &gt;= sinceUtc</c> whose state is
/// <see cref="AuditForwardState.Pending"/> or
/// <see cref="AuditForwardState.Forwarded"/>, ordered by
/// <see cref="AuditEvent.OccurredAtUtc"/> ascending and then by
/// <see cref="AuditEvent.EventId"/> ascending.
/// </summary>
/// <param name="sinceUtc">Inclusive lower bound; converted to UTC before comparison.</param>
/// <param name="batchSize">Maximum number of rows to return; must be positive.</param>
/// <param name="ct">Accepted for signature symmetry; the query runs synchronously and does not observe it.</param>
/// <returns>An already-completed task holding the matching rows.</returns>
/// <exception cref="ArgumentOutOfRangeException"><paramref name="batchSize"/> is zero or negative.</exception>
/// <exception cref="ObjectDisposedException">The writer has been disposed.</exception>
public Task<IReadOnlyList<AuditEvent>> ReadPendingSinceAsync(
    DateTime sinceUtc, int batchSize, CancellationToken ct = default)
{
    if (batchSize <= 0)
    {
        throw new ArgumentOutOfRangeException(nameof(batchSize), "batchSize must be > 0.");
    }

    IReadOnlyList<AuditEvent> result;

    // The write lock guards the single shared connection.
    lock (_writeLock)
    {
        ObjectDisposedException.ThrowIf(_disposed, this);

        using var query = _connection.CreateCommand();
        query.CommandText = """
            SELECT EventId, OccurredAtUtc, Channel, Kind, CorrelationId,
                   SourceSiteId, SourceInstanceId, SourceScript, Actor, Target,
                   Status, HttpStatus, DurationMs, ErrorMessage, ErrorDetail,
                   RequestSummary, ResponseSummary, PayloadTruncated, Extra, ForwardState,
                   ExecutionId
            FROM AuditLog
            WHERE ForwardState IN ($pending, $forwarded)
              AND OccurredAtUtc >= $since
            ORDER BY OccurredAtUtc ASC, EventId ASC
            LIMIT $limit;
            """;

        // The lower bound is rendered in the round-trip ("o") format that the
        // insert path uses for OccurredAtUtc, so the comparison is text-to-text.
        var sinceText = EnsureUtc(sinceUtc).ToString(
            "o", System.Globalization.CultureInfo.InvariantCulture);

        query.Parameters.AddWithValue("$pending", AuditForwardState.Pending.ToString());
        query.Parameters.AddWithValue("$forwarded", AuditForwardState.Forwarded.ToString());
        query.Parameters.AddWithValue("$since", sinceText);
        query.Parameters.AddWithValue("$limit", batchSize);

        var events = new List<AuditEvent>(Math.Min(batchSize, 256));
        using (var cursor = query.ExecuteReader())
        {
            while (cursor.Read())
            {
                events.Add(MapRow(cursor));
            }
        }

        result = events;
    }

    return Task.FromResult(result);
}
|
||||
|
||||
/// <summary>
/// Reconciliation-pull commit: moves the supplied EventIds to
/// <see cref="AuditForwardState.Reconciled"/>, but only for rows currently in
/// <see cref="AuditForwardState.Pending"/> or
/// <see cref="AuditForwardState.Forwarded"/>. Rows in any other state and
/// non-existent ids are not touched, so repeating the call is harmless.
/// </summary>
/// <param name="eventIds">Ids to mark; an empty list returns immediately without touching the database.</param>
/// <param name="ct">Accepted for signature symmetry; the update runs synchronously and does not observe it.</param>
/// <returns>An already-completed task.</returns>
/// <exception cref="ArgumentNullException"><paramref name="eventIds"/> is null.</exception>
/// <exception cref="ObjectDisposedException">The writer has been disposed.</exception>
public Task MarkReconciledAsync(IReadOnlyList<Guid> eventIds, CancellationToken ct = default)
{
    ArgumentNullException.ThrowIfNull(eventIds);

    var idCount = eventIds.Count;
    if (idCount == 0)
    {
        return Task.CompletedTask;
    }

    lock (_writeLock)
    {
        ObjectDisposedException.ThrowIf(_disposed, this);

        using var update = _connection.CreateCommand();

        // Every id is bound as its own parameter; only generated placeholder
        // names are appended to the SQL text.
        var sql = new System.Text.StringBuilder();
        sql.Append("UPDATE AuditLog SET ForwardState = $reconciled ");
        sql.Append("WHERE ForwardState IN ($pending, $forwarded) AND EventId IN (");

        for (var index = 0; index < idCount; index++)
        {
            var placeholder = $"$id{index}";
            if (index != 0)
            {
                sql.Append(',');
            }

            sql.Append(placeholder);
            update.Parameters.AddWithValue(placeholder, eventIds[index].ToString());
        }

        sql.Append(");");
        update.CommandText = sql.ToString();

        update.Parameters.AddWithValue("$reconciled", AuditForwardState.Reconciled.ToString());
        update.Parameters.AddWithValue("$pending", AuditForwardState.Pending.ToString());
        update.Parameters.AddWithValue("$forwarded", AuditForwardState.Forwarded.ToString());

        update.ExecuteNonQuery();
    }

    return Task.CompletedTask;
}
|
||||
|
||||
/// <summary>
/// Health-metric snapshot of the site queue: the number of rows in
/// <see cref="AuditForwardState.Pending"/>, the smallest
/// <see cref="AuditEvent.OccurredAtUtc"/> among them (null when none are
/// pending), and the size in bytes of the database file.
/// </summary>
/// <remarks>
/// The count and oldest-row values come from one query executed under the write
/// lock. The file-size probe runs after the lock is released and is best-effort:
/// an empty path, an in-memory path (<c>:memory:</c> prefix or
/// <c>mode=memory</c>), a missing file, or any exception from the probe yields
/// 0 bytes.
/// </remarks>
/// <param name="ct">Accepted for signature symmetry; the work runs synchronously and does not observe it.</param>
/// <returns>An already-completed task holding the snapshot.</returns>
/// <exception cref="ObjectDisposedException">The writer has been disposed.</exception>
public Task<SiteAuditBacklogSnapshot> GetBacklogStatsAsync(CancellationToken ct = default)
{
    int pendingCount;
    DateTime? oldestPending;

    lock (_writeLock)
    {
        ObjectDisposedException.ThrowIf(_disposed, this);

        // Both aggregates are fetched in a single statement.
        using var query = _connection.CreateCommand();
        query.CommandText = """
            SELECT COUNT(*), MIN(OccurredAtUtc)
            FROM AuditLog
            WHERE ForwardState = $pending;
            """;
        query.Parameters.AddWithValue("$pending", AuditForwardState.Pending.ToString());

        using var cursor = query.ExecuteReader();
        cursor.Read();
        pendingCount = cursor.GetInt32(0);

        // MIN() over zero rows is NULL, which maps to "no pending rows".
        if (cursor.IsDBNull(1))
        {
            oldestPending = null;
        }
        else
        {
            oldestPending = DateTime.Parse(
                cursor.GetString(1),
                System.Globalization.CultureInfo.InvariantCulture,
                System.Globalization.DateTimeStyles.RoundtripKind);
        }
    }

    // File-size lookup happens outside the lock: it only inspects the path from
    // the options and never touches the connection.
    long onDiskBytes = 0;
    try
    {
        var looksLikeFilePath =
            !string.IsNullOrEmpty(_options.DatabasePath) &&
            !_options.DatabasePath.StartsWith(":memory:", StringComparison.Ordinal) &&
            !_options.DatabasePath.Contains("mode=memory", StringComparison.OrdinalIgnoreCase);

        if (looksLikeFilePath && File.Exists(_options.DatabasePath))
        {
            onDiskBytes = new FileInfo(_options.DatabasePath).Length;
        }
    }
    catch (Exception ex)
    {
        // A failed probe must never abort the snapshot; log it and report 0.
        _logger.LogDebug(ex,
            "SqliteAuditWriter could not stat DB path {Path} for backlog snapshot.",
            _options.DatabasePath);
    }

    var snapshot = new SiteAuditBacklogSnapshot(
        PendingCount: pendingCount,
        OldestPendingUtc: oldestPending,
        OnDiskBytes: onDiskBytes);
    return Task.FromResult(snapshot);
}
|
||||
|
||||
/// <summary>
/// Returns <paramref name="value"/> as a UTC-kind <see cref="DateTime"/>. A value
/// already marked <see cref="DateTimeKind.Utc"/> is returned unchanged; any other
/// kind is converted with <see cref="DateTime.ToUniversalTime"/> and stamped as UTC.
/// </summary>
private static DateTime EnsureUtc(DateTime value)
{
    if (value.Kind == DateTimeKind.Utc)
    {
        return value;
    }

    var converted = value.ToUniversalTime();
    return DateTime.SpecifyKind(converted, DateTimeKind.Utc);
}
|
||||
|
||||
/// <summary>
/// Materialises the reader's current row as an <see cref="AuditEvent"/>. Columns
/// are read by ordinal, so the SELECT feeding this method must list them in the
/// order used here: EventId (0) through ExecutionId (20).
/// </summary>
/// <param name="reader">Reader positioned on the row to convert.</param>
private static AuditEvent MapRow(SqliteDataReader reader)
{
    // Null-aware accessors for the nullable columns.
    string? TextOrNull(int ordinal) => reader.IsDBNull(ordinal) ? null : reader.GetString(ordinal);
    int? IntOrNull(int ordinal) => reader.IsDBNull(ordinal) ? null : reader.GetInt32(ordinal);
    Guid? GuidOrNull(int ordinal) => reader.IsDBNull(ordinal) ? null : Guid.Parse(reader.GetString(ordinal));

    return new AuditEvent
    {
        EventId = Guid.Parse(reader.GetString(0)),
        // RoundtripKind keeps the kind/offset encoded in the stored text.
        OccurredAtUtc = DateTime.Parse(
            reader.GetString(1),
            System.Globalization.CultureInfo.InvariantCulture,
            System.Globalization.DateTimeStyles.RoundtripKind),
        Channel = Enum.Parse<AuditChannel>(reader.GetString(2)),
        Kind = Enum.Parse<AuditKind>(reader.GetString(3)),
        CorrelationId = GuidOrNull(4),
        SourceSiteId = TextOrNull(5),
        SourceInstanceId = TextOrNull(6),
        SourceScript = TextOrNull(7),
        Actor = TextOrNull(8),
        Target = TextOrNull(9),
        Status = Enum.Parse<AuditStatus>(reader.GetString(10)),
        HttpStatus = IntOrNull(11),
        DurationMs = IntOrNull(12),
        ErrorMessage = TextOrNull(13),
        ErrorDetail = TextOrNull(14),
        RequestSummary = TextOrNull(15),
        ResponseSummary = TextOrNull(16),
        // Stored as an integer flag; any non-zero value means true.
        PayloadTruncated = reader.GetInt32(17) != 0,
        Extra = TextOrNull(18),
        ForwardState = Enum.Parse<AuditForwardState>(reader.GetString(19)),
        ExecutionId = GuidOrNull(20),
    };
}
|
||||
|
||||
/// <summary>
/// Synchronous disposal: runs <see cref="DisposeAsync"/> and blocks the calling
/// thread until it has finished.
/// </summary>
public void Dispose() =>
    DisposeAsync().AsTask().GetAwaiter().GetResult();
|
||||
|
||||
/// <summary>
/// Shuts the writer down in three steps: (1) under the lock, complete the write
/// queue so no new events are accepted; (2) outside the lock, give the writer
/// loop up to five seconds to drain; (3) under the lock again, mark the writer
/// disposed and close the connection. Calling it on an already-disposed writer
/// is a no-op.
/// </summary>
/// <remarks>
/// <c>_disposed</c> is set only in step (3), so batches flushed while the loop
/// is draining still reach the database. Any batch flushed after step (3) sees
/// the flag and is faulted instead of touching the closed connection.
/// </remarks>
public async ValueTask DisposeAsync()
{
    Task? drainTask;
    lock (_writeLock)
    {
        if (_disposed)
        {
            return;
        }

        // Stop accepting new events; already-queued events remain readable.
        _writeQueue.Writer.TryComplete();
        drainTask = _writerLoop;
    }

    // Wait outside the lock — the loop takes it for each batch.
    if (drainTask is not null)
    {
        try
        {
            await drainTask.WaitAsync(TimeSpan.FromSeconds(5)).ConfigureAwait(false);
        }
        catch (TimeoutException)
        {
            _logger.LogWarning("SqliteAuditWriter writer loop did not drain within 5s of dispose.");
        }
        catch (Exception ex)
        {
            // A fault surfacing from the loop task itself is unexpected; log it
            // and continue so the connection is still closed.
            _logger.LogError(ex, "SqliteAuditWriter writer loop faulted during dispose.");
        }
    }

    lock (_writeLock)
    {
        // Re-check: a concurrent DisposeAsync may have finished while we waited.
        if (!_disposed)
        {
            _disposed = true;
            _connection.Dispose();
        }
    }
}
|
||||
|
||||
/// <summary>
/// Pairs a queued <see cref="AuditEvent"/> with the completion source that is
/// resolved once the background writer has dealt with the event.
/// </summary>
private sealed class PendingAuditEvent
{
    public PendingAuditEvent(AuditEvent evt) => Event = evt;

    /// <summary>The event to persist.</summary>
    public AuditEvent Event { get; }

    /// <summary>
    /// Completed or faulted by the writer. Continuations run asynchronously, so
    /// completing it never runs awaiting code inline on the completing thread.
    /// </summary>
    public TaskCompletionSource Completion { get; } =
        new(TaskCreationOptions.RunContinuationsAsynchronously);
}
|
||||
}
|
||||
@@ -0,0 +1,27 @@
|
||||
namespace ScadaLink.AuditLog.Site;
|
||||
|
||||
/// <summary>
/// Configuration for the site-side SQLite audit writer, which drains a bounded
/// <see cref="System.Threading.Channels.Channel{T}"/> into a single SQLite
/// connection from a background task.
/// </summary>
public sealed class SqliteAuditWriterOptions
{
    /// <summary>
    /// Path of the SQLite database file, or an in-memory URI (used by tests).
    /// Defaults to <c>auditlog.db</c>.
    /// </summary>
    public string DatabasePath { get; set; } = "auditlog.db";

    /// <summary>
    /// Number of events the bounded write queue can hold (default 4096). It should
    /// be large enough that ordinary bursts never fill it; when it is full,
    /// <see cref="System.Threading.Channels.BoundedChannelFullMode.Wait"/> applies.
    /// </summary>
    public int ChannelCapacity { get; set; } = 4096;

    /// <summary>
    /// Upper bound on the number of pending events written in one transaction
    /// (default 256).
    /// </summary>
    public int BatchSize { get; set; } = 256;

    /// <summary>
    /// Soft flush interval in milliseconds (default 50), intended for the case
    /// where fewer than <see cref="BatchSize"/> events are queued.
    /// NOTE(review): confirm where the writer actually consumes this value.
    /// </summary>
    public int FlushIntervalMs { get; set; } = 50;
}
|
||||
@@ -0,0 +1,210 @@
|
||||
using Microsoft.Extensions.Logging;
|
||||
using ScadaLink.Commons.Entities.Audit;
|
||||
using ScadaLink.Commons.Interfaces.Services;
|
||||
using ScadaLink.Commons.Messages.Integration;
|
||||
using ScadaLink.Commons.Types;
|
||||
using ScadaLink.Commons.Types.Enums;
|
||||
|
||||
namespace ScadaLink.AuditLog.Site.Telemetry;
|
||||
|
||||
/// <summary>
|
||||
/// Audit Log #23 (M3 Bundle E — Tasks E4/E5): translates per-attempt
|
||||
/// notifications from the store-and-forward retry loop into one (or two)
|
||||
/// <see cref="CachedCallTelemetry"/> packets and pushes them through
|
||||
/// <see cref="ICachedCallTelemetryForwarder"/>.
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// <para>
|
||||
/// The S&F loop's <see cref="ICachedCallLifecycleObserver"/> reports a
|
||||
/// single coarse outcome per attempt; the audit pipeline however models the
|
||||
/// lifecycle as TWO rows on terminal outcomes — an <c>Attempted</c>
|
||||
/// (<see cref="AuditKind.ApiCallCached"/> / <see cref="AuditKind.DbWriteCached"/>)
|
||||
/// row capturing the per-attempt mechanics, plus a <see cref="AuditKind.CachedResolve"/>
|
||||
/// row marking the terminal state for downstream consumers. The bridge fans
|
||||
/// out per outcome:
|
||||
/// </para>
|
||||
/// <list type="bullet">
|
||||
/// <item><description><c>TransientFailure</c> -> one Attempted(Failed) row.</description></item>
|
||||
/// <item><description><c>Delivered</c> -> Attempted(Delivered) + CachedResolve(Delivered).</description></item>
|
||||
/// <item><description><c>PermanentFailure</c> -> Attempted(Failed) + CachedResolve(Parked).</description></item>
|
||||
/// <item><description><c>ParkedMaxRetries</c> -> Attempted(Failed) + CachedResolve(Parked).</description></item>
|
||||
/// </list>
|
||||
/// <para>
|
||||
/// <b>Best-effort emission (alog.md §7):</b> the bridge itself never throws;
|
||||
/// the underlying forwarder swallows + logs its own failures.
|
||||
/// </para>
|
||||
/// </remarks>
|
||||
public sealed class CachedCallLifecycleBridge : ICachedCallLifecycleObserver
|
||||
{
|
||||
private readonly ICachedCallTelemetryForwarder _forwarder;
|
||||
private readonly ILogger<CachedCallLifecycleBridge> _logger;
|
||||
|
||||
public CachedCallLifecycleBridge(
|
||||
ICachedCallTelemetryForwarder forwarder,
|
||||
ILogger<CachedCallLifecycleBridge> logger)
|
||||
{
|
||||
_forwarder = forwarder ?? throw new ArgumentNullException(nameof(forwarder));
|
||||
_logger = logger ?? throw new ArgumentNullException(nameof(logger));
|
||||
}
|
||||
|
||||
/// <inheritdoc/>
|
||||
public async Task OnAttemptCompletedAsync(
|
||||
CachedCallAttemptContext context, CancellationToken ct = default)
|
||||
{
|
||||
ArgumentNullException.ThrowIfNull(context);
|
||||
|
||||
try
|
||||
{
|
||||
await EmitAttemptedAsync(context, ct).ConfigureAwait(false);
|
||||
|
||||
if (IsTerminal(context.Outcome))
|
||||
{
|
||||
await EmitResolveAsync(context, ct).ConfigureAwait(false);
|
||||
}
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
// Defensive — both EmitX paths call the forwarder which is itself
|
||||
// best-effort. A throw here is unexpected, but the alog.md §7
|
||||
// contract requires we never propagate.
|
||||
_logger.LogWarning(ex,
|
||||
"CachedCallLifecycleBridge: unexpected throw for {TrackedOperationId} (Outcome {Outcome})",
|
||||
context.TrackedOperationId, context.Outcome);
|
||||
}
|
||||
}
|
||||
|
||||
private async Task EmitAttemptedAsync(CachedCallAttemptContext context, CancellationToken ct)
|
||||
{
|
||||
// Per-attempt row: kind discriminates channel; status is always
|
||||
// Attempted regardless of outcome (success vs. failure is captured
|
||||
// by the companion HttpStatus / ErrorMessage fields, NOT by flipping
|
||||
// the status — CachedResolve carries the terminal Status). Per the
|
||||
// M3 brief and alog.md §4.
|
||||
var kind = ChannelToAttemptKind(context.Channel);
|
||||
var status = AuditStatus.Attempted;
|
||||
|
||||
var packet = BuildPacket(
|
||||
context,
|
||||
kind: kind,
|
||||
status: status,
|
||||
// Operational status mirror — for the per-attempt row the
|
||||
// operational state is the running status; the bridge always
|
||||
// writes "Attempted" so reconciliation can't roll back.
|
||||
operationalStatus: "Attempted",
|
||||
terminalAtUtc: null,
|
||||
lastError: context.LastError,
|
||||
httpStatus: context.HttpStatus);
|
||||
|
||||
await _forwarder.ForwardAsync(packet, ct).ConfigureAwait(false);
|
||||
}
|
||||
|
||||
private async Task EmitResolveAsync(CachedCallAttemptContext context, CancellationToken ct)
|
||||
{
|
||||
var (auditStatus, operationalStatus) = TerminalOutcomeToStatuses(context.Outcome);
|
||||
|
||||
var packet = BuildPacket(
|
||||
context,
|
||||
kind: AuditKind.CachedResolve,
|
||||
status: auditStatus,
|
||||
operationalStatus: operationalStatus,
|
||||
terminalAtUtc: context.OccurredAtUtc,
|
||||
lastError: context.LastError,
|
||||
httpStatus: context.HttpStatus);
|
||||
|
||||
await _forwarder.ForwardAsync(packet, ct).ConfigureAwait(false);
|
||||
}
|
||||
|
||||
private static CachedCallTelemetry BuildPacket(
|
||||
CachedCallAttemptContext context,
|
||||
AuditKind kind,
|
||||
AuditStatus status,
|
||||
string operationalStatus,
|
||||
DateTime? terminalAtUtc,
|
||||
string? lastError,
|
||||
int? httpStatus)
|
||||
{
|
||||
var channel = ChannelStringToEnum(context.Channel);
|
||||
|
||||
return new CachedCallTelemetry(
|
||||
Audit: new AuditEvent
|
||||
{
|
||||
EventId = Guid.NewGuid(),
|
||||
OccurredAtUtc = DateTime.SpecifyKind(context.OccurredAtUtc, DateTimeKind.Utc),
|
||||
Channel = channel,
|
||||
Kind = kind,
|
||||
CorrelationId = context.TrackedOperationId.Value,
|
||||
// Audit Log #23 (ExecutionId Task 4): the originating script
|
||||
// execution's per-run correlation id, threaded through the S&F
|
||||
// buffer; null on rows buffered before Task 4 (back-compat).
|
||||
ExecutionId = context.ExecutionId,
|
||||
SourceSiteId = string.IsNullOrEmpty(context.SourceSite) ? null : context.SourceSite,
|
||||
SourceInstanceId = context.SourceInstanceId,
|
||||
// Audit Log #23 (ExecutionId Task 4): SourceScript is now
|
||||
// threaded through the S&F buffer alongside ExecutionId — the
|
||||
// retry-loop cached rows carry the same provenance the
|
||||
// script-side cached rows do. Null on pre-Task-4 buffered rows.
|
||||
SourceScript = context.SourceScript,
|
||||
Target = context.Target,
|
||||
Status = status,
|
||||
HttpStatus = httpStatus,
|
||||
DurationMs = context.DurationMs,
|
||||
ErrorMessage = lastError,
|
||||
ForwardState = AuditForwardState.Pending,
|
||||
},
|
||||
Operational: new SiteCallOperational(
|
||||
TrackedOperationId: context.TrackedOperationId,
|
||||
Channel: context.Channel,
|
||||
Target: context.Target,
|
||||
SourceSite: context.SourceSite,
|
||||
Status: operationalStatus,
|
||||
RetryCount: context.RetryCount,
|
||||
LastError: lastError,
|
||||
HttpStatus: httpStatus,
|
||||
CreatedAtUtc: DateTime.SpecifyKind(context.CreatedAtUtc, DateTimeKind.Utc),
|
||||
UpdatedAtUtc: DateTime.SpecifyKind(context.OccurredAtUtc, DateTimeKind.Utc),
|
||||
TerminalAtUtc: terminalAtUtc is null
|
||||
? null
|
||||
: DateTime.SpecifyKind(terminalAtUtc.Value, DateTimeKind.Utc)));
|
||||
}
|
||||
|
||||
private static AuditKind ChannelToAttemptKind(string channel) => channel switch
|
||||
{
|
||||
"ApiOutbound" => AuditKind.ApiCallCached,
|
||||
"DbOutbound" => AuditKind.DbWriteCached,
|
||||
// Defensive default — the S&F observer is filtered to cached-call
|
||||
// categories so this branch shouldn't fire in practice.
|
||||
_ => AuditKind.ApiCallCached,
|
||||
};
|
||||
|
||||
private static AuditChannel ChannelStringToEnum(string channel) => channel switch
|
||||
{
|
||||
"ApiOutbound" => AuditChannel.ApiOutbound,
|
||||
"DbOutbound" => AuditChannel.DbOutbound,
|
||||
_ => AuditChannel.ApiOutbound,
|
||||
};
|
||||
|
||||
/// <summary>
/// Translates a cached-call attempt outcome into the pair of statuses written
/// for it: the <see cref="AuditStatus"/> for the audit row and the status
/// string for the operational row.
/// </summary>
/// <param name="outcome">Outcome of the attempt.</param>
/// <returns>
/// <c>(Delivered, "Delivered")</c> for a delivered call,
/// <c>(Parked, "Parked")</c> for a permanent failure or exhausted retries, and
/// <c>(Failed, "Failed")</c> for everything else.
/// </returns>
private static (AuditStatus auditStatus, string operationalStatus) TerminalOutcomeToStatuses(
    CachedCallAttemptOutcome outcome)
{
    switch (outcome)
    {
        case CachedCallAttemptOutcome.Delivered:
            return (AuditStatus.Delivered, "Delivered");

        case CachedCallAttemptOutcome.PermanentFailure:
        case CachedCallAttemptOutcome.ParkedMaxRetries:
            return (AuditStatus.Parked, "Parked");

        // TransientFailure isn't terminal (see IsTerminal), but every outcome
        // must map to something, so it shares the Failed fallback for safety.
        case CachedCallAttemptOutcome.TransientFailure:
        default:
            return (AuditStatus.Failed, "Failed");
    }
}
|
||||
|
||||
/// <summary>
/// Tells whether an attempt outcome ends the cached call's lifecycle.
/// </summary>
/// <param name="outcome">Outcome of the attempt.</param>
/// <returns>
/// <c>true</c> for <c>Delivered</c>, <c>PermanentFailure</c> and
/// <c>ParkedMaxRetries</c>; <c>false</c> for <c>TransientFailure</c> and any
/// other value.
/// </returns>
private static bool IsTerminal(CachedCallAttemptOutcome outcome) =>
    outcome is CachedCallAttemptOutcome.Delivered
        or CachedCallAttemptOutcome.PermanentFailure
        or CachedCallAttemptOutcome.ParkedMaxRetries;
|
||||
}
|
||||
@@ -0,0 +1,181 @@
|
||||
using Microsoft.Extensions.Logging;
|
||||
using ScadaLink.Commons.Entities.Audit;
|
||||
using ScadaLink.Commons.Interfaces;
|
||||
using ScadaLink.Commons.Interfaces.Services;
|
||||
using ScadaLink.Commons.Messages.Integration;
|
||||
using ScadaLink.Commons.Types;
|
||||
using ScadaLink.Commons.Types.Enums;
|
||||
|
||||
namespace ScadaLink.AuditLog.Site.Telemetry;
|
||||
|
||||
/// <summary>
/// Site-side dual emitter for cached-call lifecycle telemetry (Audit Log #23 /
/// M3). Sister to <see cref="SiteAuditTelemetryActor"/>: where the M2 actor
/// drains audit-only events, this forwarder takes a combined
/// <see cref="CachedCallTelemetry"/> packet and fans it out to the two
/// site-local stores in a single call:
/// <list type="bullet">
///   <item><description>The <see cref="AuditEvent"/> row is written via
///   <see cref="IAuditWriter"/> (the site <c>FallbackAuditWriter</c> +
///   <c>SqliteAuditWriter</c> chain established in M2).</description></item>
///   <item><description>The operational <see cref="SiteCallOperational"/> half
///   updates the site-local <c>OperationTracking</c> SQLite store via
///   <see cref="IOperationTrackingStore"/>, with the per-lifecycle method
///   (<c>Enqueue</c> / <c>Attempt</c> / <c>Terminal</c>) selected from the
///   audit row's <see cref="AuditKind"/>.</description></item>
/// </list>
/// </summary>
/// <remarks>
/// <para>
/// <b>Best-effort contract (alog.md §7):</b> a thrown writer OR a thrown
/// tracking store must never propagate to the calling script. Both emission
/// halves are wrapped in independent try/catch blocks so a SQLite outage on
/// one side cannot starve the other — the failure is logged and the call
/// returns normally.
/// </para>
/// <para>
/// <b>Local-write only — the wire push is the drain actor's job.</b> This
/// forwarder is deliberately synchronous against the two site-local SQLite
/// stores and never pushes to central itself. The site→central transport is
/// now live: <c>ClusterClientSiteAuditClient</c> is the production binding of
/// <see cref="ISiteStreamAuditClient"/> on site roles (with
/// <c>NoOpSiteStreamAuditClient</c> retained only for central/test composition
/// roots). The push happens out-of-band: <see cref="SiteAuditTelemetryActor"/>
/// sweeps the <c>AuditEvent</c> rows this forwarder wrote — they live in SQLite
/// tagged <see cref="AuditForwardState.Pending"/> — and drains them to central
/// via that client. A single drain loop therefore covers both the audit-only
/// emissions and the cached-call emissions this forwarder produces.
/// </para>
/// </remarks>
public sealed class CachedCallTelemetryForwarder : ICachedCallTelemetryForwarder
{
    private readonly IAuditWriter _auditWriter;
    private readonly IOperationTrackingStore? _trackingStore;
    private readonly ILogger<CachedCallTelemetryForwarder> _logger;

    /// <summary>
    /// Construct the forwarder. <paramref name="trackingStore"/> is optional —
    /// when null only the audit half of the packet is emitted, which matches
    /// the M3 Bundle F composition-root contract on Central nodes: the
    /// AuditLog DI surface registers the forwarder unconditionally (mirroring
    /// the IAuditWriter chain) but the site-only tracking store has no central
    /// registration. Production site nodes wire both — the central lazy
    /// resolution is a no-op path kept symmetric with the M2 writer chain.
    /// </summary>
    /// <param name="auditWriter">Writer that receives the audit half of each packet.</param>
    /// <param name="trackingStore">Site-local tracking store, or <c>null</c> to skip the tracking half.</param>
    /// <param name="logger">Logger used to report swallowed emission failures.</param>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="auditWriter"/> or <paramref name="logger"/> is <c>null</c>.
    /// </exception>
    public CachedCallTelemetryForwarder(
        IAuditWriter auditWriter,
        IOperationTrackingStore? trackingStore,
        ILogger<CachedCallTelemetryForwarder> logger)
    {
        _auditWriter = auditWriter ?? throw new ArgumentNullException(nameof(auditWriter));
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
        _trackingStore = trackingStore;
    }

    /// <summary>
    /// Fan out one combined-telemetry packet to the audit writer and the
    /// tracking store. Returns once both halves have been attempted (success
    /// OR logged failure). Emission failures never propagate — exceptions are
    /// caught per-half and logged at warning level so the calling script's
    /// outbound action is not disturbed.
    /// </summary>
    /// <param name="telemetry">The combined audit + operational packet.</param>
    /// <param name="ct">Cancellation token passed through to both stores.</param>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="telemetry"/> is <c>null</c> — the only condition under
    /// which this method throws.
    /// </exception>
    public async Task ForwardAsync(CachedCallTelemetry telemetry, CancellationToken ct = default)
    {
        ArgumentNullException.ThrowIfNull(telemetry);

        // Independent try/catch — a thrown audit writer must not prevent the
        // tracking-store update from running (and vice-versa). Both halves
        // are best-effort.
        await TryEmitAuditAsync(telemetry, ct).ConfigureAwait(false);
        await TryEmitTrackingAsync(telemetry, ct).ConfigureAwait(false);
    }

    /// <summary>
    /// Writes the audit half of the packet through the audit writer, logging
    /// and swallowing any exception.
    /// </summary>
    private async Task TryEmitAuditAsync(CachedCallTelemetry telemetry, CancellationToken ct)
    {
        try
        {
            await _auditWriter.WriteAsync(telemetry.Audit, ct).ConfigureAwait(false);
        }
        catch (Exception ex)
        {
            // alog.md §7 best-effort contract — log and swallow. The audit
            // pipeline's own retry/recovery (RingBufferFallback in the
            // FallbackAuditWriter) handles transient writer failures upstream;
            // a throw bubbling up here means the writer's own swallow contract
            // failed, which is itself best-effort-handled.
            //
            // Null-conditional access: if the failure was caused by a missing
            // audit half, dereferencing it again here would throw out of the
            // catch block and break the never-propagate guarantee.
            _logger.LogWarning(ex,
                "CachedCallTelemetryForwarder: audit emission threw for EventId {EventId} (Kind {Kind}, Status {Status})",
                telemetry.Audit?.EventId, telemetry.Audit?.Kind, telemetry.Audit?.Status);
        }
    }

    /// <summary>
    /// Applies the operational half of the packet to the tracking store,
    /// choosing the store method from the audit row's kind. Does nothing when
    /// no tracking store is wired; logs and swallows any exception.
    /// </summary>
    private async Task TryEmitTrackingAsync(CachedCallTelemetry telemetry, CancellationToken ct)
    {
        if (_trackingStore is null)
        {
            // No site-local tracking store wired — Central composition root or
            // an integration-test host that skipped AddSiteRuntime. Emitting
            // through the audit half is still meaningful; the tracking half
            // is a no-op rather than an error.
            return;
        }

        try
        {
            switch (telemetry.Audit.Kind)
            {
                case AuditKind.CachedSubmit:
                    // Enqueue — insert-if-not-exists with the operational
                    // channel as the kind discriminator. RetryCount is fixed
                    // at 0 by the tracking store's INSERT contract.
                    await _trackingStore.RecordEnqueueAsync(
                        telemetry.Operational.TrackedOperationId,
                        telemetry.Operational.Channel,
                        telemetry.Operational.Target,
                        telemetry.Audit.SourceInstanceId,
                        telemetry.Audit.SourceScript,
                        ct).ConfigureAwait(false);
                    break;

                case AuditKind.ApiCallCached:
                case AuditKind.DbWriteCached:
                    // Attempt — advance retry counter + last-error/HTTP-status.
                    // Terminal rows are guarded by the store's WHERE clause.
                    await _trackingStore.RecordAttemptAsync(
                        telemetry.Operational.TrackedOperationId,
                        telemetry.Operational.Status,
                        telemetry.Operational.RetryCount,
                        telemetry.Operational.LastError,
                        telemetry.Operational.HttpStatus,
                        ct).ConfigureAwait(false);
                    break;

                case AuditKind.CachedResolve:
                    // Terminal — first-write-wins on the resolve flip.
                    await _trackingStore.RecordTerminalAsync(
                        telemetry.Operational.TrackedOperationId,
                        telemetry.Operational.Status,
                        telemetry.Operational.LastError,
                        telemetry.Operational.HttpStatus,
                        ct).ConfigureAwait(false);
                    break;

                default:
                    // Defensive — only the four cached-lifecycle kinds are
                    // expected on this path. Anything else is logged so a
                    // mis-routed packet is visible but never crashes the
                    // forwarder.
                    _logger.LogWarning(
                        "CachedCallTelemetryForwarder: unexpected audit kind {Kind} on tracking emission for EventId {EventId}",
                        telemetry.Audit.Kind, telemetry.Audit.EventId);
                    break;
            }
        }
        catch (Exception ex)
        {
            // Null-conditional access for the same reason as the audit half:
            // the handler must not throw while reporting a failure that a
            // missing operational half may itself have caused.
            _logger.LogWarning(ex,
                "CachedCallTelemetryForwarder: tracking-store emission threw for TrackedOperationId {Id} (Status {Status})",
                telemetry.Operational?.TrackedOperationId, telemetry.Operational?.Status);
        }
    }
}
|
||||
@@ -0,0 +1,117 @@
|
||||
using Akka.Actor;
|
||||
using ScadaLink.Commons.Entities.Audit;
|
||||
using ScadaLink.Commons.Messages.Audit;
|
||||
using ScadaLink.Communication.Grpc;
|
||||
|
||||
namespace ScadaLink.AuditLog.Site.Telemetry;
|
||||
|
||||
/// <summary>
/// The <see cref="ISiteStreamAuditClient"/> used in production by site
/// composition roots. Audit telemetry travels to central over Akka
/// <c>ClusterClient</c> by way of the site's <c>SiteCommunicationActor</c>:
/// that actor forwards the command to <c>/user/central-communication</c>, where
/// the central <c>CentralCommunicationActor</c> Asks the
/// <c>AuditLogIngestActor</c> proxy — the same command/control transport
/// notifications already use. The Host wires this client for site roles;
/// central and test composition roots keep the
/// <see cref="NoOpSiteStreamAuditClient"/> DI default because they have no
/// <c>SiteCommunicationActor</c>.
/// </summary>
/// <remarks>
/// <para>
/// <b>Throw-on-failure contract.</b> Neither an Ask timeout nor a faulted reply
/// (<see cref="Status.Failure"/>) is caught here: both leave the
/// <c>Ingest*Async</c> methods as thrown exceptions rather than being turned
/// into an empty ack. The <see cref="SiteAuditTelemetryActor"/> drain loop
/// treats a thrown exception as transient and leaves the rows <c>Pending</c>
/// for the next tick. An empty ack, by contrast, would be indistinguishable
/// from "zero rows accepted" and would silently lose the retry signal. Task 1
/// confirmed the central receiving end does not collapse an ingest fault into
/// an empty ack either, so a site-side Ask through the whole path faults
/// cleanly on a central-side timeout.
/// </para>
/// <para>
/// Batches reach this client as proto DTOs (<see cref="AuditEventBatch"/> /
/// <see cref="CachedTelemetryBatch"/>) because the
/// <see cref="SiteAuditTelemetryActor"/> builds them with
/// <see cref="AuditEventDtoMapper.ToDto"/>. They are converted back here into
/// the <see cref="AuditEvent"/> / <see cref="SiteCall"/> entities the Akka
/// command messages carry — the same DTO→entity translation the
/// <c>SiteStreamGrpcServer</c> performs for the gRPC reconciliation path.
/// </para>
/// </remarks>
public sealed class ClusterClientSiteAuditClient : ISiteStreamAuditClient
{
    private readonly IActorRef _siteCommunicationActor;
    private readonly TimeSpan _askTimeout;

    /// <summary>Creates the client around the site's communication actor.</summary>
    /// <param name="siteCommunicationActor">
    /// The site's <c>SiteCommunicationActor</c> — it forwards the ingest command
    /// over the registered central ClusterClient and routes the reply back to
    /// this client's Ask.
    /// </param>
    /// <param name="askTimeout">
    /// Ask timeout for the round-trip to central. On expiry the Ask throws
    /// <see cref="Akka.Actor.AskTimeoutException"/>, which the drain loop treats
    /// as transient (rows stay <c>Pending</c>).
    /// </param>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="siteCommunicationActor"/> is <c>null</c>.
    /// </exception>
    public ClusterClientSiteAuditClient(IActorRef siteCommunicationActor, TimeSpan askTimeout)
    {
        _siteCommunicationActor = siteCommunicationActor
            ?? throw new ArgumentNullException(nameof(siteCommunicationActor));
        _askTimeout = askTimeout;
    }

    /// <inheritdoc/>
    public async Task<IngestAck> IngestAuditEventsAsync(AuditEventBatch batch, CancellationToken ct)
    {
        if (batch is null)
        {
            throw new ArgumentNullException(nameof(batch));
        }

        var command = new IngestAuditEventsCommand(MapEvents(batch));

        // No try/catch on purpose: Ask<T> throws AskTimeoutException on timeout
        // and rethrows a Status.Failure's inner cause. Either way the drain
        // loop sees an exception and keeps the rows Pending.
        var centralReply = await _siteCommunicationActor
            .Ask<IngestAuditEventsReply>(command, _askTimeout, ct)
            .ConfigureAwait(false);

        return ToAck(centralReply.AcceptedEventIds);
    }

    /// <inheritdoc/>
    public async Task<IngestAck> IngestCachedTelemetryAsync(CachedTelemetryBatch batch, CancellationToken ct)
    {
        if (batch is null)
        {
            throw new ArgumentNullException(nameof(batch));
        }

        var command = new IngestCachedTelemetryCommand(MapPackets(batch));

        // Same throw-on-failure contract as IngestAuditEventsAsync. Note the
        // reply type: IngestCachedTelemetryReply (the central dual-write
        // reply), which is distinct from IngestAuditEventsReply.
        var centralReply = await _siteCommunicationActor
            .Ask<IngestCachedTelemetryReply>(command, _askTimeout, ct)
            .ConfigureAwait(false);

        return ToAck(centralReply.AcceptedEventIds);
    }

    /// <summary>Converts every audit-event DTO in the batch back into its entity, in order.</summary>
    private static List<AuditEvent> MapEvents(AuditEventBatch batch)
    {
        var mapped = new List<AuditEvent>(batch.Events.Count);
        foreach (var eventDto in batch.Events)
        {
            mapped.Add(AuditEventDtoMapper.FromDto(eventDto));
        }

        return mapped;
    }

    /// <summary>
    /// Converts every packet in the batch into a <see cref="CachedTelemetryEntry"/>
    /// pairing the mapped audit event with the mapped site call, in order.
    /// </summary>
    private static List<CachedTelemetryEntry> MapPackets(CachedTelemetryBatch batch)
    {
        var mapped = new List<CachedTelemetryEntry>(batch.Packets.Count);
        foreach (var packet in batch.Packets)
        {
            var auditEvent = AuditEventDtoMapper.FromDto(packet.AuditEvent);
            var operational = SiteCallDtoMapper.FromDto(packet.Operational);
            mapped.Add(new CachedTelemetryEntry(auditEvent, operational));
        }

        return mapped;
    }

    /// <summary>
    /// Builds an <see cref="IngestAck"/> listing each accepted event id in its
    /// default string form.
    /// </summary>
    private static IngestAck ToAck(IReadOnlyList<Guid> acceptedEventIds)
    {
        var result = new IngestAck();
        foreach (var eventId in acceptedEventIds)
        {
            result.AcceptedEventIds.Add(eventId.ToString());
        }

        return result;
    }
}
|
||||
@@ -0,0 +1,42 @@
|
||||
using ScadaLink.Communication.Grpc;
|
||||
|
||||
namespace ScadaLink.AuditLog.Site.Telemetry;
|
||||
|
||||
/// <summary>
/// Mockable abstraction over the central site-audit push surface that
/// <see cref="SiteAuditTelemetryActor"/> uses to forward both
/// <see cref="AuditEventBatch"/> and <see cref="CachedTelemetryBatch"/>
/// payloads. The production implementation is
/// <see cref="ClusterClientSiteAuditClient"/> — a ClusterClient-based client,
/// wired in the Host for site roles, that forwards batches to central via the
/// site's <c>SiteCommunicationActor</c>. Unit tests substitute via NSubstitute
/// against this interface so the actor never needs a live transport.
/// </summary>
public interface ISiteStreamAuditClient
{
    /// <summary>
    /// Forwards <paramref name="batch"/> to the central audit-ingest path. The
    /// returned <see cref="IngestAck"/> carries the <c>accepted_event_ids</c>
    /// the actor will flip to
    /// <see cref="ScadaLink.Commons.Types.Enums.AuditForwardState.Forwarded"/>
    /// in the site SQLite queue.
    /// </summary>
    /// <param name="batch">The audit events to push to central.</param>
    /// <param name="ct">Cancellation token for the push.</param>
    /// <returns>The ack listing the event ids central accepted.</returns>
    Task<IngestAck> IngestAuditEventsAsync(AuditEventBatch batch, CancellationToken ct);

    /// <summary>
    /// Forwards the combined <see cref="CachedTelemetryBatch"/> (Audit Log #23)
    /// to the central cached-telemetry ingest path. Each packet carries both the
    /// audit row and the operational <c>SiteCalls</c> upsert; central writes both
    /// in a single MS SQL transaction. Returns the same <see cref="IngestAck"/>
    /// shape as <see cref="IngestAuditEventsAsync"/> so the
    /// <see cref="SiteAuditTelemetryActor"/> drain loop can flip the underlying
    /// audit rows to
    /// <see cref="ScadaLink.Commons.Types.Enums.AuditForwardState.Forwarded"/>
    /// once central has acknowledged them.
    /// </summary>
    /// <param name="batch">The combined audit + operational packets to push to central.</param>
    /// <param name="ct">Cancellation token for the push.</param>
    /// <returns>The ack listing the event ids central accepted.</returns>
    /// <remarks>
    /// The production <see cref="ClusterClientSiteAuditClient"/> forwards over
    /// the ClusterClient transport; the <see cref="NoOpSiteStreamAuditClient"/>
    /// DI default (used by central and test composition roots) returns an empty
    /// ack so no rows are flipped.
    /// </remarks>
    Task<IngestAck> IngestCachedTelemetryAsync(CachedTelemetryBatch batch, CancellationToken ct);
}
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user