Phase 1 Streams B–E scaffold + Phase 2 Streams A–C scaffold — 8 new projects with ~70 new tests, all green alongside the 494 v1 IntegrationTests baseline (parity preserved: no v1 tests broken; legacy OtOpcUa.Host untouched). Phase 1 finish: Configuration project (16 entities + 10 enums + DbContext + DesignTimeDbContextFactory + InitialSchema/StoredProcedures/AuthorizationGrants migrations — 8 procs including sp_PublishGeneration with MERGE on ExternalIdReservation per decision #124, sp_RollbackToGeneration cloning rows into a new published generation, sp_ValidateDraft with cross-cluster-namespace + EquipmentUuid-immutability + ZTag/SAPID reservation pre-flight, sp_ComputeGenerationDiff with CHECKSUM-based row signature — plus OtOpcUaNode/OtOpcUaAdmin SQL roles with EXECUTE grants scoped to per-principal-class proc sets and DENY UPDATE/DELETE/INSERT/SELECT on dbo schema); managed DraftValidator covering UNS segment regex, path length, EquipmentUuid immutability across generations, same-cluster namespace binding (decision #122), reservation pre-flight, EquipmentId derivation (decision #125), driver↔namespace compatibility — returning every failing rule in one pass; LiteDB local cache with round-trip + ring pruning + corruption-fast-fail; GenerationApplier with per-entity Added/Removed/Modified diff and dependency-ordered callbacks (namespace → driver → device → equipment → poll-group → tag, Removed before Added); Core project with GenericDriverNodeManager (scaffold for the Phase 2 Galaxy port) and DriverHost lifecycle registry; Server project using Microsoft.Extensions.Hosting BackgroundService replacing TopShelf, with NodeBootstrap that falls back to LiteDB cache when the central DB is unreachable (decision #79); Admin project scaffolded as Blazor Server with Bootstrap 5 sidebar layout, cookie auth, three admin roles (ConfigViewer/ConfigEditor/FleetAdmin), Cluster + Generation services fronting the stored procs. Phase 2 scaffold: Driver.Galaxy.Shared (netstandard2.0) with full MessagePack IPC contract surface — Hello version negotiation, Open/CloseSession, Heartbeat, DiscoverHierarchy + GalaxyObjectInfo/GalaxyAttributeInfo, Read/WriteValues, Subscribe/Unsubscribe/OnDataChange, AlarmSubscribe/Event/Ack, HistoryRead, HostConnectivityStatus, Recycle — plus length-prefixed framing (decision #28) with a 16 MiB cap and thread-safe FrameWriter/FrameReader; Driver.Galaxy.Host (net48) implementing the Tier C cross-cutting protections from driver-stability.md — strict PipeAcl (allow configured server SID only, explicit deny on LocalSystem + Administrators), PipeServer with caller-SID verification via pipe.RunAsClient + WindowsIdentity.GetCurrent and per-process shared-secret Hello, Galaxy-specific MemoryWatchdog (warn at max(1.5×baseline, +200 MB), soft-recycle at max(2×baseline, +200 MB), hard ceiling 1.5 GB, slope ≥5 MB/min over 30-min rolling window), RecyclePolicy (1 soft recycle per hour cap + 03:00 local daily scheduled), PostMortemMmf (1000-entry ring buffer in %ProgramData%\OtOpcUa\driver-postmortem\galaxy.mmf, survives hard crash, readable cross-process), MxAccessHandle : SafeHandle (ReleaseHandle loops Marshal.ReleaseComObject until refcount=0 then calls optional unregister callback), StaPump with responsiveness probe (BlockingCollection dispatcher for Phase 1 — real Win32 GetMessage/DispatchMessage pump slots in with the same semantics when the Galaxy code lift happens), IsExternalInit shim for init setters on .NET 4.8; Driver.Galaxy.Proxy (net10) implementing IDriver + ITagDiscovery forwarding over the IPC channel with MX data-type and security-classification mapping, plus Supervisor pieces — Backoff (5s → 15s → 60s capped, reset-on-stable-run), CircuitBreaker (3 crashes per 5 min opens; 1h → 4h → manual cooldown escalation; sticky alert doesn't auto-clear), HeartbeatMonitor (2s cadence, 3 consecutive misses = host dead per driver-stability.md). Infrastructure: docker SQL Server remapped to host port 14330 to coexist with the native MSSQL14 Galaxy ZB DB instance on 1433; NuGetAuditSuppress applied per-project for two System.Security.Cryptography.Xml advisories that only reach via EF Core Design with PrivateAssets=all (fix ships in 11.0.0-preview); .slnx gains 14 project registrations. Deferred with explicit TODOs in docs/v2/implementation/phase-2-partial-exit-evidence.md: Phase 1 Stream E Admin UI pages (Generations listing + draft-diff-publish, Equipment CRUD with OPC 40010 fields, UNS Areas/Lines tabs, ACLs + permission simulator, Generic JSON config editor, SignalR real-time, Release-Reservation + Merge-Equipment workflows, LDAP login page, AppServer smoke test per decision #142), Phase 2 Stream D (Galaxy MXAccess code lift out of legacy OtOpcUa.Host, dual-service installer, appsettings → DriverConfig migration script, legacy Host deletion — blocked by parity), Phase 2 Stream E (v1 IntegrationTests against v2 topology, Client.CLI walkthrough diff, four 2026-04-13 stability findings regression tests, adversarial review — requires live MXAccess runtime).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Joseph Doherty
2026-04-17 21:35:25 -04:00
parent fc0ce36308
commit 01fd90c178
128 changed files with 12352 additions and 4 deletions

View File

@@ -0,0 +1,64 @@
using System;
using System.Collections.Generic;
namespace ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Host.Stability;
/// <summary>
/// Galaxy-specific RSS watchdog per <c>driver-stability.md §"Memory Watchdog Thresholds"</c>.
/// Baseline-relative + absolute caps. Sustained-slope detection uses a rolling 30-min window.
/// Pluggable RSS source keeps it unit-testable.
/// </summary>
public sealed class MemoryWatchdog
{
/// <summary>Absolute hard ceiling — process is force-killed above this.</summary>
public long HardCeilingBytes { get; init; } = 1_500L * 1024 * 1024;
/// <summary>Sustained slope (bytes/min) above which soft recycle is scheduled.</summary>
public long SustainedSlopeBytesPerMinute { get; init; } = 5L * 1024 * 1024;
public TimeSpan SlopeWindow { get; init; } = TimeSpan.FromMinutes(30);
private readonly long _baselineBytes;
private readonly Queue<RssSample> _samples = new();
public MemoryWatchdog(long baselineBytes)
{
_baselineBytes = baselineBytes;
}
/// <summary>Called every 30s with the current RSS. Returns the action the supervisor should take.</summary>
public WatchdogAction Sample(long rssBytes, DateTime utcNow)
{
_samples.Enqueue(new RssSample(utcNow, rssBytes));
while (_samples.Count > 0 && utcNow - _samples.Peek().TimestampUtc > SlopeWindow)
_samples.Dequeue();
if (rssBytes >= HardCeilingBytes)
return WatchdogAction.HardKill;
var softThreshold = Math.Max(_baselineBytes * 2, _baselineBytes + 200L * 1024 * 1024);
var warnThreshold = Math.Max((long)(_baselineBytes * 1.5), _baselineBytes + 200L * 1024 * 1024);
if (rssBytes >= softThreshold) return WatchdogAction.SoftRecycle;
if (rssBytes >= warnThreshold) return WatchdogAction.Warn;
if (_samples.Count >= 2)
{
var oldest = _samples.Peek();
var span = (utcNow - oldest.TimestampUtc).TotalMinutes;
if (span >= SlopeWindow.TotalMinutes * 0.9) // need ~full window to trust the slope
{
var delta = rssBytes - oldest.RssBytes;
var bytesPerMin = delta / span;
if (bytesPerMin >= SustainedSlopeBytesPerMinute)
return WatchdogAction.SoftRecycle;
}
}
return WatchdogAction.None;
}
private readonly record struct RssSample(DateTime TimestampUtc, long RssBytes);
}
public enum WatchdogAction { None, Warn, SoftRecycle, HardKill }

View File

@@ -0,0 +1,121 @@
using System;
using System.IO;
using System.IO.MemoryMappedFiles;
using System.Runtime.InteropServices;
using System.Text;
namespace ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Host.Stability;
/// <summary>
/// Ring-buffer of the last <see cref="Capacity"/> IPC operations, written into a
/// memory-mapped file. On hard crash the supervisor reads the MMF after the corpse is gone
/// to see what was in flight. Thread-safe for the single-writer, multi-reader pattern.
/// </summary>
/// <remarks>
/// File layout:
/// <code>
/// [16-byte header: magic(4) | version(4) | capacity(4) | writeIndex(4)]
/// [capacity × 256-byte entries: each is [8-byte utcUnixMs | 8-byte opKind | 240-byte UTF-8 message]]
/// </code>
/// </remarks>
public sealed class PostMortemMmf : IDisposable
{
private const int Magic = 0x4F505043; // 'OPPC'
private const int Version = 1;
private const int HeaderBytes = 16;
public const int EntryBytes = 256;
private const int MessageOffset = 16;
private const int MessageCapacity = EntryBytes - MessageOffset;
public int Capacity { get; }
public string Path { get; }
private readonly MemoryMappedFile _mmf;
private readonly MemoryMappedViewAccessor _accessor;
private readonly object _writeGate = new();
public PostMortemMmf(string path, int capacity = 1000)
{
if (capacity <= 0) throw new ArgumentOutOfRangeException(nameof(capacity));
Capacity = capacity;
Path = path;
var fileBytes = HeaderBytes + capacity * EntryBytes;
Directory.CreateDirectory(System.IO.Path.GetDirectoryName(path)!);
var fs = new FileStream(path, FileMode.OpenOrCreate, FileAccess.ReadWrite, FileShare.Read);
fs.SetLength(fileBytes);
_mmf = MemoryMappedFile.CreateFromFile(fs, null, fileBytes,
MemoryMappedFileAccess.ReadWrite, HandleInheritability.None, leaveOpen: false);
_accessor = _mmf.CreateViewAccessor(0, fileBytes, MemoryMappedFileAccess.ReadWrite);
// Initialize header if blank/garbage.
if (_accessor.ReadInt32(0) != Magic)
{
_accessor.Write(0, Magic);
_accessor.Write(4, Version);
_accessor.Write(8, capacity);
_accessor.Write(12, 0); // writeIndex
}
}
public void Write(long opKind, string message)
{
lock (_writeGate)
{
var idx = _accessor.ReadInt32(12);
var offset = HeaderBytes + idx * EntryBytes;
_accessor.Write(offset + 0, DateTimeOffset.UtcNow.ToUnixTimeMilliseconds());
_accessor.Write(offset + 8, opKind);
var msgBytes = Encoding.UTF8.GetBytes(message ?? string.Empty);
var copy = Math.Min(msgBytes.Length, MessageCapacity - 1);
_accessor.WriteArray(offset + MessageOffset, msgBytes, 0, copy);
_accessor.Write(offset + MessageOffset + copy, (byte)0); // null terminator
var next = (idx + 1) % Capacity;
_accessor.Write(12, next);
}
}
/// <summary>Reads all entries in order (oldest → newest). Safe to call from another process.</summary>
public PostMortemEntry[] ReadAll()
{
var magic = _accessor.ReadInt32(0);
if (magic != Magic) return [];
var capacity = _accessor.ReadInt32(8);
var writeIndex = _accessor.ReadInt32(12);
var entries = new PostMortemEntry[capacity];
var count = 0;
for (var i = 0; i < capacity; i++)
{
var slot = (writeIndex + i) % capacity;
var offset = HeaderBytes + slot * EntryBytes;
var ts = _accessor.ReadInt64(offset + 0);
if (ts == 0) continue; // unwritten
var op = _accessor.ReadInt64(offset + 8);
var msgBuf = new byte[MessageCapacity];
_accessor.ReadArray(offset + MessageOffset, msgBuf, 0, MessageCapacity);
var nulTerm = Array.IndexOf<byte>(msgBuf, 0);
var msg = Encoding.UTF8.GetString(msgBuf, 0, nulTerm < 0 ? MessageCapacity : nulTerm);
entries[count++] = new PostMortemEntry(ts, op, msg);
}
Array.Resize(ref entries, count);
return entries;
}
public void Dispose()
{
_accessor.Dispose();
_mmf.Dispose();
}
}
public readonly record struct PostMortemEntry(long UtcUnixMs, long OpKind, string Message);

View File

@@ -0,0 +1,40 @@
using System;
using System.Collections.Generic;
namespace ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Host.Stability;
/// <summary>
/// Frequency-capped soft-recycle decision per <c>driver-stability.md §"Recycle Policy"</c>.
/// Default cap: 1 soft recycle per hour. Scheduled recycle at 03:00 local; supervisor reads
/// <see cref="ShouldSoftRecycleScheduled"/> to decide.
/// </summary>
public sealed class RecyclePolicy
{
public TimeSpan SoftRecycleCap { get; init; } = TimeSpan.FromHours(1);
public int DailyRecycleHourLocal { get; init; } = 3;
private readonly List<DateTime> _recentRecyclesUtc = new();
/// <summary>Returns true if a soft recycle would be allowed under the frequency cap.</summary>
public bool TryRequestSoftRecycle(DateTime utcNow, out string? reason)
{
_recentRecyclesUtc.RemoveAll(t => utcNow - t > SoftRecycleCap);
if (_recentRecyclesUtc.Count > 0)
{
reason = $"soft-recycle frequency cap: last recycle was {(utcNow - _recentRecyclesUtc[_recentRecyclesUtc.Count - 1]).TotalMinutes:F1} min ago";
return false;
}
_recentRecyclesUtc.Add(utcNow);
reason = null;
return true;
}
public bool ShouldSoftRecycleScheduled(DateTime localNow, ref DateTime lastScheduledDateLocal)
{
if (localNow.Hour != DailyRecycleHourLocal) return false;
if (localNow.Date <= lastScheduledDateLocal.Date) return false;
lastScheduledDateLocal = localNow.Date;
return true;
}
}