Instrument the historian plugin with runtime query health counters and read-only cluster failover so operators can detect silent query degradation and keep serving history when a single cluster node goes down

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Joseph Doherty
2026-04-13 14:08:32 -04:00
parent 4fe37fd1b7
commit 8f340553d9
20 changed files with 1526 additions and 32 deletions

View File

@@ -19,7 +19,10 @@ namespace ZB.MOM.WW.LmxOpcUa.Historian.Aveva.Tests
ServerName = "test-historian",
Port = 32568,
IntegratedSecurity = true,
CommandTimeoutSeconds = 5
CommandTimeoutSeconds = 5,
// Zero cooldown so reconnect-after-error tests can retry through the cluster picker
// on the very next call, matching the pre-cluster behavior they were written against.
FailureCooldownSeconds = 0
};
[Fact]
@@ -174,5 +177,105 @@ namespace ZB.MOM.WW.LmxOpcUa.Historian.Aveva.Tests
// Dispose should handle the null connection gracefully
Should.NotThrow(() => ds.Dispose());
}
// ---------- HistorianHealthSnapshot instrumentation ----------
[Fact]
public void GetHealthSnapshot_FreshDataSource_ReportsZeroCounters()
{
    // Arrange: a data source on which no read has been issued yet.
    var connectionFactory = new FakeHistorianConnectionFactory();
    var dataSource = new HistorianDataSource(DefaultConfig, connectionFactory);

    // Act
    var health = dataSource.GetHealthSnapshot();

    // Assert: every counter is zero, no timestamps or error text are recorded,
    // and neither connection is reported as open.
    health.TotalQueries.ShouldBe(0);
    health.TotalSuccesses.ShouldBe(0);
    health.TotalFailures.ShouldBe(0);
    health.ConsecutiveFailures.ShouldBe(0);
    health.LastSuccessTime.ShouldBeNull();
    health.LastFailureTime.ShouldBeNull();
    health.LastError.ShouldBeNull();
    health.ProcessConnectionOpen.ShouldBeFalse();
    health.EventConnectionOpen.ShouldBeFalse();
}
[Fact]
public void GetHealthSnapshot_AfterConnectionFailure_RecordsFailure()
{
    // Arrange: the fake factory is handed an exception via ConnectException
    // (presumably surfaced when the data source tries to connect).
    var connectError = new InvalidOperationException("Connection refused");
    var failingFactory = new FakeHistorianConnectionFactory
    {
        ConnectException = connectError
    };
    var dataSource = new HistorianDataSource(DefaultConfig, failingFactory);

    // Act: one raw read over the last hour. The call is not wrapped in an
    // exception assertion; the failure is checked through the health snapshot.
    var windowStart = DateTime.UtcNow.AddHours(-1);
    var windowEnd = DateTime.UtcNow;
    var pendingRead = dataSource.ReadRawAsync("Tag1", windowStart, windowEnd, 100);
    pendingRead.GetAwaiter().GetResult();

    var health = dataSource.GetHealthSnapshot();

    // Assert: exactly one query, counted as a failure, with the error text retained.
    health.TotalQueries.ShouldBe(1);
    health.TotalFailures.ShouldBe(1);
    health.TotalSuccesses.ShouldBe(0);
    health.ConsecutiveFailures.ShouldBe(1);
    health.LastFailureTime.ShouldNotBeNull();
    health.LastError.ShouldContain("Connection refused");
    health.ProcessConnectionOpen.ShouldBeFalse();
}
[Fact]
public void GetHealthSnapshot_AfterMultipleFailures_IncrementsConsecutive()
{
    // Arrange: every connection attempt is configured with the same exception.
    const int attempts = 4;
    var failingFactory = new FakeHistorianConnectionFactory
    {
        ConnectException = new InvalidOperationException("boom")
    };
    var dataSource = new HistorianDataSource(DefaultConfig, failingFactory);

    // Act: issue the same raw read repeatedly, waiting for each one to finish
    // before starting the next.
    for (var attempt = 0; attempt < attempts; attempt++)
    {
        var windowStart = DateTime.UtcNow.AddHours(-1);
        var windowEnd = DateTime.UtcNow;
        dataSource.ReadRawAsync("Tag1", windowStart, windowEnd, 100)
            .GetAwaiter().GetResult();
    }

    var health = dataSource.GetHealthSnapshot();

    // Assert: total and consecutive failure counts both match the number of
    // attempts, with no successes in between.
    health.TotalFailures.ShouldBe(4);
    health.ConsecutiveFailures.ShouldBe(4);
    health.TotalSuccesses.ShouldBe(0);
}
[Fact]
public void GetHealthSnapshot_AcrossReadPaths_CountsAllFailures()
{
    // Arrange
    var failingFactory = new FakeHistorianConnectionFactory
    {
        ConnectException = new InvalidOperationException("sdk down")
    };
    var dataSource = new HistorianDataSource(DefaultConfig, failingFactory);

    // Act: exercise each of the four read entry points once, in sequence.
    var rawRead = dataSource.ReadRawAsync(
        "Tag1", DateTime.UtcNow.AddHours(-1), DateTime.UtcNow, 10);
    rawRead.GetAwaiter().GetResult();

    var aggregateRead = dataSource.ReadAggregateAsync(
        "Tag1", DateTime.UtcNow.AddHours(-1), DateTime.UtcNow, 60000, "Average");
    aggregateRead.GetAwaiter().GetResult();

    var atTimeRead = dataSource.ReadAtTimeAsync("Tag1", new[] { DateTime.UtcNow });
    atTimeRead.GetAwaiter().GetResult();

    var eventsRead = dataSource.ReadEventsAsync(
        null, DateTime.UtcNow.AddHours(-1), DateTime.UtcNow, 10);
    eventsRead.GetAwaiter().GetResult();

    var health = dataSource.GetHealthSnapshot();

    // Assert: all four reads are counted, all as failures, and the last
    // error text still carries the configured exception message.
    health.TotalFailures.ShouldBe(4);
    health.TotalQueries.ShouldBe(4);
    health.LastError.ShouldContain("sdk down");
}
[Fact]
public void GetHealthSnapshot_ErrorMessageCarriesReadPath()
{
    // Arrange
    var failingFactory = new FakeHistorianConnectionFactory
    {
        ConnectException = new InvalidOperationException("unreachable")
    };
    var dataSource = new HistorianDataSource(DefaultConfig, failingFactory);

    // Act: a single aggregate read over the last hour.
    var windowStart = DateTime.UtcNow.AddHours(-1);
    var windowEnd = DateTime.UtcNow;
    var aggregateRead = dataSource.ReadAggregateAsync(
        "Tag1", windowStart, windowEnd, 60000, "Average");
    aggregateRead.GetAwaiter().GetResult();

    var health = dataSource.GetHealthSnapshot();

    // Assert: the recorded error is prefixed with the name of the read path that failed.
    health.LastError.ShouldStartWith("aggregate:");
}
}
}