fix(configuration): resolve Medium code-review findings (Configuration-002, -003, -006, -009)

Configuration-002: sp_PublishGeneration is transaction-nesting aware
(BEGIN TRANSACTION vs SAVE TRANSACTION on @@TRANCOUNT) so a caller's outer
transaction survives a publish failure; sp_ValidateDraft wrapped in TRY/CATCH.
Configuration-003: ValidatePathLength uses the cluster's actual Enterprise/Site
lengths when available, falling back to the conservative approximation.
Configuration-006: ResilientConfigReader treats a command-timeout
TaskCanceledException as a fault (not caller cancellation) and falls back.
Configuration-009: removed the checked-in plaintext sa connection string;
CreateDbContext now requires OTOPCUA_CONFIG_CONNECTION.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Joseph Doherty
2026-05-22 08:13:27 -04:00
parent 7e54e1e4a0
commit c126fc7a7d
9 changed files with 274 additions and 31 deletions

View File

@@ -5,19 +5,27 @@ namespace ZB.MOM.WW.OtOpcUa.Configuration;
/// <summary>
/// Used by <c>dotnet ef</c> at design time (migrations, scaffolding). Reads the connection string
/// from the <c>OTOPCUA_CONFIG_CONNECTION</c> environment variable, falling back to the local dev
/// container on <c>localhost:1433</c>.
/// from the <c>OTOPCUA_CONFIG_CONNECTION</c> environment variable.
/// </summary>
/// <remarks>
/// Set the variable before running migration commands, e.g.:
/// <code>
/// $env:OTOPCUA_CONFIG_CONNECTION = "Server=10.100.0.35,14330;Database=OtOpcUaConfig;Trusted_Connection=True;TrustServerCertificate=True;"
/// dotnet ef migrations add MyMigration --project src/Core/ZB.MOM.WW.OtOpcUa.Configuration
/// </code>
/// No credential is embedded in source. Do not add a plaintext password as a fallback.
/// </remarks>
public sealed class DesignTimeDbContextFactory : IDesignTimeDbContextFactory<OtOpcUaConfigDbContext>
{
// Host-port 14330 avoids collision with the native MSSQL14 instance on 1433 (Galaxy "ZB" DB).
private const string DefaultConnectionString =
"Server=localhost,14330;Database=OtOpcUaConfig;User Id=sa;Password=OtOpcUaDev_2026!;TrustServerCertificate=True;Encrypt=False;";
public OtOpcUaConfigDbContext CreateDbContext(string[] args)
{
var connection = Environment.GetEnvironmentVariable("OTOPCUA_CONFIG_CONNECTION")
?? DefaultConnectionString;
var connection = Environment.GetEnvironmentVariable("OTOPCUA_CONFIG_CONNECTION");
if (string.IsNullOrWhiteSpace(connection))
throw new InvalidOperationException(
"OTOPCUA_CONFIG_CONNECTION is not set. " +
"Export the variable before running 'dotnet ef' commands. Example: " +
"Server=10.100.0.35,14330;Database=OtOpcUaConfig;Trusted_Connection=True;TrustServerCertificate=True;");
var options = new DbContextOptionsBuilder<OtOpcUaConfigDbContext>()
.UseSqlServer(connection, sql => sql.MigrationsAssembly(typeof(OtOpcUaConfigDbContext).Assembly.FullName))

View File

@@ -48,7 +48,13 @@ public sealed class ResilientConfigReader
UseJitter = true,
Delay = TimeSpan.FromMilliseconds(100),
MaxDelay = TimeSpan.FromSeconds(1),
ShouldHandle = new PredicateBuilder().Handle<Exception>(ex => ex is not OperationCanceledException),
// Handle ALL exceptions including OperationCanceledException. A SQL command-level
// timeout surfaces as TaskCanceledException (derives from OperationCanceledException)
// when the caller's token is NOT cancelled, and must be retried just like any other
// transient error. Polly itself checks the cancellation token between retries and
// stops with OperationCanceledException on genuine caller cancellation regardless of
// this predicate.
ShouldHandle = new PredicateBuilder().Handle<Exception>(),
});
}
@@ -76,7 +82,11 @@ public sealed class ResilientConfigReader
_staleFlag.MarkFresh();
return result;
}
catch (Exception ex) when (ex is not OperationCanceledException)
// Catch all exceptions that are NOT genuine caller cancellations. A SQL command-level
// timeout surfaces as TaskCanceledException (derives from OperationCanceledException)
// but the caller's token is NOT cancelled — we must fall back to the sealed cache for
// that case, not propagate. Only rethrow if the caller actually requested cancellation.
catch (Exception ex) when (ex is not OperationCanceledException || !cancellationToken.IsCancellationRequested)
{
_logger.LogWarning(ex, "Central-DB read failed after retries; falling back to sealed cache for cluster {ClusterId}", clusterId);
// GenerationCacheUnavailableException surfaces intentionally — fails the caller's

View File

@@ -270,7 +270,22 @@ BEGIN
SET NOCOUNT ON;
SET XACT_ABORT ON;
BEGIN TRANSACTION;
-- Transaction-nesting awareness: if a caller (e.g. sp_RollbackToGeneration) already
-- holds a transaction, we use SAVE TRANSACTION so our failure path rolls back only to
-- the savepoint instead of issuing a bare ROLLBACK that wipes the caller's transaction
-- (which sets @@TRANCOUNT = 0 and causes error 3902 on the caller's subsequent COMMIT).
DECLARE @OwnsTxn bit = 0;
DECLARE @SaveName nvarchar(32) = N'sp_PublishGeneration';
IF @@TRANCOUNT = 0
BEGIN
BEGIN TRANSACTION;
SET @OwnsTxn = 1;
END
ELSE
BEGIN
SAVE TRANSACTION sp_PublishGeneration;
END
DECLARE @Lock nvarchar(255) = N'OtOpcUa_Publish_' + @ClusterId;
DECLARE @LockResult int;
@@ -278,20 +293,22 @@ BEGIN
IF @LockResult < 0
BEGIN
RAISERROR('PublishConflict: another publish is in progress for cluster %s', 16, 1, @ClusterId);
ROLLBACK;
IF @OwnsTxn = 1 ROLLBACK;
ELSE ROLLBACK TRANSACTION sp_PublishGeneration;
RETURN;
END
-- sp_ValidateDraft signals every rejection with RAISERROR(..., 16, 1) — a severity-16 error is
-- NOT batch-aborting and SET XACT_ABORT ON does not abort the transaction for it, so without a
-- TRY/CATCH control would return here and the draft would publish despite failed validation.
-- Catch the validation error, roll back the publish transaction, and re-raise so the caller sees
-- the real validation failure.
-- Catch the validation error, roll back the publish transaction (only to our savepoint when a
-- caller owns the outer transaction), and re-raise so the caller sees the real validation failure.
BEGIN TRY
EXEC dbo.sp_ValidateDraft @DraftGenerationId = @DraftGenerationId;
END TRY
BEGIN CATCH
IF @@TRANCOUNT > 0 ROLLBACK;
IF @OwnsTxn = 1 ROLLBACK;
ELSE ROLLBACK TRANSACTION sp_PublishGeneration;
THROW;
END CATCH
@@ -324,15 +341,16 @@ BEGIN
IF @@ROWCOUNT = 0
BEGIN
RAISERROR('Draft %I64d for cluster %s not found (was it already published?)', 16, 1, @DraftGenerationId, @ClusterId);
ROLLBACK;
RAISERROR('Draft %I64d for cluster %s not in Draft status (was it already published?)', 16, 1, @DraftGenerationId, @ClusterId);
IF @OwnsTxn = 1 ROLLBACK;
ELSE ROLLBACK TRANSACTION sp_PublishGeneration;
RETURN;
END
INSERT dbo.ConfigAuditLog (Principal, EventType, ClusterId, GenerationId)
VALUES (SUSER_SNAME(), 'Published', @ClusterId, @DraftGenerationId);
COMMIT;
IF @OwnsTxn = 1 COMMIT;
END
";

View File

@@ -11,6 +11,18 @@ public sealed class DraftSnapshot
public required long GenerationId { get; init; }
public required string ClusterId { get; init; }
/// <summary>
/// Cluster's Enterprise segment (UNS level 1). When set, <see cref="DraftValidator"/> uses
/// the actual length for path-length checks instead of a conservative 32-char upper bound.
/// </summary>
public string? Enterprise { get; init; }
/// <summary>
/// Cluster's Site segment (UNS level 2). When set, <see cref="DraftValidator"/> uses the
/// actual length for path-length checks instead of a conservative 32-char upper bound.
/// </summary>
public string? Site { get; init; }
public IReadOnlyList<Namespace> Namespaces { get; init; } = [];
public IReadOnlyList<DriverInstance> DriverInstances { get; init; } = [];
public IReadOnlyList<Device> Devices { get; init; } = [];

View File

@@ -59,8 +59,13 @@ public static class DraftValidator
/// <summary>Cluster.Enterprise + Site + area + line + equipment + 4 slashes ≤ 200 chars.</summary>
private static void ValidatePathLength(DraftSnapshot draft, List<ValidationError> errors)
{
// The cluster row isn't in the snapshot — we assume caller pre-validated Enterprise+Site
// length and bound them as constants <= 64 chars each. Here we validate the dynamic portion.
// Use actual Enterprise/Site lengths when the snapshot carries them (populated by
// DraftValidationService from the ServerCluster row). Fall back to a conservative
// 32-char upper bound per segment when not supplied — over-penalises short values
// but never under-penalises long ones, which is acceptable for the fallback case.
var enterpriseLen = draft.Enterprise?.Length ?? 32;
var siteLen = draft.Site?.Length ?? 32;
var areaById = draft.UnsAreas.ToDictionary(a => a.UnsAreaId);
var lineById = draft.UnsLines.ToDictionary(l => l.UnsLineId);
@@ -69,8 +74,7 @@ public static class DraftValidator
if (!lineById.TryGetValue(eq.UnsLineId!, out var line)) continue;
if (!areaById.TryGetValue(line.UnsAreaId, out var area)) continue;
// rough upper bound: Enterprise+Site at most 32+32; add dynamic segments + 4 slashes
var len = 32 + 32 + area.Name.Length + line.Name.Length + eq.Name.Length + 4;
var len = enterpriseLen + siteLen + area.Name.Length + line.Name.Length + eq.Name.Length + 4;
if (len > MaxPathLength)
errors.Add(new("PathTooLong",
$"Equipment path exceeds {MaxPathLength} chars (approx {len})",

View File

@@ -18,10 +18,16 @@ public sealed class DraftValidationService(OtOpcUaConfigDbContext db)
.FirstOrDefaultAsync(g => g.GenerationId == draftId, ct)
?? throw new InvalidOperationException($"Draft {draftId} not found");
// Load the cluster row so path-length validation uses actual Enterprise/Site lengths.
var cluster = await db.ServerClusters.AsNoTracking()
.FirstOrDefaultAsync(c => c.ClusterId == draft.ClusterId, ct);
var snapshot = new DraftSnapshot
{
GenerationId = draft.GenerationId,
ClusterId = draft.ClusterId,
Enterprise = cluster?.Enterprise,
Site = cluster?.Site,
Namespaces = await db.Namespaces.AsNoTracking().Where(n => n.GenerationId == draftId).ToListAsync(ct),
DriverInstances = await db.DriverInstances.AsNoTracking().Where(d => d.GenerationId == draftId).ToListAsync(ct),
Devices = await db.Devices.AsNoTracking().Where(d => d.GenerationId == draftId).ToListAsync(ct),