using NATS.Client.Core;
using NATS.E2E.Cluster.Tests.Infrastructure;

namespace NATS.E2E.Cluster.Tests;

// Go reference: server/cluster_test.go — TestClusterRouteReconnect, TestClusterQueueSubs, etc.
[Collection("E2E-ThreeNodeCluster")]
public class ClusterResilienceTests(ThreeNodeClusterFixture fixture) : IClassFixture<ThreeNodeClusterFixture>
{
    /// <summary>
    /// Kill node 2, then verify a pub on node 0 reaches a sub on node 1.
    /// Go reference: server/cluster_test.go TestClusterRouteSingleHopSubjectMatch
    /// </summary>
    [Fact]
    public async Task NodeDies_TrafficReroutesToSurvivors()
    {
        using var cts = new CancellationTokenSource(TimeSpan.FromSeconds(60));

        await fixture.KillNode(2);

        // Surviving cluster (nodes 0 and 1) should have 1 route each
        await fixture.WaitForFullMeshAsync(expectedRoutes: 1);

        await using var publisher = fixture.CreateClient(0);
        await using var subscriber = fixture.CreateClient(1);

        await publisher.ConnectAsync();
        await subscriber.ConnectAsync();

        const string subject = "resilience.node-dies";
        await using var sub = await subscriber.SubscribeCoreAsync<string>(subject, cancellationToken: cts.Token);

        await WaitForPropagationAsync(publisher, subscriber, $"probe.{subject}", cts.Token);

        await publisher.PublishAsync(subject, "hello-after-kill", cancellationToken: cts.Token);
        await publisher.PingAsync(cts.Token);

        var msg = await sub.Msgs.ReadAsync(cts.Token);
        msg.Data.ShouldBe("hello-after-kill");

        // Restore node 2 and wait for full mesh before next test
        await fixture.RestartNode(2);
        await fixture.WaitForFullMeshAsync(expectedRoutes: 2);
    }

    /// <summary>
    /// Kill node 2, restart it, then verify subscriptions on the restarted node receive messages from node 0.
    /// Go reference: server/cluster_test.go TestClusterRouteReconnect
    /// </summary>
    [Fact]
    public async Task NodeRejoins_SubscriptionsPropagateAgain()
    {
        using var cts = new CancellationTokenSource(TimeSpan.FromSeconds(60));

        await fixture.KillNode(2);
        await fixture.RestartNode(2);
        await fixture.WaitForFullMeshAsync(expectedRoutes: 2);

        await using var publisher = fixture.CreateClient(0);
        await using var subscriber = fixture.CreateClient(2);

        await publisher.ConnectAsync();
        await subscriber.ConnectAsync();

        const string subject = "resilience.node-rejoins";
        await using var sub = await subscriber.SubscribeCoreAsync<string>(subject, cancellationToken: cts.Token);

        await WaitForPropagationAsync(publisher, subscriber, $"probe.{subject}", cts.Token);

        await publisher.PublishAsync(subject, "hello-after-rejoin", cancellationToken: cts.Token);
        await publisher.PingAsync(cts.Token);

        var msg = await sub.Msgs.ReadAsync(cts.Token);
        msg.Data.ShouldBe("hello-after-rejoin");
    }

    /// <summary>
    /// Kill node 1 and restart it, then verify pub on node 0 reaches sub on node 1 once full mesh is restored.
    /// Go reference: server/cluster_test.go TestClusterRouteReconnect
    /// </summary>
    [Fact]
    public async Task AllRoutesReconnect_AfterNodeRestart()
    {
        using var cts = new CancellationTokenSource(TimeSpan.FromSeconds(60));

        await fixture.KillNode(1);
        await fixture.RestartNode(1);
        await fixture.WaitForFullMeshAsync(expectedRoutes: 2);

        await using var publisher = fixture.CreateClient(0);
        await using var subscriber = fixture.CreateClient(1);

        await publisher.ConnectAsync();
        await subscriber.ConnectAsync();

        const string subject = "resilience.all-routes-reconnect";
        await using var sub = await subscriber.SubscribeCoreAsync<string>(subject, cancellationToken: cts.Token);

        await WaitForPropagationAsync(publisher, subscriber, $"probe.{subject}", cts.Token);

        await publisher.PublishAsync(subject, "hello-after-restart", cancellationToken: cts.Token);
        await publisher.PingAsync(cts.Token);

        var msg = await sub.Msgs.ReadAsync(cts.Token);
        msg.Data.ShouldBe("hello-after-restart");
    }

    /// <summary>
    /// Two queue subscribers on node 1 in the same group receive all 10 messages even when node 2 is killed.
    /// Go reference: server/cluster_test.go TestClusterQueueSubscriberWithLoss
    /// </summary>
    [Fact]
    [SlopwatchSuppress("SW004", "Task.Delay(Timeout.Infinite) is used purely as a cancellable wait primitive alongside WhenAny — it never actually delays; drainTimeout drives the deadline")]
    public async Task QueueGroup_NodeDies_RemainingMembersDeliver()
    {
        using var cts = new CancellationTokenSource(TimeSpan.FromSeconds(60));

        await using var publisher = fixture.CreateClient(0);
        await using var subConn1 = fixture.CreateClient(1);
        await using var subConn2 = fixture.CreateClient(1);

        await publisher.ConnectAsync();
        await subConn1.ConnectAsync();
        await subConn2.ConnectAsync();

        const string subject = "resilience.queue-group";
        const string queue = "rq";

        await using var sub1 = await subConn1.SubscribeCoreAsync<string>(
            subject, queueGroup: queue, cancellationToken: cts.Token);
        await using var sub2 = await subConn2.SubscribeCoreAsync<string>(
            subject, queueGroup: queue, cancellationToken: cts.Token);

        // Wait for subscriptions to propagate to node 0
        await WaitForPropagationAsync(publisher, subConn1, $"probe.{subject}", cts.Token);

        // Kill node 2 — this tests cluster stability but node 1 subs remain unaffected
        await fixture.KillNode(2);
        await fixture.WaitForFullMeshAsync(expectedRoutes: 1);

        const int messageCount = 10;
        for (var i = 0; i < messageCount; i++)
        {
            await publisher.PublishAsync(subject, $"msg-{i}", cancellationToken: cts.Token);
        }

        await publisher.PingAsync(cts.Token);

        // Drain messages from both subscribers; each message is delivered to exactly one member.
        // Use a TaskCompletionSource as the completion signal so the happy path never throws.
        var received = new List<string>(messageCount);
        var allReceived = new TaskCompletionSource(TaskCreationOptions.RunContinuationsAsynchronously);

        async Task DrainAsync(INatsSub<string> sub)
        {
            await foreach (var msg in sub.Msgs.ReadAllAsync(cts.Token))
            {
                int count;
                lock (received)
                {
                    received.Add(msg.Data!);
                    count = received.Count;
                }

                if (count >= messageCount)
                {
                    allReceived.TrySetResult();
                    return;
                }
            }
        }

        var drain1 = DrainAsync(sub1);
        var drain2 = DrainAsync(sub2);

        using var drainTimeout = new CancellationTokenSource(TimeSpan.FromSeconds(10));
        await allReceived.Task.WaitAsync(drainTimeout.Token);

        received.Count.ShouldBe(messageCount);

        // Restore node 2 and wait for full mesh before next test
        await fixture.RestartNode(2);
        await fixture.WaitForFullMeshAsync(expectedRoutes: 2);
    }

    // ---------------------------------------------------------------------------
    // Helpers
    // ---------------------------------------------------------------------------

    /// <summary>
    /// Probes the route between publisher and subscriber by sending probe messages until
    /// the subscriber sees one, indicating subscriptions have propagated across the cluster.
    /// </summary>
    [SlopwatchSuppress("SW003", "OperationCanceledException from the inner readCts timeout is the expected retry signal; swallowing it is intentional so the polling loop continues to the next tick")]
    private static async Task WaitForPropagationAsync(
        NatsConnection publisher,
        NatsConnection subscriber,
        string probeSubject,
        CancellationToken ct)
    {
        await using var probeSub = await subscriber.SubscribeCoreAsync<string>(probeSubject, cancellationToken: ct);

        // PingAsync on subscriber flushes the SUB to the server before we start probing.
        await subscriber.PingAsync(ct);

        using var timer = new PeriodicTimer(TimeSpan.FromMilliseconds(150));
        while (await timer.WaitForNextTickAsync(ct))
        {
            await publisher.PublishAsync(probeSubject, "probe", cancellationToken: ct);

            // PingAsync is a request/reply round-trip: when the pong returns, any message
            // the server dispatched before the ping is already buffered in probeSub.Msgs.
            await publisher.PingAsync(ct);

            if (probeSub.Msgs.TryRead(out _))
                return;
        }
    }
}