scadalink-design/src/ScadaLink.TemplateEngine/Validation/CSharpDelimiterScanner.cs

namespace ScadaLink.TemplateEngine.Validation;

/// <summary>
/// String/comment-aware scanner for the balanced-delimiter ("does it look like
/// valid C#") checks used by <see cref="ScriptCompiler"/> and
/// <c>SharedScriptService.ValidateSyntax</c>.
///
/// <para>
/// This is <b>not</b> a compiler. It is an interim structural check that walks
/// the source once and tracks <c>{}</c>, <c>[]</c> and <c>()</c> depth while
/// skipping over the C# lexical constructs in which a delimiter is inert:
/// line/block comments, regular string literals (with <c>\</c> escapes),
/// verbatim strings (<c>@"..."</c>, where <c>""</c> escapes a quote and <c>\</c>
/// is literal), interpolated strings (<c>$"..."</c>, <c>$@"..."</c> and
/// <c>@$"..."</c> — the holes <c>{...}</c> are code and <c>{{</c>/<c>}}</c> are
/// escaped braces), raw string literals (<c>"""..."""</c>, including the
/// interpolated forms <c>$"""..."""</c> / <c>$$"""..."""</c>), and char
/// literals (<c>'}'</c>).
/// </para>
///
/// <para>
/// A single recogniser (<c>TrySkipInert</c>) is shared by <see cref="Scan"/>
/// and <see cref="ContainsInCode"/> so the two checks cannot disagree about
/// where a literal or comment starts and ends.
/// </para>
///
/// <para>
/// It is intentionally conservative: when the real Roslyn-based compiler is
/// wired in (see <see cref="ScriptCompiler"/>) this hand-rolled scan should be
/// replaced by <c>CSharpSyntaxTree.ParseText</c> diagnostics. Until then this
/// scanner removes the false positives that a naive character count produced
/// for valid scripts containing a delimiter inside a string or comment.
/// </para>
/// </summary>
internal static class CSharpDelimiterScanner
{
    /// <summary>The kind of delimiter mismatch found, if any.</summary>
    internal enum Mismatch
    {
        None,
        UnexpectedCloseBrace,
        UnexpectedCloseBracket,
        UnexpectedCloseParen,
        UnclosedBrace,
        UnclosedBracket,
        UnclosedParen,
        UnclosedBlockComment,
        UnterminatedString,
        UnterminatedChar,
    }

    /// <summary>
    /// Zero-length blanking buffer handed to the skip helpers by callers that
    /// only need positions. <see cref="Blank"/> clamps to the buffer length, so
    /// an empty buffer turns every blanking request into a no-op.
    /// </summary>
    private static readonly char[] NoBlanking = Array.Empty<char>();

    /// <summary>
    /// Returns true when <paramref name="pattern"/> occurs in a <b>code</b>
    /// region of <paramref name="code"/> — i.e. not wholly inside a string
    /// literal, char literal, or comment. Used by the interim forbidden-API
    /// scan so that the inert text <c>System.IO.</c> in a comment or string
    /// literal is not flagged as a forbidden API call (TemplateEngine-006).
    /// Interpolation holes (<c>$"{...}"</c>) are code and are searched.
    ///
    /// <para>
    /// This removes the false-positive half of the substring scan. It does
    /// <b>not</b> close the bypass half: namespace aliases, <c>using static</c>,
    /// and <c>global::</c>-qualified references still evade a pure text match.
    /// Authoritative forbidden-API enforcement requires Roslyn semantic symbol
    /// analysis and is deferred to the real script compiler / Site Runtime
    /// sandbox; this check is advisory only.
    /// </para>
    /// </summary>
    /// <param name="code">Script source to search; null or empty never matches.</param>
    /// <param name="pattern">Ordinal text to look for; null or empty never matches.</param>
    /// <returns>True when the pattern appears outside every literal and comment.</returns>
    internal static bool ContainsInCode(string code, string pattern)
    {
        if (string.IsNullOrEmpty(code) || string.IsNullOrEmpty(pattern))
            return false;

        // Blank out every string/char-literal/comment span, then do an ordinary
        // substring search over what remains (the code regions).
        var codeOnly = BlankNonCodeSpans(code);
        return codeOnly.Contains(pattern, StringComparison.Ordinal);
    }

    /// <summary>
    /// Replaces the content of every comment, string literal, and char literal
    /// with spaces (newlines preserved), leaving only code regions intact.
    /// Delimiter characters themselves are also blanked so a pattern cannot
    /// straddle a literal boundary. The code inside interpolation holes is left
    /// intact; only the literal text around the holes is blanked.
    /// </summary>
    private static string BlankNonCodeSpans(string code)
    {
        var buffer = code.ToCharArray();
        int i = 0;

        while (i < code.Length)
        {
            // Problems (unterminated literals etc.) are irrelevant here: whatever
            // was consumed is blanked and scanning resumes where the helper stopped.
            if (!TrySkipInert(code, ref i, buffer, out _))
                i++;
        }

        return new string(buffer);
    }

    /// <summary>
    /// Walks <paramref name="code"/> once and reports the first structural
    /// delimiter problem, or <see cref="Mismatch.None"/> when the source is
    /// balanced. Delimiters inside comments, strings, and char literals are
    /// ignored. Interpolation holes are skipped as a unit: their own braces are
    /// matched to find the end of the hole, but parentheses and brackets inside
    /// a hole do not contribute to the counts.
    /// </summary>
    /// <param name="code">Script source; null or empty is reported as balanced.</param>
    /// <returns>The first mismatch encountered, or <see cref="Mismatch.None"/>.</returns>
    internal static Mismatch Scan(string code)
    {
        if (string.IsNullOrEmpty(code))
            return Mismatch.None;

        int brace = 0, bracket = 0, paren = 0;
        int i = 0;
        int n = code.Length;

        while (i < n)
        {
            if (TrySkipInert(code, ref i, NoBlanking, out Mismatch problem))
            {
                if (problem != Mismatch.None) return problem;
                continue;
            }

            switch (code[i])
            {
                case '{': brace++; break;
                case '}':
                    brace--;
                    if (brace < 0) return Mismatch.UnexpectedCloseBrace;
                    break;
                case '[': bracket++; break;
                case ']':
                    bracket--;
                    if (bracket < 0) return Mismatch.UnexpectedCloseBracket;
                    break;
                case '(': paren++; break;
                case ')':
                    paren--;
                    if (paren < 0) return Mismatch.UnexpectedCloseParen;
                    break;
            }

            i++;
        }

        if (brace != 0) return Mismatch.UnclosedBrace;
        if (bracket != 0) return Mismatch.UnclosedBracket;
        if (paren != 0) return Mismatch.UnclosedParen;
        return Mismatch.None;
    }

    /// <summary>
    /// Overwrites <c>buffer[from..to)</c> with spaces, keeping CR/LF so line
    /// structure survives. The range is clamped to the buffer, so passing
    /// <see cref="NoBlanking"/> makes this a no-op.
    /// </summary>
    private static void Blank(char[] buffer, int from, int to)
    {
        int end = Math.Min(to, buffer.Length);
        for (int k = from; k < end; k++)
        {
            if (buffer[k] != '\n' && buffer[k] != '\r')
                buffer[k] = ' ';
        }
    }

    /// <summary>
    /// If a comment, string literal, or char literal starts at
    /// <paramref name="i"/>, advances <paramref name="i"/> past it (or to the
    /// point where scanning gave up when it is unterminated), blanks its inert
    /// text in <paramref name="blank"/>, and returns true. Returns false, with
    /// <paramref name="i"/> unchanged, when <c>code[i]</c> is ordinary code.
    /// </summary>
    /// <param name="problem">
    /// <see cref="Mismatch.None"/> when the construct was well-formed; otherwise
    /// the unterminated-construct kind.
    /// </param>
    private static bool TrySkipInert(string code, ref int i, char[] blank, out Mismatch problem)
    {
        int n = code.Length;
        int start = i;

        if (code[i] == '/' && i + 1 < n)
        {
            char next = code[i + 1];

            // Line comment: runs to (not including) the next line feed.
            if (next == '/')
            {
                i += 2;
                while (i < n && code[i] != '\n') i++;
                Blank(blank, start, i);
                problem = Mismatch.None;
                return true;
            }

            // Block comment: the terminator search starts after "/*" so that
            // "/*/" is not mistaken for a complete comment.
            if (next == '*')
            {
                int close = code.IndexOf("*/", i + 2, StringComparison.Ordinal);
                problem = close < 0 ? Mismatch.UnclosedBlockComment : Mismatch.None;
                i = close < 0 ? n : close + 2;
                Blank(blank, start, i);
                return true;
            }
        }

        return TrySkipLiteral(code, ref i, blank, out problem);
    }

    /// <summary>
    /// String/char-literal half of <see cref="TrySkipInert"/>. Recognises
    /// <c>'x'</c>, <c>"..."</c>, <c>@"..."</c>, <c>$"..."</c>, <c>$@"..."</c>,
    /// <c>@$"..."</c>, <c>"""..."""</c> and <c>$"""..."""</c> /
    /// <c>$$"""..."""</c>. Same return contract as <see cref="TrySkipInert"/>.
    /// </summary>
    private static bool TrySkipLiteral(string code, ref int i, char[] blank, out Mismatch problem)
    {
        problem = Mismatch.None;
        int n = code.Length;
        int start = i;

        if (code[i] == '\'')
        {
            if (!SkipCharLiteral(code, ref i)) problem = Mismatch.UnterminatedChar;
            Blank(blank, start, i);
            return true;
        }

        // Read the optional prefix: a run of '$' and/or a single '@'.
        int p = i;
        int dollars = 0;
        while (p < n && code[p] == '$') { dollars++; p++; }

        bool verbatim = false;
        if (p < n && code[p] == '@')
        {
            // '@' combines with at most one '$', on either side of it.
            if (dollars > 1) return false;
            verbatim = true;
            p++;
            if (dollars == 0 && p < n && code[p] == '$') { dollars = 1; p++; }
        }

        if (p >= n || code[p] != '"') return false;

        // Three quotes open a raw string unless the literal is verbatim, where
        // @""" is an opening quote followed by an escaped quote.
        bool raw = !verbatim && p + 2 < n && code[p + 1] == '"' && code[p + 2] == '"';

        // "$$" prefixes exist only on raw strings. Decline here so the caller
        // steps one character and retries from the last '$'.
        if (!raw && dollars > 1) return false;

        i = p; // now on the opening quote
        bool terminated;
        if (raw)
        {
            terminated = SkipRawString(code, ref i, dollars, start, blank);
        }
        else if (dollars == 1)
        {
            terminated = SkipInterpolatedString(code, ref i, verbatim, start, blank);
        }
        else
        {
            terminated = verbatim ? SkipVerbatimString(code, ref i) : SkipRegularString(code, ref i);
            Blank(blank, start, i);
        }

        if (!terminated) problem = Mismatch.UnterminatedString;
        return true;
    }

    /// <summary>
    /// Advances <paramref name="i"/> past a regular <c>"..."</c> string literal.
    /// On entry <c>code[i] == '"'</c>. Returns false if the string is unterminated
    /// (end of input or a line feed is reached before the closing quote).
    /// </summary>
    private static bool SkipRegularString(string code, ref int i)
    {
        int n = code.Length;
        i++; // past opening quote
        while (i < n)
        {
            char c = code[i];
            if (c == '\\') { i = Math.Min(i + 2, n); continue; } // escape — skip next char
            if (c == '\n') return false;                         // unterminated (no multi-line)
            if (c == '"') { i++; return true; }
            i++;
        }
        return false;
    }

    /// <summary>
    /// Advances past a verbatim <c>@"..."</c> string. On entry <c>code[i] == '"'</c>.
    /// Inside, <c>\</c> is literal and <c>""</c> is an escaped quote. Returns false
    /// if end of input is reached before the closing quote.
    /// </summary>
    private static bool SkipVerbatimString(string code, ref int i)
    {
        int n = code.Length;
        i++; // past opening quote
        while (i < n)
        {
            if (code[i] == '"')
            {
                if (i + 1 < n && code[i + 1] == '"') { i += 2; continue; } // escaped quote
                i++;
                return true;
            }
            i++;
        }
        return false;
    }

    /// <summary>
    /// Advances past a non-raw interpolated string. <paramref name="verbatim"/>
    /// selects the <c>$@"..."</c> escaping rules. Interpolation holes
    /// <c>{...}</c> are skipped as code (and left unblanked); <c>{{</c>/<c>}}</c>
    /// are escaped braces. On entry <c>code[i] == '"'</c> and
    /// <paramref name="prefixStart"/> is the index of the first prefix character.
    /// Returns false if the string, or a literal nested in one of its holes, is
    /// unterminated.
    /// </summary>
    private static bool SkipInterpolatedString(string code, ref int i, bool verbatim, int prefixStart, char[] blank)
    {
        int n = code.Length;
        int textStart = prefixStart; // start of the literal-text run awaiting blanking
        i++; // past opening quote
        while (i < n)
        {
            char c = code[i];

            if (!verbatim && c == '\\') { i = Math.Min(i + 2, n); continue; }

            if (c == '"')
            {
                if (verbatim && i + 1 < n && code[i + 1] == '"') { i += 2; continue; }
                i++;
                Blank(blank, textStart, i);
                return true;
            }

            if (c == '{')
            {
                if (i + 1 < n && code[i + 1] == '{') { i += 2; continue; } // escaped brace
                Blank(blank, textStart, i);
                i++; // past the hole's opening brace
                if (!SkipHoleBody(code, ref i, blank)) return false;
                textStart = i;
                continue;
            }

            // Everything else, including "}}" and a stray '}', is literal text.
            i++;
        }

        Blank(blank, textStart, n);
        return false;
    }

    /// <summary>
    /// Advances past the code of an interpolation hole. On entry
    /// <paramref name="i"/> is just past the hole's opening brace; on success it
    /// is just past the <c>}</c> that balances it. Nested braces (object
    /// initializers, lambdas) are tracked, and nested string and char literals
    /// are skipped so that a brace or quote inside them does not end the hole.
    /// Comments are deliberately <b>not</b> recognised here: a format specifier
    /// containing <c>//</c> would otherwise swallow the rest of the line.
    /// Returns false if the hole, or a string nested in it, never closes.
    /// </summary>
    private static bool SkipHoleBody(string code, ref int i, char[] blank)
    {
        int n = code.Length;
        int depth = 1;
        while (i < n)
        {
            char h = code[i];

            if (h == '{') { depth++; i++; continue; }
            if (h == '}')
            {
                depth--;
                i++;
                if (depth == 0) return true;
                continue;
            }

            if (h == '\'')
            {
                // Be lenient: an unpaired apostrophe (e.g. in a format
                // specifier) is stepped over rather than reported.
                int probe = i;
                if (SkipCharLiteral(code, ref probe))
                {
                    Blank(blank, i, probe);
                    i = probe;
                }
                else
                {
                    i++;
                }
                continue;
            }

            if (TrySkipLiteral(code, ref i, blank, out Mismatch problem))
            {
                if (problem != Mismatch.None) return false;
                continue;
            }

            i++;
        }
        return false;
    }

    /// <summary>
    /// Advances past a raw string literal <c>"""..."""</c> (C# 11). On entry
    /// <c>code[i]</c> is the first of three or more opening quotes and
    /// <paramref name="prefixStart"/> is the index of the first prefix character
    /// (equal to <paramref name="i"/> when there is no <c>$</c> prefix).
    /// When <paramref name="dollars"/> is greater than zero the literal is
    /// interpolated: a run of at least that many <c>{</c> opens a hole, whose
    /// code is skipped and left unblanked. Returns false if no closing quote run
    /// of sufficient length is found.
    /// </summary>
    private static bool SkipRawString(string code, ref int i, int dollars, int prefixStart, char[] blank)
    {
        int n = code.Length;
        int textStart = prefixStart;

        int openCount = 0;
        while (i < n && code[i] == '"') { openCount++; i++; }

        while (i < n)
        {
            char c = code[i];

            if (c == '"')
            {
                int closeCount = 0;
                while (i < n && code[i] == '"') { closeCount++; i++; }
                if (closeCount >= openCount)
                {
                    Blank(blank, textStart, i);
                    return true;
                }
                // Fewer quotes than the opener — literal content; keep scanning.
                continue;
            }

            if (dollars > 0 && c == '{')
            {
                int run = 0;
                while (i < n && code[i] == '{') { run++; i++; }
                if (run < dollars) continue; // too short to open a hole: literal braces

                // The last `dollars` braces of the run open the hole.
                Blank(blank, textStart, i - dollars);
                if (!SkipHoleBody(code, ref i, blank)) return false;

                // SkipHoleBody consumed one closing brace; take the rest of the closer.
                for (int extra = dollars - 1; extra > 0 && i < n && code[i] == '}'; extra--) i++;
                textStart = i;
                continue;
            }

            i++;
        }

        Blank(blank, textStart, n);
        return false;
    }

    /// <summary>
    /// Advances past a <c>'x'</c> char literal. On entry <c>code[i] == '\''</c>.
    /// Returns false if end of input or a line feed is reached before the
    /// closing apostrophe.
    /// </summary>
    private static bool SkipCharLiteral(string code, ref int i)
    {
        int n = code.Length;
        i++; // past opening quote
        while (i < n)
        {
            char c = code[i];
            if (c == '\\') { i = Math.Min(i + 2, n); continue; }
            if (c == '\n') return false;
            if (c == '\'') { i++; return true; }
            i++;
        }
        return false;
    }
}