# chat/chat/web/render.py

"""Transcript display formatting (Task 33, Requirements §16.3).

Bot and user prose is rendered with **lightweight markdown**:

* ``*action*`` → ``<em class="action">…</em>`` — italic narration.
* ``**bold**`` → ``<strong>…</strong>`` — emphasis.
* ``((ooc))`` → ``<span class="ooc">((ooc))</span>`` — author-to-system
  asides; visible to the reader, dimmed/italic in CSS, and stripped from
  the prompt sent to the bot (see :func:`chat.web.turns._strip_ooc_for_prompt`).
* ``> line`` → ``<blockquote>line</blockquote>``.
* Double newline → paragraph break.
* Everything else is HTML-escaped and wrapped in ``<p>…</p>``.

No headings, code blocks, links, images, or tables — out of scope per
Requirements §16.3. The renderer is the single source of truth used by
both the chat-detail GET (initial timeline render, via Jinja filter) and
the per-turn SSE fragments emitted from :mod:`chat.web.turns`.

Order of operations matters:

1. ``html.escape`` the whole input first — every replacement below assumes
   user-supplied ``<``/``>``/``&`` are already neutralised, so the wrapper
   tags we add can never collide with an attacker-controlled tag.
2. OOC wrap before action/bold so its inner ``*`` are not interpreted.
3. Bold (``**``) before action (``*``) — the bold pattern is stricter and
   would otherwise be partially consumed by the action regex.
4. Blockquote pass over already-escaped lines (so we match ``&gt;``).
5. Paragraph split on double newline.
"""

from __future__ import annotations

import html
import re

# ``((…))`` — an out-of-character aside. The negated class ``[^)]`` already
# matches newlines, so a multi-line aside is wrapped as a single span; it
# also keeps the match from running across a closing-paren boundary. (No
# ``re.DOTALL`` needed: the pattern contains no ``.``.)
_OOC_PATTERN = re.compile(r"\(\([^)]*?\)\)")

# ``**bold**`` — strict: no embedded asterisks or newlines. Must run
# *before* the single-asterisk action pattern, otherwise ``**x**`` would
# be partly consumed by ``*…*``.
_BOLD_PATTERN = re.compile(r"\*\*([^*\n]+)\*\*")

# ``*action*`` — single-asterisk italics; same restriction as bold.
_ACTION_PATTERN = re.compile(r"\*([^*\n]+)\*")

# ``> line`` at start of a line — we match the *escaped* form ``&gt;``
# because this pass runs after ``html.escape``. The optional separator is
# one whitespace character *other than* a newline: allowing ``\n`` there
# would let a bare ``>`` line swallow the line below it.
_BLOCKQUOTE_PATTERN = re.compile(r"^&gt;[^\S\n]?(.+)$", re.MULTILINE)


def _wrap_ooc(match: "re.Match[str]") -> str:
    """Wrap one ``((…))`` match in its span, neutralising inner asterisks.

    ``*`` is rewritten as the character reference ``&#42;`` so the later
    bold/action passes cannot match inside the aside or pair an asterisk
    inside it with one outside (which would emit mis-nested tags). The
    browser still displays a literal ``*``.
    """
    aside = match.group(0).replace("*", "&#42;")
    return f'<span class="ooc">{aside}</span>'


def render_prose(text: str) -> str:
    """Render lightweight-markdown prose to safe HTML.

    The input is HTML-escaped first, then decorated in a fixed order:
    ``((ooc))`` spans, ``**bold**``, ``*action*``, ``> blockquote`` lines,
    and finally a split into ``<p>`` paragraphs on blank lines.

    Returns an empty string for empty/whitespace-only input so the caller
    can append the result without producing stray ``<p></p>`` tags.
    """
    if not text or not text.strip():
        return ""

    # Normalise CRLF / lone CR so paragraph splitting on ``\n\n`` works for
    # input pasted from Windows clients.
    text = text.replace("\r\n", "\n").replace("\r", "\n")

    # Escape before adding any markup: every tag present afterwards is one
    # this function wrote itself.
    escaped = html.escape(text)

    # OOC first; its asterisks are encoded so the next two passes skip them.
    escaped = _OOC_PATTERN.sub(_wrap_ooc, escaped)

    # Bold strictly before action — ``**x**`` must not be eaten by ``*…*``.
    escaped = _BOLD_PATTERN.sub(r"<strong>\1</strong>", escaped)
    escaped = _ACTION_PATTERN.sub(r'<em class="action">\1</em>', escaped)

    # Blockquote on already-escaped ``&gt;`` markers.
    escaped = _BLOCKQUOTE_PATTERN.sub(r"<blockquote>\1</blockquote>", escaped)

    # Paragraph splitting — empty fragments are dropped so a trailing
    # ``\n\n`` doesn't yield an empty ``<p></p>`` block.
    fragments = (chunk.strip() for chunk in escaped.split("\n\n"))
    return "".join(f"<p>{chunk}</p>" for chunk in fragments if chunk)


def render_turn_html(speaker: str, text: str, role: str = "bot") -> str:
    """Build the ``<div class="turn …">…</div>`` markup for one turn.

    The same markup serves the per-turn SSE fragments published from
    :mod:`chat.web.turns` and, indirectly, the chat-detail Jinja template
    (which reaches the prose renderer through the ``render_prose`` filter).

    ``role`` becomes the ``turn-<role>`` CSS class (``turn-you`` vs
    ``turn-bot``). Both ``speaker`` and ``role`` are HTML-escaped as a
    defensive measure even though they currently originate from trusted
    server-side state; ``text`` goes through :func:`render_prose`.
    """
    label = html.escape(speaker)
    css_role = html.escape(role)
    prose = render_prose(text)

    # Opening tag, speaker label, rendered body, closing tag — in that order.
    pieces = [
        f'<div class="turn turn-{css_role}">',
        f"<strong>{label}</strong>",
        prose,
        "</div>",
    ]
    return "".join(pieces)