107 lines
4.3 KiB
Python
107 lines
4.3 KiB
Python
"""Transcript display formatting (Task 33, Requirements §16.3).
|
|
|
|
Bot and user prose is rendered with **lightweight markdown**:
|
|
|
|
* ``*action*`` → ``<em class="action">…</em>`` — italic narration.
|
|
* ``**bold**`` → ``<strong>…</strong>`` — emphasis.
|
|
* ``((ooc))`` → ``<span class="ooc">((ooc))</span>`` — author-to-system
|
|
asides; visible to the reader, dimmed/italic in CSS, and stripped from
|
|
the prompt sent to the bot (see :func:`chat.web.turns._strip_ooc_for_prompt`).
|
|
* ``> line`` → ``<blockquote>line</blockquote>``.
|
|
* Double newline → paragraph break.
|
|
* Everything else is HTML-escaped and wrapped in ``<p>…</p>``.
|
|
|
|
No headings, code blocks, links, images, or tables — out of scope per
|
|
Requirements §16.3. The renderer is the single source of truth used by
|
|
both the chat-detail GET (initial timeline render, via Jinja filter) and
|
|
the per-turn SSE fragments emitted from :mod:`chat.web.turns`.
|
|
|
|
Order of operations matters:
|
|
|
|
1. ``html.escape`` the whole input first — every replacement below assumes
|
|
user-supplied ``<``/``>``/``&`` are already neutralised, so the wrapper
|
|
tags we add can never collide with an attacker-controlled tag.
|
|
2. OOC wrap before action/bold so its inner ``*`` are not interpreted.
|
|
3. Bold (``**``) before action (``*``) — the bold pattern is stricter and
|
|
would otherwise be partially consumed by the action regex.
|
|
4. Blockquote pass over already-escaped lines (so we match ``&gt;``, the
escaped form of a leading ``>``).
|
|
5. Paragraph split on double newline.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import html
|
|
import re
|
|
|
|
# ``((…))`` — an out-of-character aside.  The negated class ``[^)]`` matches
# newlines by itself, so a multi-line aside is a single match, and it stops
# at the first ``)`` so the match cannot run past its own closing ``))``.
# (``re.DOTALL`` only affects ``.`` and is therefore a no-op here; it is kept
# so the compiled pattern stays exactly as before.)
_OOC_PATTERN = re.compile(r"\(\([^)]*?\)\)", re.DOTALL)

# Stand-in for a stashed OOC aside while the other passes run.  After
# ``html.escape`` the text contains no raw ``<`` or ``>``, and none of the
# tags this module emits has an all-digit name, so ``<N>`` can only ever be
# one of our own slots — user input cannot forge or collide with it.
_OOC_SLOT_PATTERN = re.compile(r"<([0-9]+)>")

# ``**bold**`` — strict: no embedded asterisks or newlines.  Must run
# *before* the single-asterisk action pattern, otherwise ``**x**`` would
# be partly consumed by ``*…*``.
_BOLD_PATTERN = re.compile(r"\*\*([^*\n]+)\*\*")

# ``*action*`` — single-asterisk italics; same restriction as bold.
_ACTION_PATTERN = re.compile(r"\*([^*\n]+)\*")

# ``> line`` at the start of a line.  This pass runs after ``html.escape``,
# so the marker to look for is the *escaped* form ``&gt;`` — a raw ``>`` no
# longer exists in the text at that point.  The optional separator is
# ``[^\S\n]`` (any whitespace except a newline) so that a bare ``>`` line
# cannot swallow the following line into the quote.
_BLOCKQUOTE_PATTERN = re.compile(r"^&gt;[^\S\n]?(.+)$", re.MULTILINE)


def render_prose(text: str) -> str:
    """Render prose to safe HTML.

    The input is HTML-escaped first, then the lightweight markup is applied
    in this order: OOC asides, ``**bold**``, ``*action*``, ``> quote`` lines,
    and finally paragraph splitting on blank lines.

    OOC asides are rendered verbatim inside ``<span class="ooc">…</span>``:
    asterisks, quote markers and blank lines *inside* an aside are not
    interpreted, so the emitted tags always nest correctly.

    Args:
        text: Raw prose as typed by the user or produced by the bot.

    Returns:
        An HTML fragment made of ``<p>…</p>`` blocks, or an empty string for
        empty/whitespace-only input so the caller can append the result
        without producing stray ``<p></p>`` tags.
    """
    if not text or not text.strip():
        return ""

    # Normalise CRLF / lone CR so paragraph splitting on ``\n\n`` works for
    # input pasted from Windows clients.
    text = text.replace("\r\n", "\n").replace("\r", "\n")

    # Escape before any markup is added: every tag present afterwards is one
    # this function wrote itself.
    escaped = html.escape(text)

    # Lift each OOC aside out of the text and leave a ``<N>`` slot behind.
    # The slot contains no ``*`` and no newline, so the passes below treat it
    # as an opaque token: they can wrap *around* an aside but can never match
    # markup inside it or cut it in half.
    asides: list[str] = []

    def _stash(match: re.Match[str]) -> str:
        asides.append(match.group(0))
        return f"<{len(asides) - 1}>"

    body = _OOC_PATTERN.sub(_stash, escaped)

    # Bold strictly before action — ``**x**`` must not be eaten by ``*…*``.
    body = _BOLD_PATTERN.sub(r"<strong>\1</strong>", body)
    body = _ACTION_PATTERN.sub(r'<em class="action">\1</em>', body)

    # Blockquote on the escaped ``&gt;`` line markers.
    body = _BLOCKQUOTE_PATTERN.sub(r"<blockquote>\1</blockquote>", body)

    # Paragraph splitting — drop empty fragments so a trailing ``\n\n``
    # doesn't yield an empty ``<p></p>`` block.
    paragraphs = [p.strip() for p in body.split("\n\n") if p.strip()]
    rendered = "".join(f"<p>{p}</p>" for p in paragraphs)

    # Put the asides back, wrapped, now that no later pass can disturb them.
    return _OOC_SLOT_PATTERN.sub(
        lambda m: f'<span class="ooc">{asides[int(m.group(1))]}</span>',
        rendered,
    )
|
|
|
|
|
|
def render_turn_html(speaker: str, text: str, role: str = "bot") -> str:
    """Build the ``<div class="turn …">…</div>`` markup for one transcript turn.

    The module notes say this is what the SSE fragment publisher in
    :mod:`chat.web.turns` emits for live per-turn updates, while the
    chat-detail template reaches the same prose rendering through the
    ``render_prose`` filter.

    Args:
        speaker: Label shown in the leading ``<strong>`` element.
        text: Turn body; passed through :func:`render_prose`.
        role: Suffix of the ``turn-<role>`` CSS class (``bot`` by default).

    Returns:
        The turn as a single HTML string.  ``speaker`` and ``role`` are
        HTML-escaped before interpolation as a defensive measure.
    """
    # Same evaluation order as before: speaker, then role, then the body.
    safe_speaker = html.escape(speaker)
    safe_role = html.escape(role)
    prose = render_prose(text)

    pieces = [
        f'<div class="turn turn-{safe_role}">',
        f"<strong>{safe_speaker}</strong>",
        prose,
        "</div>",
    ]
    return "".join(pieces)
|