# Copyright (c) 2018 Tildes contributors <code@tildes.net>
# SPDX-License-Identifier: AGPL-3.0-or-later

"""Functions/constants related to markdown handling."""

import re
from collections.abc import Callable, Iterator
from functools import partial
from random import randint
from typing import Any, Optional, Union

import bleach
from bs4 import BeautifulSoup, Tag
from html5lib.filters.base import Filter
from html5lib.treewalkers.base import NonRecursiveTreeWalker
from pygments import highlight
from pygments.formatters import HtmlFormatter
from pygments.lexers import get_lexer_by_name, PhpLexer
from pygments.util import ClassNotFound

from tildes.enums import HTMLSanitizationContext
from tildes.metrics import histogram_timer
from tildes.schemas.group import is_valid_group_path
from tildes.schemas.user import is_valid_username

from .cmark import (
    CMARK_EXTENSIONS,
    cmark_find_syntax_extension,
    cmark_node_free,
    CMARK_OPTS,
    cmark_parser_attach_syntax_extension,
    cmark_parser_feed,
    cmark_parser_finish,
    cmark_parser_free,
    cmark_parser_get_syntax_extensions,
    cmark_parser_new,
    cmark_render_html,
)


def allow_syntax_highlighting_classes(tag: str, name: str, value: str) -> bool:
    """Allow all CSS classes from Pygments.

    Pygments will add a <code class="highlight">, as well as many <span> tags around
    elements of the code, with classes all starting with "syntax-".
    """
    if tag not in ("span", "code"):
        raise ValueError("This method only sanitizes <span> and <code> tags")

    if tag == "span":
        # allow class attribute with a single class starting with "syntax-"
        return name == "class" and " " not in value and value.startswith("syntax-")

    if tag == "code":
        # allow class attribute with a single class of exactly "highlight"
        return name == "class" and value == "highlight"

    return False
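
# A sketch of how bleach uses the callable above: for each attribute on an allowed
# tag, it calls the function and keeps the attribute only when it returns True.
# Illustrative results (not executed here):
#   allow_syntax_highlighting_classes("span", "class", "syntax-k")      # True
#   allow_syntax_highlighting_classes("span", "class", "syntax-k foo")  # False
#   allow_syntax_highlighting_classes("code", "class", "highlight")     # True
#   allow_syntax_highlighting_classes("code", "title", "highlight")     # False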


ALLOWED_HTML_TAGS = (
    "a",
    "b",
    "blockquote",
    "br",
    "code",
    "del",
    "details",
    "em",
    "h1",
    "h2",
    "h3",
    "h4",
    "h5",
    "h6",
    "hr",
    "i",
    "ins",
    "li",
    "ol",
    "p",
    "pre",
    "small",
    "strong",
    "sub",
    "summary",
    "sup",
    "span",
    "table",
    "tbody",
    "td",
    "th",
    "thead",
    "tr",
    "ul",
)
ALLOWED_LINK_PROTOCOLS = ("gemini", "http", "https", "mailto")

ALLOWED_HTML_ATTRIBUTES_DEFAULT: dict[str, Union[list[str], Callable]] = {
    "a": ["href", "title"],
    "details": ["open"],
    "ol": ["start"],
    "td": ["align"],
    "th": ["align"],
    "code": allow_syntax_highlighting_classes,
    "span": allow_syntax_highlighting_classes,
}

# per-context overrides for allowed attributes
ALLOWED_HTML_ATTRIBUTES_OVERRIDES = {
    HTMLSanitizationContext.USER_BIO: {"a": ["href", "title", "rel"]}
}

# Regex that finds ordered list markdown that was probably accidental - ones being
# initiated by anything except "1." at the start of a post
BAD_ORDERED_LIST_REGEX = re.compile(
    r"((?:\A)"  # The start of the entire text
    r"(?!1\.)\d+)"  # A number that isn't "1"
    r"\.\s"  # Followed by a period and a space
)
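
# For example, this matches the "1975. " in a post that begins with
# "1975. It was a long time ago.", but not "1. " (assumed to be an intentional
# list) and not numbers appearing later in the text.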

STRIP_IMAGE_ELEMENTS_REGEX = re.compile(r'<img src="([^"<>]*?)" alt="([^"<>]*?)" />')

SUBSEQUENT_BLOCKQUOTES_REGEX = re.compile("^>([^\n]*?)\n\n(?=>)", flags=re.MULTILINE)


@histogram_timer("markdown_processing")
def convert_markdown_to_safe_html(
    markdown: str, context: Optional[HTMLSanitizationContext] = None
) -> str:
    """Convert markdown to sanitized HTML."""
    # apply custom pre-processing to markdown
    markdown, replacements = preprocess_markdown(markdown)

    markdown_bytes = markdown.encode("utf8")

    parser = cmark_parser_new(CMARK_OPTS)
    for name in CMARK_EXTENSIONS:
        ext = cmark_find_syntax_extension(name)
        cmark_parser_attach_syntax_extension(parser, ext)
    exts = cmark_parser_get_syntax_extensions(parser)

    cmark_parser_feed(parser, markdown_bytes, len(markdown_bytes))
    doc = cmark_parser_finish(parser)

    html_bytes = cmark_render_html(doc, CMARK_OPTS, exts)

    cmark_parser_free(parser)
    cmark_node_free(doc)

    html = html_bytes.decode("utf8")

    # apply custom post-processing to HTML
    html = postprocess_markdown_html(html)

    # add linkification and sanitize the HTML
    html = linkify_and_sanitize_html(html, context)

    # finally restore any escaped substrings before returning HTML
    return restore_replacements(html, replacements)
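
# Illustrative end-to-end result (output shape assumed, not taken from a test):
#   convert_markdown_to_safe_html("Hello ~tildes.official")
# would return something like:
#   '<p>Hello <a class="link-group" href="/~tildes.official">~tildes.official</a></p>\n'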


def preprocess_markdown(markdown: str) -> tuple[str, dict[str, str]]:
    """Pre-process markdown before passing it to CommonMark."""
    markdown = escape_accidental_ordered_lists(markdown)

    markdown = merge_subsequent_blockquotes(markdown)

    # fix the "shrug" emoji ¯\_(ツ)_/¯ to prevent markdown mangling it
    markdown = markdown.replace(r"¯\_(ツ)_/¯", r"¯\\\_(ツ)\_/¯")

    # temporary replacements to restore after markdown processing
    replacements = {}
    if r"\@" in markdown:
        # cmark rendering removes `\` before any punctuation,
        # so `\@` becomes `@` and unexpectedly linkifies during the
        # later linkify step.
        # prevent that by using a replacement during cmark rendering.
        replacements[r"\@"] = random_replacement_string(markdown)
        markdown = markdown.replace(r"\@", replacements[r"\@"])

    return (markdown, replacements)


def random_replacement_string(markdown: str) -> str:
    """Pick a random replacement string not present in input markdown."""
    some_int = randint(1000000, 99999999999)
    some_int_str = str(some_int)
    if some_int_str in markdown:
        return random_replacement_string(markdown)
    else:
        return some_int_str


def merge_subsequent_blockquotes(markdown: str) -> str:
    """Merge subsequent (separated) blockquotes into a single one.

    By default, if someone quotes more than one paragraph without also adding the >
    symbol on the blank lines between them, they will be interpreted as separate
    blockquotes. This almost never seems to be intended, so this merges them. If
    separate quotes are wanted, that can still be achieved by leaving at least two
    blank lines between the quoted paragraphs (or various other methods).
    """
    return SUBSEQUENT_BLOCKQUOTES_REGEX.sub(">\\1\n>\n", markdown)
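
# For example (illustrative):
#   ">first\n\n>second"  becomes  ">first\n>\n>second"
# which CommonMark then renders as a single blockquote containing both paragraphs.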


def escape_accidental_ordered_lists(markdown: str) -> str:
    """Escape markdown that's probably an accidental ordered list.

    It's a common markdown mistake to accidentally start a numbered list, by beginning
    a post with a number followed by a period. For example, someone might try to write
    "1975. It was a long time ago.", and the result will be a comment that says "1. It
    was a long time ago." since that gets parsed into a numbered list.

    This fixes that quirk of markdown by escaping anything that would start a numbered
    list at the beginning of a post, except for "1. ".
    """
    return BAD_ORDERED_LIST_REGEX.sub(r"\1\\. ", markdown)
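
# Illustrative before/after:
#   "1975. It was a long time ago."  ->  "1975\. It was a long time ago."
# The escaped period stops the parser from treating the line as an ordered list.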


def postprocess_markdown_html(html: str) -> str:
    """Apply post-processing to HTML generated by markdown parser."""
    # cmark (and cmark-gfm) replaces double-quote characters with the &quot; entity.
    # This is almost always unnecessary, and causes issues with some of the HTML
    # processing, since (for example) BeautifulSoup will convert them back, which makes
    # the string-replacement in apply_syntax_highlighting() fail if a code block
    # contains any double-quote characters.
    #
    # We'll just do a full replacement here - this has a possibility of being dangerous,
    # but it should be extremely unlikely and the sanitization function should make sure
    # that nothing malicious can happen regardless.
    html = html.replace("&quot;", '"')

    # apply syntax highlighting to code blocks
    html = apply_syntax_highlighting(html)

    # replace <img> elements that were generated by `![alt text](/url)` Markdown syntax
    html = strip_image_elements(html)

    return html


def restore_replacements(html: str, preprocessed_replacements: dict[str, str]) -> str:
    """Restore replacement strings to sanitized and linkified HTML."""
    if r"\@" in preprocessed_replacements:
        html = html.replace(preprocessed_replacements[r"\@"], "@")
    return html


class CodeHtmlFormatter(HtmlFormatter):
    """Custom Pygments HtmlFormatter to use a <code> tag.

    The default HtmlFormatter in Pygments outputs the code inside a
    <div class="highlight"><pre>...</pre></div> structure. This changes that to
    <code class="highlight">...</code> instead (assumes a <pre> is already present).
    """

    def wrap(self, source: Any, outfile: Any) -> Iterator[tuple[int, str]]:
        """Wrap the highlighted tokens with the <code> tag."""
        # pylint: disable=unused-argument
        yield (0, '<code class="highlight">')
        yield from source
        yield (0, "</code>")


def apply_syntax_highlighting(html: str) -> str:
    """Find all code blocks with an info string (language class) and highlight them."""
    soup = BeautifulSoup(html, features="html5lib")

    # Get all code blocks, and for every code block that has an info string
    code_blocks = soup.find_all("code", class_=re.compile("^language-"))
    for code_block in code_blocks:
        if not isinstance(code_block, Tag):
            continue

        language = code_block["class"][0].replace("language-", "")

        try:
            lexer = get_lexer_by_name(language)
            # If the target language is PHP, override the default lexer construction
            # and set startinline to True, so even code that is not enclosed
            # inside <?php ... ?> will get highlighted.
            if isinstance(lexer, PhpLexer):
                lexer = PhpLexer(startinline=True)
        except ClassNotFound:
            continue

        highlighted = highlight(
            code_block.text, lexer, CodeHtmlFormatter(classprefix="syntax-")
        )
        html = html.replace(str(code_block), highlighted, 1)

    return html
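
# Illustrative transformation (exact token classes depend on the Pygments lexer):
#   <pre><code class="language-python">print("hi")</code></pre>
# becomes, roughly:
#   <pre><code class="highlight"><span class="syntax-nb">print</span>...</code></pre>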


def strip_image_elements(html: str) -> str:
    """Replace all <img> elements generated from Markdown with <a>.

    The Markdown syntax `![alt text](/url)` creates an image tag. Except for the
    leading `!`, the syntax is identical to that of links. We can pretend we never
    parsed it as a Markdown image by replacing all image tags with links.

    <img src="/url" alt="alt text" />
    ==>
    !<a href="/url">alt text</a>
    """
    return STRIP_IMAGE_ELEMENTS_REGEX.sub(r'!<a href="\1">\2</a>', html)


class LinkifyFilter(Filter):
    """html5lib Filter to convert custom text patterns to links.

    This replaces references to group paths and usernames with links to the relevant
    pages.

    This implementation is based heavily on the linkify implementation from the Bleach
    library.
    """

    # Regex that finds probable references to groups. This isn't "perfect", just a first
    # pass to find likely candidates. The validity of the group path is checked more
    # carefully later.
    # Note: currently specifically excludes paths immediately followed by a tilde, but
    # this may be possible to remove once strikethrough is implemented (since that's
    # probably what they were trying to do)
    GROUP_REFERENCE_REGEX = re.compile(r"(?<!\w)~([\w.]+)\b(?!~)")
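
    # For example (illustrative): this matches "~music" in "posted in ~music", and
    # also the "~10" in "costs ~10 dollars" (which _tokenize_group_match rejects
    # later), but not "~~struck~~", thanks to the trailing-tilde lookahead.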

    # Regex that finds probable references to users. As above, this isn't "perfect"
    # either but works as an initial pass with the validity of the username checked more
    # carefully later.
    USERNAME_REFERENCE_REGEX = re.compile(r"(?<![\w\\])(?:/?u/|@)([\w-]+)\b")
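
    # For example (illustrative): this matches "@SomeUser", "u/SomeUser", and
    # "/u/SomeUser", but not the "@" in "someone@example.com", since the lookbehind
    # rejects a preceding word character.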

    # Regex that finds probable references to subreddits. Matches with or without the
    # preceding slash (e.g. either of "r/emacs" or "/r/emacs").
    SUBREDDIT_REFERENCE_REGEX = re.compile(r"(?<!\w)/?r/(\w+)\b")

    def __init__(
        self, source: NonRecursiveTreeWalker, skip_tags: Optional[list[str]] = None
    ):
        """Initialize a linkification filter to apply to HTML.

        The skip_tags argument can be a list of tag names, and the contents of any of
        those tags will be excluded from linkification.
        """
        super().__init__(source)
        self.skip_tags = skip_tags or []

        # always skip the contents of <a> tags in addition to any others
        self.skip_tags.append("a")

    def __iter__(self) -> Iterator[dict]:
        """Iterate over the tree, modifying it as necessary before yielding."""
        inside_skipped_tags = []

        for token in super().__iter__():
            if (
                token["type"] in ("StartTag", "EmptyTag")
                and token["name"] in self.skip_tags
            ):
                # if this is the start of a tag we want to skip, add it to the list of
                # skipped tags that we're currently inside
                inside_skipped_tags.append(token["name"])
            elif inside_skipped_tags:
                # if we're currently inside any skipped tags, the only thing we want to
                # do is look for all the end tags we need to be able to finish skipping
                if token["type"] == "EndTag":
                    try:
                        inside_skipped_tags.remove(token["name"])
                    except ValueError:
                        pass
            elif token["type"] == "Characters":
                # this is only reachable if inside_skipped_tags is empty, so this is a
                # text token not inside a skipped tag - do the actual linkification
                # replacements

                # Note: doing the replacements "iteratively" like this only works
                # because they are "disjoint" and we know they're not competing to
                # replace the same text. If more replacements are added in the future
                # that might conflict with each other, this will need to be reworked
                # somehow.
                replaced_tokens = self._linkify_tokens(
                    [token],
                    filter_regex=self.GROUP_REFERENCE_REGEX,
                    linkify_function=self._tokenize_group_match,
                )

                replaced_tokens = self._linkify_tokens(
                    replaced_tokens,
                    filter_regex=self.USERNAME_REFERENCE_REGEX,
                    linkify_function=self._tokenize_username_match,
                )

                replaced_tokens = self._linkify_tokens(
                    replaced_tokens,
                    filter_regex=self.SUBREDDIT_REFERENCE_REGEX,
                    linkify_function=self._tokenize_subreddit_match,
                )

                # yield all the tokens returned from the replacement process (will be
                # just the original token if nothing was replaced)
                for new_token in replaced_tokens:
                    yield new_token

                # we either yielded new tokens or the original one already, so we don't
                # want to fall through and yield the original again
                continue

            yield token

    @staticmethod
    def _linkify_tokens(
        tokens: list[dict], filter_regex: re.Pattern, linkify_function: Callable
    ) -> list[dict]:
        """Check tokens for text that matches a regex and linkify it.

        The `filter_regex` argument should be a compiled pattern that will be applied to
        the text in all of the supplied tokens. If any matches are found, they will each
        be used to call `linkify_function`, which will validate the match and convert it
        back into tokens (representing an <a> tag if it is valid for linkifying, or just
        text if not).
        """
        new_tokens = []

        for token in tokens:
            # we don't want to touch any tokens other than character ones
            if token["type"] != "Characters":
                new_tokens.append(token)
                continue

            original_text = token["data"]
            current_index = 0

            for match in filter_regex.finditer(original_text):
                # if there were some characters between the previous match and this one,
                # add a token containing those first
                if match.start() > current_index:
                    new_tokens.append(
                        {
                            "type": "Characters",
                            "data": original_text[current_index : match.start()],
                        }
                    )

                # call the linkify function to convert this match into tokens
                linkified_tokens = linkify_function(match)
                new_tokens.extend(linkified_tokens)

                # move the progress marker up to the end of this match
                current_index = match.end()

            # if there's still some text left over, add one more token for it (this will
            # be the entire thing if there weren't any matches)
            if current_index < len(original_text):
                new_tokens.append(
                    {"type": "Characters", "data": original_text[current_index:]}
                )

        return new_tokens
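
    # Sketch of the splitting behavior (illustrative, assuming "~music" is a valid
    # group path): a Characters token with data "see ~music here", run against
    # GROUP_REFERENCE_REGEX, would come back as:
    #   [{"type": "Characters", "data": "see "},
    #    {"type": "StartTag", "name": "a", "data": {...}},
    #    {"type": "Characters", "data": "~music"},
    #    {"type": "EndTag", "name": "a"},
    #    {"type": "Characters", "data": " here"}]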

    @staticmethod
    def _tokenize_group_match(match: re.Match) -> list[dict]:
        """Convert a potential group reference into HTML tokens."""
        # convert the potential group path to lowercase to allow people to use incorrect
        # casing but still have it link properly
        group_path = match[1].lower()

        # Even though they're technically valid paths, we don't want to linkify anything
        # starting with a number like "~10" or "~4.5", since that's just going to be
        # someone using it in the "approximately" sense. This will be a problem if a
        # top-level group's name ever starts with a number, but I think that's unlikely.
        is_ignored = group_path.startswith(tuple("0123456789"))

        # if it's a valid group path and not ignored by the above logic, convert to <a>
        if is_valid_group_path(group_path) and not is_ignored:
            return [
                {
                    "type": "StartTag",
                    "name": "a",
                    "data": {
                        (None, "class"): "link-group",
                        (None, "href"): f"/~{group_path}",
                    },
                },
                {"type": "Characters", "data": match[0]},
                {"type": "EndTag", "name": "a"},
            ]

        # one of the checks failed, so just keep it as the original text
        return [{"type": "Characters", "data": match[0]}]

    @staticmethod
    def _tokenize_username_match(match: re.Match) -> list[dict]:
        """Convert a potential username reference into HTML tokens."""
        # if it's a valid username, convert to <a>
        if is_valid_username(match[1]):
            return [
                {
                    "type": "StartTag",
                    "name": "a",
                    "data": {
                        (None, "class"): "link-user",
                        (None, "href"): f"/user/{match[1]}",
                    },
                },
                {"type": "Characters", "data": match[0]},
                {"type": "EndTag", "name": "a"},
            ]

        # the username wasn't valid, so just keep it as the original text
        return [{"type": "Characters", "data": match[0]}]

    @staticmethod
    def _tokenize_subreddit_match(match: re.Match) -> list[dict]:
        """Convert a subreddit reference into HTML tokens."""
        return [
            {
                "type": "StartTag",
                "name": "a",
                "data": {(None, "href"): f"https://www.reddit.com/r/{match[1]}/"},
            },
            {"type": "Characters", "data": match[0]},
            {"type": "EndTag", "name": "a"},
        ]


def linkify_and_sanitize_html(
    html: str, context: Optional[HTMLSanitizationContext] = None
) -> str:
    """Use bleach and html5lib filters to linkify and sanitize HTML."""
    # list of tag names to exclude from linkification
    linkify_skipped_tags = ["code", "pre"]

    tildes_linkifier = partial(LinkifyFilter, skip_tags=linkify_skipped_tags)

    allowed_attributes = ALLOWED_HTML_ATTRIBUTES_DEFAULT
    if context:
        # include overrides for the current context
        overrides = ALLOWED_HTML_ATTRIBUTES_OVERRIDES.get(context, {})
        allowed_attributes = {**allowed_attributes, **overrides}

    cleaner = bleach.Cleaner(
        tags=ALLOWED_HTML_TAGS,
        attributes=allowed_attributes,
        protocols=ALLOWED_LINK_PROTOCOLS,
        filters=[tildes_linkifier],
    )
    return cleaner.clean(html)