view sat/plugins/plugin_misc_text_syntaxes.py @ 3826:81c79b7cafa7

tests (unit/ap-gateway): tests for XMPP identity/vCard4 <=> AP actor data conversion: rel 368
author Goffi <goffi@goffi.org>
date Wed, 29 Jun 2022 14:06:33 +0200
parents 33d75cd3c371
children 524856bd7b19
line wrap: on
line source

#!/usr/bin/env python3


# SAT plugin for managing various text syntaxes
# Copyright (C) 2009-2021 Jérôme Poisson (goffi@goffi.org)

# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.

# You should have received a copy of the GNU Affero General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

from functools import partial
from html import escape
import re
from typing import Set

from twisted.internet import defer
from twisted.internet.threads import deferToThread

from sat.core import exceptions
from sat.core.constants import Const as C
from sat.core.i18n import D_, _
from sat.core.log import getLogger
from sat.tools import xml_tools

try:
    from lxml import html
    from lxml.html import clean
    from lxml import etree
except ImportError:
    raise exceptions.MissingModule(
        "Missing module lxml, please download/install it from http://lxml.de/"
    )

log = getLogger(__name__)

CATEGORY = D_("Composition")
NAME = "Syntax"
_SYNTAX_XHTML = "xhtml"  # must be lower case
_SYNTAX_CURRENT = "@CURRENT@"

# TODO: check/adapt following list
# list initialy based on feedparser list (http://pythonhosted.org/feedparser/html-sanitization.html)
STYLES_WHITELIST = (
    "azimuth",
    "background-color",
    "border-bottom-color",
    "border-collapse",
    "border-color",
    "border-left-color",
    "border-right-color",
    "border-top-color",
    "clear",
    "color",
    "cursor",
    "direction",
    "display",
    "elevation",
    "float",
    "font",
    "font-family",
    "font-size",
    "font-style",
    "font-variant",
    "font-weight",
    "height",
    "letter-spacing",
    "line-height",
    "overflow",
    "pause",
    "pause-after",
    "pause-before",
    "pitch",
    "pitch-range",
    "richness",
    "speak",
    "speak-header",
    "speak-numeral",
    "speak-punctuation",
    "speech-rate",
    "stress",
    "text-align",
    "text-decoration",
    "text-indent",
    "unicode-bidi",
    "vertical-align",
    "voice-family",
    "volume",
    "white-space",
    "width",
)

# cf. https://www.w3.org/TR/html/syntax.html#void-elements
VOID_ELEMENTS = (
    "area",
    "base",
    "br",
    "col",
    "embed",
    "hr",
    "img",
    "input",
    "keygen",
    "link",
    "menuitem",
    "meta",
    "param",
    "source",
    "track",
    "wbr")

SAFE_ATTRS = html.defs.safe_attrs.union({"style", "poster", "controls"}) - {"id"}
SAFE_CLASSES = {
    # those classes are used for code highlighting
    "bp", "c", "ch", "cm", "cp", "cpf", "cs", "dl", "err", "fm", "gd", "ge", "get", "gh",
    "gi", "go", "gp", "gr", "gs", "gt", "gu", "highlight", "hll", "il", "k", "kc", "kd",
    "kn", "kp", "kr", "kt", "m", "mb", "mf", "mh", "mi", "mo", "na", "nb", "nc", "nd",
    "ne", "nf", "ni", "nl", "nn", "no", "nt", "nv", "o", "ow", "s", "sa", "sb", "sc",
    "sd", "se", "sh", "si", "sr", "ss", "sx", "vc", "vg", "vi", "vm", "w", "write",
}
STYLES_VALUES_REGEX = (
    r"^("
    + "|".join(
        [
            "([a-z-]+)",  # alphabetical names
            "(#[0-9a-f]+)",  # hex value
            "(\d+(.\d+)? *(|%|em|ex|px|in|cm|mm|pt|pc))",  # values with units (or not)
            "rgb\( *((\d+(.\d+)?), *){2}(\d+(.\d+)?) *\)",  # rgb function
            "rgba\( *((\d+(.\d+)?), *){3}(\d+(.\d+)?) *\)",  # rgba function
        ]
    )
    + ") *(!important)?$"
)  # we accept "!important" at the end
STYLES_ACCEPTED_VALUE = re.compile(STYLES_VALUES_REGEX)

PLUGIN_INFO = {
    C.PI_NAME: "Text syntaxes",
    C.PI_IMPORT_NAME: "TEXT_SYNTAXES",
    C.PI_TYPE: "MISC",
    C.PI_MODES: C.PLUG_MODE_BOTH,
    C.PI_PROTOCOLS: [],
    C.PI_DEPENDENCIES: [],
    C.PI_MAIN: "TextSyntaxes",
    C.PI_HANDLER: "no",
    C.PI_DESCRIPTION: _(
        """Management of various text syntaxes (XHTML-IM, Markdown, etc)"""
    ),
}


class TextSyntaxes(object):
    """ Text conversion class
    XHTML utf-8 is used as intermediate language for conversions
    """

    OPT_DEFAULT = "DEFAULT"
    OPT_HIDDEN = "HIDDEN"
    OPT_NO_THREAD = "NO_THREAD"
    SYNTAX_XHTML = _SYNTAX_XHTML
    SYNTAX_MARKDOWN = "markdown"
    SYNTAX_TEXT = "text"
    # default_syntax must be lower case
    default_syntax = SYNTAX_XHTML


    def __init__(self, host):
        log.info(_("Text syntaxes plugin initialization"))
        self.host = host
        self.syntaxes = {}

        self.params = """
            <params>
            <individual>
            <category name="%(category_name)s" label="%(category_label)s">
                <param name="%(name)s" label="%(label)s" type="list" security="0">
                    %(options)s
                </param>
            </category>
            </individual>
            </params>
        """

        self.params_data = {
            "category_name": CATEGORY,
            "category_label": _(CATEGORY),
            "name": NAME,
            "label": _(NAME),
            "syntaxes": self.syntaxes,
        }

        self.addSyntax(
            self.SYNTAX_XHTML,
            lambda xhtml: defer.succeed(xhtml),
            lambda xhtml: defer.succeed(xhtml),
            TextSyntaxes.OPT_NO_THREAD,
        )
        # TODO: text => XHTML should add <a/> to url like in frontends
        #       it's probably best to move sat_frontends.tools.strings to sat.tools.common or similar
        self.addSyntax(
            self.SYNTAX_TEXT,
            lambda text: escape(text),
            lambda xhtml: self._removeMarkups(xhtml),
            [TextSyntaxes.OPT_HIDDEN],
        )
        try:
            import markdown, html2text
            from markdown.extensions import Extension

            # XXX: we disable raw HTML parsing by default, to avoid parsing error
            #      when the user is not aware of markdown and HTML
            class EscapeHTML(Extension):
                def extendMarkdown(self, md):
                    md.preprocessors.deregister('html_block')
                    md.inlinePatterns.deregister('html')

            def _html2text(html, baseurl=""):
                h = html2text.HTML2Text(baseurl=baseurl)
                h.body_width = 0  # do not truncate the lines, it breaks the long URLs
                return h.handle(html)

            self.addSyntax(
                self.SYNTAX_MARKDOWN,
                partial(markdown.markdown,
                        extensions=[
                            EscapeHTML(),
                            'nl2br',
                            'codehilite',
                            'fenced_code',
                            'sane_lists',
                            'tables',
                            ],
                        extension_configs = {
                            "codehilite": {
                                "css_class": "highlight",
                            }
                        }),
                _html2text,
                [TextSyntaxes.OPT_DEFAULT],
            )
        except ImportError:
            log.warning("markdown or html2text not found, can't use Markdown syntax")
            log.info(
                "You can download/install them from https://pythonhosted.org/Markdown/ "
                "and https://github.com/Alir3z4/html2text/"
            )
        host.bridge.addMethod(
            "syntaxConvert",
            ".plugin",
            in_sign="sssbs",
            out_sign="s",
            async_=True,
            method=self.convert,
        )
        host.bridge.addMethod(
            "syntaxGet", ".plugin", in_sign="s", out_sign="s", method=self.getSyntax
        )
        if xml_tools.cleanXHTML is None:
            log.debug("Installing cleaning method")
            xml_tools.cleanXHTML = self.cleanXHTML

    def _updateParamOptions(self):
        data_synt = self.syntaxes
        default_synt = TextSyntaxes.default_syntax
        syntaxes = []

        for syntax in list(data_synt.keys()):
            flags = data_synt[syntax]["flags"]
            if TextSyntaxes.OPT_HIDDEN not in flags:
                syntaxes.append(syntax)

        syntaxes.sort(key=lambda synt: synt.lower())
        options = []

        for syntax in syntaxes:
            selected = 'selected="true"' if syntax == default_synt else ""
            options.append('<option value="%s" %s/>' % (syntax, selected))

        self.params_data["options"] = "\n".join(options)
        self.host.memory.updateParams(self.params % self.params_data)

    def getCurrentSyntax(self, profile):
        """ Return the selected syntax for the given profile

        @param profile: %(doc_profile)s
        @return: profile selected syntax
        """
        return self.host.memory.getParamA(NAME, CATEGORY, profile_key=profile)

    def _logError(self, failure, action="converting syntax"):
        log.error(
            "Error while {action}: {failure}".format(action=action, failure=failure)
        )
        return failure

    def cleanStyle(self, styles_raw: str) -> str:
        """"Clean unsafe CSS styles

        Remove styles not in the whitelist, or where the value doesn't match the regex
        @param styles_raw: CSS styles
        @return: cleaned styles
        """
        styles: List[str] = styles_raw.split(";")
        cleaned_styles = []
        for style in styles:
            try:
                key, value = style.split(":")
            except ValueError:
                continue
            key = key.lower().strip()
            if key not in STYLES_WHITELIST:
                continue
            value = value.lower().strip()
            if not STYLES_ACCEPTED_VALUE.match(value):
                continue
            if value == "none":
                continue
            cleaned_styles.append((key, value))
        return "; ".join(
            ["%s: %s" % (key_, value_) for key_, value_ in cleaned_styles]
        )

    def cleanClasses(self, classes_raw: str) -> str:
        """Remove any non whitelisted class

        @param classes_raw: classes set on an element
        @return: remaining classes (can be empty string)
        """
        return " ".join(SAFE_CLASSES.intersection(classes_raw.split()))

    def cleanXHTML(self, xhtml):
        """Clean XHTML text by removing potentially dangerous/malicious parts

        @param xhtml(unicode, lxml.etree._Element): raw HTML/XHTML text to clean
        @return (unicode): cleaned XHTML
        """

        if isinstance(xhtml, str):
            try:
                xhtml_elt = html.fromstring(xhtml)
            except etree.ParserError as e:
                if not xhtml.strip():
                    return ""
                log.error("Can't clean XHTML: {xhtml}".format(xhtml=xhtml))
                raise e
        elif isinstance(xhtml, html.HtmlElement):
            xhtml_elt = xhtml
        else:
            log.error("Only strings and HtmlElements can be cleaned")
            raise exceptions.DataError
        cleaner = clean.Cleaner(
            style=False, add_nofollow=False, safe_attrs=SAFE_ATTRS
        )
        xhtml_elt = cleaner.clean_html(xhtml_elt)
        for elt in xhtml_elt.xpath("//*[@style]"):
            elt.set("style", self.cleanStyle(elt.get("style")))
        for elt in xhtml_elt.xpath("//*[@class]"):
            elt.set("class", self.cleanClasses(elt.get("class")))
        # we remove self-closing elements for non-void elements
        for element in xhtml_elt.iter(tag=etree.Element):
            if not element.text:
                if element.tag in VOID_ELEMENTS:
                    element.text = None
                else:
                    element.text = ''
        return html.tostring(xhtml_elt, encoding=str, method="xml")

    def convert(self, text, syntax_from, syntax_to=_SYNTAX_XHTML, safe=True,
                profile=None):
        """Convert a text between two syntaxes

        @param text: text to convert
        @param syntax_from: source syntax (e.g. "markdown")
        @param syntax_to: dest syntax (e.g.: "XHTML")
        @param safe: clean resulting XHTML to avoid malicious code if True
        @param profile: needed only when syntax_from or syntax_to is set to
            _SYNTAX_CURRENT
        @return(unicode): converted text
        """
        # FIXME: convert should be abled to handle domish.Element directly
        #        when dealing with XHTML
        # TODO: a way for parser to return parsing errors/warnings

        if syntax_from == _SYNTAX_CURRENT:
            syntax_from = self.getCurrentSyntax(profile)
        else:
            syntax_from = syntax_from.lower().strip()
        if syntax_to == _SYNTAX_CURRENT:
            syntax_to = self.getCurrentSyntax(profile)
        else:
            syntax_to = syntax_to.lower().strip()
        syntaxes = self.syntaxes
        if syntax_from not in syntaxes:
            raise exceptions.NotFound(syntax_from)
        if syntax_to not in syntaxes:
            raise exceptions.NotFound(syntax_to)
        d = None

        if TextSyntaxes.OPT_NO_THREAD in syntaxes[syntax_from]["flags"]:
            d = defer.maybeDeferred(syntaxes[syntax_from]["to"], text)
        else:
            d = deferToThread(syntaxes[syntax_from]["to"], text)

        # TODO: keep only body element and change it to a div here ?

        if safe:
            d.addCallback(self.cleanXHTML)

        if TextSyntaxes.OPT_NO_THREAD in syntaxes[syntax_to]["flags"]:
            d.addCallback(syntaxes[syntax_to]["from"])
        else:
            d.addCallback(lambda xhtml: deferToThread(syntaxes[syntax_to]["from"], xhtml))

        # converters can add new lines that disturb the microblog change detection
        d.addCallback(lambda text: text.rstrip())
        return d

    def addSyntax(self, name, to_xhtml_cb, from_xhtml_cb, flags=None):
        """Add a new syntax to the manager

        @param name: unique name of the syntax
        @param to_xhtml_cb: callback to convert from syntax to XHTML
        @param from_xhtml_cb: callback to convert from XHTML to syntax
        @param flags: set of optional flags, can be:
            TextSyntaxes.OPT_DEFAULT: use as the default syntax (replace former one)
            TextSyntaxes.OPT_HIDDEN: do not show in parameters
            TextSyntaxes.OPT_NO_THREAD: do not defer to thread when converting (the callback may then return a deferred)
        """
        flags = flags if flags is not None else []
        if TextSyntaxes.OPT_HIDDEN in flags and TextSyntaxes.OPT_DEFAULT in flags:
            raise ValueError(
                "{} and {} are mutually exclusive".format(
                    TextSyntaxes.OPT_HIDDEN, TextSyntaxes.OPT_DEFAULT
                )
            )

        syntaxes = self.syntaxes
        key = name.lower().strip()
        if key in syntaxes:
            raise exceptions.ConflictError(
                "This syntax key already exists: {}".format(key)
            )
        syntaxes[key] = {
            "name": name,
            "to": to_xhtml_cb,
            "from": from_xhtml_cb,
            "flags": flags,
        }
        if TextSyntaxes.OPT_DEFAULT in flags:
            TextSyntaxes.default_syntax = key

        self._updateParamOptions()

    def getSyntax(self, name):
        """get syntax key corresponding to a name

        @raise exceptions.NotFound: syntax doesn't exist
        """
        key = name.lower().strip()
        if key in self.syntaxes:
            return key
        raise exceptions.NotFound

    def _removeMarkups(self, xhtml):
        """Remove XHTML markups from the given string.

        @param xhtml: the XHTML string to be cleaned
        @return: the cleaned string
        """
        cleaner = clean.Cleaner(kill_tags=["style"])
        cleaned = cleaner.clean_html(html.fromstring(xhtml))
        return html.tostring(cleaned, encoding=str, method="text")