Mercurial > libervia-backend

diff libervia/backend/plugins/plugin_misc_text_syntaxes.py @ 4071:4b842c1fb686
refactoring: renamed `sat` package to `libervia.backend`
author: Goffi <goffi@goffi.org>
date: Fri, 02 Jun 2023 11:49:51 +0200
parents: sat/plugins/plugin_misc_text_syntaxes.py@00dbc3370d35
children: 26b7ed2817da
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libervia/backend/plugins/plugin_misc_text_syntaxes.py	Fri Jun 02 11:49:51 2023 +0200
@@ -0,0 +1,479 @@
+#!/usr/bin/env python3
+
+
+# SAT plugin for managing various text syntaxes
+# Copyright (C) 2009-2021 Jérôme Poisson (goffi@goffi.org)
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Affero General Public License for more details.
+
+# You should have received a copy of the GNU Affero General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+from functools import partial
+from html import escape
+import re
+from typing import Set
+
+from twisted.internet import defer
+from twisted.internet.threads import deferToThread
+
+from libervia.backend.core import exceptions
+from libervia.backend.core.constants import Const as C
+from libervia.backend.core.i18n import D_, _
+from libervia.backend.core.log import getLogger
+from libervia.backend.tools import xml_tools
+
+try:
+    from lxml import html
+    from lxml.html import clean
+    from lxml import etree
+except ImportError:
+    raise exceptions.MissingModule(
+        "Missing module lxml, please download/install it from http://lxml.de/"
+    )
+
+# Module-level logger, named after this module.
+log = getLogger(__name__)
+
+# Category and name of the syntax parameter: both are injected in the parameters
+# XML built by TextSyntaxes and used again by get_current_syntax to read the value.
+CATEGORY = D_("Composition")
+NAME = "Syntax"
+_SYNTAX_XHTML = "xhtml"  # must be lower case
+# Placeholder accepted by TextSyntaxes.convert: it is replaced by the syntax
+# currently selected in the profile parameters.
+_SYNTAX_CURRENT = "@CURRENT@"
+
+# CSS property names allowed in "style" attributes: TextSyntaxes.clean_style drops
+# every declaration whose (lower-cased) property is not listed here.
+# TODO: check/adapt following list
+# list initially based on feedparser list
+# (http://pythonhosted.org/feedparser/html-sanitization.html)
+STYLES_WHITELIST = (
+    "azimuth",
+    "background-color",
+    "border-bottom-color",
+    "border-collapse",
+    "border-color",
+    "border-left-color",
+    "border-right-color",
+    "border-top-color",
+    "clear",
+    "color",
+    "cursor",
+    "direction",
+    "display",
+    "elevation",
+    "float",
+    "font",
+    "font-family",
+    "font-size",
+    "font-style",
+    "font-variant",
+    "font-weight",
+    "height",
+    "letter-spacing",
+    "line-height",
+    "overflow",
+    "pause",
+    "pause-after",
+    "pause-before",
+    "pitch",
+    "pitch-range",
+    "richness",
+    "speak",
+    "speak-header",
+    "speak-numeral",
+    "speak-punctuation",
+    "speech-rate",
+    "stress",
+    "text-align",
+    "text-decoration",
+    "text-indent",
+    "unicode-bidi",
+    "vertical-align",
+    "voice-family",
+    "volume",
+    "white-space",
+    "width",
+)
+
+# Names of the HTML void elements (elements which never have content).
+# TextSyntaxes.clean_xhtml keeps these self-closing and gives every other empty
+# element an explicit closing tag.
+# cf. https://www.w3.org/TR/html/syntax.html#void-elements
+VOID_ELEMENTS = (
+    "area",
+    "base",
+    "br",
+    "col",
+    "embed",
+    "hr",
+    "img",
+    "input",
+    "keygen",
+    "link",
+    "menuitem",
+    "meta",
+    "param",
+    "source",
+    "track",
+    "wbr")
+
+# Attributes kept by the cleaner in TextSyntaxes.clean_xhtml: lxml's default safe
+# attributes, plus "style", "poster" and "controls", minus "id".
+SAFE_ATTRS = html.defs.safe_attrs.union({"style", "poster", "controls"}) - {"id"}
+# Class names kept by TextSyntaxes.clean_classes; any other class is removed.
+SAFE_CLASSES = {
+    # those classes are used for code highlighting
+    "bp", "c", "ch", "cm", "cp", "cpf", "cs", "dl", "err", "fm", "gd", "ge", "get", "gh",
+    "gi", "go", "gp", "gr", "gs", "gt", "gu", "highlight", "hll", "il", "k", "kc", "kd",
+    "kn", "kp", "kr", "kt", "m", "mb", "mf", "mh", "mi", "mo", "na", "nb", "nc", "nd",
+    "ne", "nf", "ni", "nl", "nn", "no", "nt", "nv", "o", "ow", "s", "sa", "sb", "sc",
+    "sd", "se", "sh", "si", "sr", "ss", "sx", "vc", "vg", "vi", "vm", "w", "write",
+}
+# Regular expression describing the CSS values accepted by TextSyntaxes.clean_style.
+# The first group is the value itself, without the optional "!important" suffix.
+# Fragments are raw strings so that "\d", "\(" and "\." reach the regex engine
+# untouched, and the decimal separator is escaped so that only a literal dot is
+# accepted between the integer and fractional parts of a number.
+STYLES_VALUES_REGEX = (
+    r"^("
+    + "|".join(
+        [
+            r"([a-z-]+)",  # alphabetical names
+            r"(#[0-9a-f]+)",  # hex value
+            r"(\d+(\.\d+)? *(|%|em|ex|px|in|cm|mm|pt|pc))",  # values with units (or not)
+            r"rgb\( *((\d+(\.\d+)?), *){2}(\d+(\.\d+)?) *\)",  # rgb function
+            r"rgba\( *((\d+(\.\d+)?), *){3}(\d+(\.\d+)?) *\)",  # rgba function
+        ]
+    )
+    + r") *(!important)?$"
+)  # we accept "!important" at the end
+STYLES_ACCEPTED_VALUE = re.compile(STYLES_VALUES_REGEX)
+
+# Plugin metadata. C.PI_MAIN holds the name of the class defined below, and
+# C.PI_IMPORT_NAME the name under which this plugin is registered.
+# NOTE(review): presumably read by the backend plugin loader — confirm there.
+PLUGIN_INFO = {
+    C.PI_NAME: "Text syntaxes",
+    C.PI_IMPORT_NAME: "TEXT_SYNTAXES",
+    C.PI_TYPE: "MISC",
+    C.PI_MODES: C.PLUG_MODE_BOTH,
+    C.PI_PROTOCOLS: [],
+    C.PI_DEPENDENCIES: [],
+    C.PI_MAIN: "TextSyntaxes",
+    C.PI_HANDLER: "no",
+    C.PI_DESCRIPTION: _(
+        """Management of various text syntaxes (XHTML-IM, Markdown, etc)"""
+    ),
+}
+
+
+class TextSyntaxes(object):
+    """Text conversion class
+
+    XHTML utf-8 is used as intermediate language for conversions: every syntax
+    is registered with one callback converting it to XHTML and one callback
+    converting XHTML back to it.
+    """
+
+    # Flags accepted by add_syntax:
+    # use the syntax as the default one (replaces the former default)
+    OPT_DEFAULT = "DEFAULT"
+    # do not list the syntax in the parameter options
+    OPT_HIDDEN = "HIDDEN"
+    # call the converters directly instead of deferring them to a thread
+    OPT_NO_THREAD = "NO_THREAD"
+    # Names of the syntaxes registered by __init__
+    SYNTAX_XHTML = _SYNTAX_XHTML
+    SYNTAX_MARKDOWN = "markdown"
+    SYNTAX_TEXT = "text"
+    # default_syntax must be lower case
+    # (class attribute: add_syntax overwrites it when OPT_DEFAULT is given)
+    default_syntax = SYNTAX_XHTML
+
+
+    def __init__(self, host):
+        """Register the built-in syntaxes and expose the bridge methods
+
+        XHTML and plain text are always registered. Markdown is registered, as
+        the default syntax, only when both "markdown" and "html2text" modules can
+        be imported.
+        @param host: backend host; its "bridge" and "memory" attributes are used
+        """
+        log.info(_("Text syntaxes plugin initialization"))
+        self.host = host
+        # syntax key => {"name", "to", "from", "flags"}, filled by add_syntax
+        self.syntaxes = {}
+
+        # template of the parameters XML, rendered by _update_param_options
+        self.params = """
+            <params>
+            <individual>
+            <category name="%(category_name)s" label="%(category_label)s">
+                <param name="%(name)s" label="%(label)s" type="list" security="0">
+                    %(options)s
+                </param>
+            </category>
+            </individual>
+            </params>
+        """
+
+        # values for the template above; "options" is added by _update_param_options
+        self.params_data = {
+            "category_name": CATEGORY,
+            "category_label": _(CATEGORY),
+            "name": NAME,
+            "label": _(NAME),
+            "syntaxes": self.syntaxes,
+        }
+
+        # XHTML is the pivot syntax: both converters are identity functions.
+        # Flags are always given as a list: a bare string would only satisfy the
+        # "flag in flags" checks through substring matching.
+        self.add_syntax(
+            self.SYNTAX_XHTML,
+            lambda xhtml: defer.succeed(xhtml),
+            lambda xhtml: defer.succeed(xhtml),
+            [TextSyntaxes.OPT_NO_THREAD],
+        )
+        # TODO: text => XHTML should add <a/> to url like in frontends
+        #       it's probably best to move sat_frontends.tools.strings to sat.tools.common or similar
+        self.add_syntax(
+            self.SYNTAX_TEXT,
+            lambda text: escape(text),
+            lambda xhtml: self._remove_markups(xhtml),
+            [TextSyntaxes.OPT_HIDDEN],
+        )
+        try:
+            import markdown
+            import html2text
+            from markdown.extensions import Extension
+
+            # XXX: we disable raw HTML parsing by default, to avoid parsing error
+            #      when the user is not aware of markdown and HTML
+            class EscapeHTML(Extension):
+                def extendMarkdown(self, md):
+                    md.preprocessors.deregister('html_block')
+                    md.inlinePatterns.deregister('html')
+
+            def _html2text(html, baseurl=""):
+                h = html2text.HTML2Text(baseurl=baseurl)
+                h.body_width = 0  # do not truncate the lines, it breaks the long URLs
+                return h.handle(html)
+
+            self.add_syntax(
+                self.SYNTAX_MARKDOWN,
+                partial(markdown.markdown,
+                        extensions=[
+                            EscapeHTML(),
+                            'nl2br',
+                            'codehilite',
+                            'fenced_code',
+                            'sane_lists',
+                            'tables',
+                            ],
+                        extension_configs = {
+                            "codehilite": {
+                                "css_class": "highlight",
+                            }
+                        }),
+                _html2text,
+                [TextSyntaxes.OPT_DEFAULT],
+            )
+        except ImportError:
+            # Markdown support is optional: the plugin keeps working without it
+            log.warning("markdown or html2text not found, can't use Markdown syntax")
+            log.info(
+                "You can download/install them from https://pythonhosted.org/Markdown/ "
+                "and https://github.com/Alir3z4/html2text/"
+            )
+        host.bridge.add_method(
+            "syntax_convert",
+            ".plugin",
+            in_sign="sssbs",
+            out_sign="s",
+            async_=True,
+            method=self.convert,
+        )
+        host.bridge.add_method(
+            "syntax_get", ".plugin", in_sign="s", out_sign="s", method=self.get_syntax
+        )
+        # only install our cleaning method if nothing else has been set before
+        if xml_tools.clean_xhtml is None:
+            log.debug("Installing cleaning method")
+            xml_tools.clean_xhtml = self.clean_xhtml
+
+    def _update_param_options(self):
+        """Rebuild the options of the syntax parameter and send them to the host
+
+        Every registered syntax which is not flagged OPT_HIDDEN gets an <option/>
+        element, in alphabetical order; the class default syntax is marked as
+        selected. The rendered parameters XML is then given to the host memory.
+        """
+        visible_keys = sorted(
+            (
+                key
+                for key, data in self.syntaxes.items()
+                if TextSyntaxes.OPT_HIDDEN not in data["flags"]
+            ),
+            key=str.lower,
+        )
+        default_key = TextSyntaxes.default_syntax
+        option_elts = [
+            '<option value="%s" %s/>'
+            % (key, 'selected="true"' if key == default_key else "")
+            for key in visible_keys
+        ]
+
+        self.params_data["options"] = "\n".join(option_elts)
+        self.host.memory.update_params(self.params % self.params_data)
+
+    def get_current_syntax(self, profile):
+        """Get the syntax currently selected in the parameters of a profile
+
+        @param profile: %(doc_profile)s
+        @return: value of the syntax parameter for this profile
+        """
+        memory = self.host.memory
+        return memory.param_get_a(NAME, CATEGORY, profile_key=profile)
+
+    def _log_error(self, failure, action="converting syntax"):
+        """Log a failure and hand it back unchanged
+
+        @param failure: the failure to report
+        @param action: description of what was being done when it happened
+        @return: the failure received, so that it keeps propagating
+        """
+        message = "Error while {action}: {failure}".format(
+            failure=failure, action=action
+        )
+        log.error(message)
+        return failure
+
+    def clean_style(self, styles_raw: str) -> str:
+        """Clean unsafe CSS styles
+
+        Remove styles not in the whitelist, or where the value doesn't match the
+        regex. "none" values are removed too, with or without a trailing
+        "!important".
+        @param styles_raw: CSS styles, as found in a "style" attribute
+        @return: cleaned styles, as "key: value" declarations joined with "; "
+        """
+        cleaned_styles = []
+        for style in styles_raw.split(";"):
+            try:
+                key, value = style.split(":")
+            except ValueError:
+                # no colon at all (e.g. empty chunk after a trailing ";"),
+                # or more than one: the declaration is dropped
+                continue
+            key = key.lower().strip()
+            if key not in STYLES_WHITELIST:
+                continue
+            value = value.lower().strip()
+            match = STYLES_ACCEPTED_VALUE.match(value)
+            if match is None:
+                continue
+            # the first group is the value without the optional "!important"
+            # suffix: comparing it (and not the whole value) prevents
+            # "none !important" from slipping through the "none" filter
+            if match.group(1) == "none":
+                continue
+            cleaned_styles.append((key, value))
+        return "; ".join(
+            ["%s: %s" % (key_, value_) for key_, value_ in cleaned_styles]
+        )
+
+    def clean_classes(self, classes_raw: str) -> str:
+        """Keep only the whitelisted classes of an element
+
+        @param classes_raw: value of the "class" attribute of an element
+        @return: space separated classes found in SAFE_CLASSES
+            (can be empty string)
+        """
+        candidates = classes_raw.split()
+        allowed = SAFE_CLASSES.intersection(candidates)
+        return " ".join(allowed)
+
+    def clean_xhtml(self, xhtml):
+        """Remove potentially dangerous/malicious parts from XHTML
+
+        The content goes through lxml's Cleaner, then "style" and "class"
+        attributes are filtered with clean_style and clean_classes.
+        @param xhtml(unicode, lxml.html.HtmlElement): raw HTML/XHTML to clean
+        @return (unicode): cleaned XHTML
+            empty string if a blank string has been given and can't be parsed
+        @raise exceptions.DataError: xhtml is neither a string nor an HtmlElement
+        @raise etree.ParserError: a non blank string can't be parsed
+        """
+        if not isinstance(xhtml, (str, html.HtmlElement)):
+            log.error("Only strings and HtmlElements can be cleaned")
+            raise exceptions.DataError
+
+        if isinstance(xhtml, str):
+            try:
+                root = html.fromstring(xhtml)
+            except etree.ParserError as e:
+                # nothing to clean in a blank document
+                if not xhtml.strip():
+                    return ""
+                log.error("Can't clean XHTML: {xhtml}".format(xhtml=xhtml))
+                raise e
+        else:
+            root = xhtml
+
+        sanitizer = clean.Cleaner(
+            style=False, add_nofollow=False, safe_attrs=SAFE_ATTRS
+        )
+        root = sanitizer.clean_html(root)
+        for styled_elt in root.xpath("//*[@style]"):
+            styled_elt.set("style", self.clean_style(styled_elt.get("style")))
+        for classed_elt in root.xpath("//*[@class]"):
+            classed_elt.set("class", self.clean_classes(classed_elt.get("class")))
+        # we remove self-closing elements for non-void elements: an empty string
+        # as text forces an explicit closing tag, None keeps the self-closing form
+        for node in root.iter(tag=etree.Element):
+            if node.text:
+                continue
+            node.text = None if node.tag in VOID_ELEMENTS else ''
+        return html.tostring(root, encoding=str, method="xml")
+
+    def convert(self, text, syntax_from, syntax_to=_SYNTAX_XHTML, safe=True,
+                profile=None):
+        """Convert a text from one syntax to another one, going through XHTML
+
+        @param text: text to convert
+        @param syntax_from: source syntax (e.g. "markdown")
+        @param syntax_to: dest syntax (e.g.: "XHTML")
+        @param safe: if True, the intermediate XHTML is cleaned with clean_xhtml
+            to avoid malicious code
+        @param profile: needed only when syntax_from or syntax_to is set to
+            _SYNTAX_CURRENT
+        @return (Deferred): fires with the converted text, trailing whitespace
+            removed
+        @raise exceptions.NotFound: one of the syntaxes is not registered
+        """
+        # FIXME: convert should be able to handle domish.Element directly
+        #        when dealing with XHTML
+        # TODO: a way for parser to return parsing errors/warnings
+
+        def resolve(syntax):
+            # _SYNTAX_CURRENT stands for the syntax selected in profile params
+            if syntax == _SYNTAX_CURRENT:
+                return self.get_current_syntax(profile)
+            return syntax.lower().strip()
+
+        syntax_from = resolve(syntax_from)
+        syntax_to = resolve(syntax_to)
+        registry = self.syntaxes
+        for wanted in (syntax_from, syntax_to):
+            if wanted not in registry:
+                raise exceptions.NotFound(wanted)
+        source, target = registry[syntax_from], registry[syntax_to]
+
+        # first step: source syntax => XHTML
+        to_xhtml = source["to"]
+        if TextSyntaxes.OPT_NO_THREAD in source["flags"]:
+            d = defer.maybeDeferred(to_xhtml, text)
+        else:
+            d = deferToThread(to_xhtml, text)
+
+        # TODO: keep only body element and change it to a div here ?
+
+        if safe:
+            d.addCallback(self.clean_xhtml)
+
+        # second step: XHTML => destination syntax
+        from_xhtml = target["from"]
+        if TextSyntaxes.OPT_NO_THREAD in target["flags"]:
+            d.addCallback(from_xhtml)
+        else:
+            d.addCallback(lambda xhtml: deferToThread(from_xhtml, xhtml))
+
+        # converters can add new lines that disturb the microblog change detection
+        d.addCallback(lambda converted: converted.rstrip())
+        return d
+
+    def add_syntax(self, name, to_xhtml_cb, from_xhtml_cb, flags=None):
+        """Register a new syntax
+
+        The syntax is stored under its lower-cased, stripped name, and the
+        parameter options are refreshed.
+        @param name: unique name of the syntax
+        @param to_xhtml_cb: callback to convert from syntax to XHTML
+        @param from_xhtml_cb: callback to convert from XHTML to syntax
+        @param flags: set of optional flags, can be:
+            TextSyntaxes.OPT_DEFAULT: use as the default syntax (replace former one)
+            TextSyntaxes.OPT_HIDDEN: do not show in parameters
+            TextSyntaxes.OPT_NO_THREAD: do not defer to thread when converting
+                (the callback may then return a deferred)
+        @raise ValueError: OPT_HIDDEN and OPT_DEFAULT are used together
+        @raise exceptions.ConflictError: a syntax with this key already exists
+        """
+        if flags is None:
+            flags = []
+        hidden_flag = TextSyntaxes.OPT_HIDDEN
+        default_flag = TextSyntaxes.OPT_DEFAULT
+        if hidden_flag in flags and default_flag in flags:
+            raise ValueError(
+                "{} and {} are mutually exclusive".format(hidden_flag, default_flag)
+            )
+
+        key = name.lower().strip()
+        if key in self.syntaxes:
+            raise exceptions.ConflictError(
+                "This syntax key already exists: {}".format(key)
+            )
+        self.syntaxes[key] = {
+            "name": name,
+            "to": to_xhtml_cb,
+            "from": from_xhtml_cb,
+            "flags": flags,
+        }
+        # the default syntax is stored on the class, not on the instance
+        if default_flag in flags:
+            TextSyntaxes.default_syntax = key
+
+        self._update_param_options()
+
+    def get_syntax(self, name):
+        """Get the key under which a syntax is registered
+
+        @param name: name of the syntax (case and surrounding spaces are ignored)
+        @return: syntax key
+        @raise exceptions.NotFound: syntax doesn't exist
+        """
+        key = name.lower().strip()
+        if key not in self.syntaxes:
+            raise exceptions.NotFound
+        return key
+
+    def _remove_markups(self, xhtml):
+        """Get the text content of an XHTML string, without any markup
+
+        <style> elements are removed together with their content.
+        @param xhtml: the XHTML string to be cleaned
+        @return: the text content of the cleaned XHTML
+        """
+        markup_cleaner = clean.Cleaner(kill_tags=["style"])
+        parsed = html.fromstring(xhtml)
+        without_style = markup_cleaner.clean_html(parsed)
+        return html.tostring(without_style, encoding=str, method="text")
author	Goffi <goffi@goffi.org>
date	Fri, 02 Jun 2023 11:49:51 +0200
parents	sat/plugins/plugin_misc_text_syntaxes.py@00dbc3370d35
children	26b7ed2817da