Mercurial > libervia-backend
view sat/plugins/plugin_misc_text_syntaxes.py @ 3671:9c50d2f812c1
docker (e2e): add `pytest-twisted` to image
author | Goffi <goffi@goffi.org> |
---|---|
date | Wed, 08 Sep 2021 17:58:48 +0200 |
parents | f568f304c982 |
children | 09f5ac48ffe3 |
line wrap: on
line source
#!/usr/bin/env python3

# SAT plugin for managing various text syntaxes
# Copyright (C) 2009-2021 Jérôme Poisson (goffi@goffi.org)

# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.

# You should have received a copy of the GNU Affero General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

import re
from html import escape
from functools import partial
from sat.core.i18n import _, D_
from sat.core.constants import Const as C
from sat.core.log import getLogger
from twisted.internet import defer
from twisted.internet.threads import deferToThread
from sat.core import exceptions
from sat.tools import xml_tools

try:
    from lxml import html
    from lxml.html import clean
    from lxml import etree
except ImportError:
    raise exceptions.MissingModule(
        "Missing module lxml, please download/install it from http://lxml.de/"
    )

log = getLogger(__name__)

CATEGORY = D_("Composition")
NAME = "Syntax"
_SYNTAX_XHTML = "xhtml"  # must be lower case
_SYNTAX_CURRENT = "@CURRENT@"

# TODO: check/adapt following list
# list initially based on feedparser list
# (http://pythonhosted.org/feedparser/html-sanitization.html)
STYLES_WHITELIST = (
    "azimuth",
    "background-color",
    "border-bottom-color",
    "border-collapse",
    "border-color",
    "border-left-color",
    "border-right-color",
    "border-top-color",
    "clear",
    "color",
    "cursor",
    "direction",
    "display",
    "elevation",
    "float",
    "font",
    "font-family",
    "font-size",
    "font-style",
    "font-variant",
    "font-weight",
    "height",
    "letter-spacing",
    "line-height",
    "overflow",
    "pause",
    "pause-after",
    "pause-before",
    "pitch",
    "pitch-range",
    "richness",
    "speak",
    "speak-header",
    "speak-numeral",
    "speak-punctuation",
    "speech-rate",
    "stress",
    "text-align",
    "text-decoration",
    "text-indent",
    "unicode-bidi",
    "vertical-align",
    "voice-family",
    "volume",
    "white-space",
    "width",
)

# cf. https://www.w3.org/TR/html/syntax.html#void-elements
VOID_ELEMENTS = (
    "area",
    "base",
    "br",
    "col",
    "embed",
    "hr",
    "img",
    "input",
    "keygen",
    "link",
    "menuitem",
    "meta",
    "param",
    "source",
    "track",
    "wbr",
)

SAFE_ATTRS = html.defs.safe_attrs.union(("style", "poster", "controls"))

# Raw strings are used so that "\d" and "\(" reach the regex engine untouched,
# and the decimal separator is written "\." so that it only matches a literal
# dot (a bare "." would accept any character between two digit groups).
STYLES_VALUES_REGEX = (
    r"^("
    + "|".join(
        [
            r"([a-z-]+)",  # alphabetical names
            r"(#[0-9a-f]+)",  # hex value
            r"(\d+(\.\d+)? *(|%|em|ex|px|in|cm|mm|pt|pc))",  # values with units (or not)
            r"rgb\( *((\d+(\.\d+)?), *){2}(\d+(\.\d+)?) *\)",  # rgb function
            r"rgba\( *((\d+(\.\d+)?), *){3}(\d+(\.\d+)?) *\)",  # rgba function
        ]
    )
    + r") *(!important)?$"
)  # we accept "!important" at the end
STYLES_ACCEPTED_VALUE = re.compile(STYLES_VALUES_REGEX)

PLUGIN_INFO = {
    C.PI_NAME: "Text syntaxes",
    C.PI_IMPORT_NAME: "TEXT_SYNTAXES",
    C.PI_TYPE: "MISC",
    C.PI_PROTOCOLS: [],
    C.PI_DEPENDENCIES: [],
    C.PI_MAIN: "TextSyntaxes",
    C.PI_HANDLER: "no",
    C.PI_DESCRIPTION: _(
        """Management of various text syntaxes (XHTML-IM, Markdown, etc)"""
    ),
}


class TextSyntaxes(object):
    """Text conversion class

    XHTML utf-8 is used as intermediate language for conversions: every
    registered syntax provides a callback converting to XHTML and a callback
    converting from XHTML.
    """

    OPT_DEFAULT = "DEFAULT"
    OPT_HIDDEN = "HIDDEN"
    OPT_NO_THREAD = "NO_THREAD"
    SYNTAX_XHTML = _SYNTAX_XHTML
    SYNTAX_MARKDOWN = "markdown"
    SYNTAX_TEXT = "text"
    # default_syntax must be lower case
    default_syntax = SYNTAX_XHTML

    def __init__(self, host):
        """Register the built-in syntaxes and the bridge methods

        @param host: main SàT instance; its ``bridge`` and ``memory`` attributes
            are used here
        """
        log.info(_("Text syntaxes plugin initialization"))
        self.host = host
        self.syntaxes = {}

        # template of the parameters XML, filled with self.params_data each time
        # the list of syntaxes changes
        self.params = """
            <params>
            <individual>
            <category name="%(category_name)s" label="%(category_label)s">
                <param name="%(name)s" label="%(label)s" type="list" security="0">
                    %(options)s
                </param>
            </category>
            </individual>
            </params>
        """

        self.params_data = {
            "category_name": CATEGORY,
            "category_label": _(CATEGORY),
            "name": NAME,
            "label": _(NAME),
            "syntaxes": self.syntaxes,
        }

        # XHTML is the pivot syntax: both conversions are the identity
        # flags must be a list: with a bare string, "in flags" would be a
        # substring test
        self.addSyntax(
            self.SYNTAX_XHTML,
            defer.succeed,
            defer.succeed,
            [TextSyntaxes.OPT_NO_THREAD],
        )
        # TODO: text => XHTML should add <a/> to url like in frontends
        #       it's probably best to move sat_frontends.tools.strings to
        #       sat.tools.common or similar
        self.addSyntax(
            self.SYNTAX_TEXT,
            escape,
            self._removeMarkups,
            [TextSyntaxes.OPT_HIDDEN],
        )

        # Markdown support is optional: only the imports are guarded, so that an
        # unrelated error during registration is not mistaken for a missing module
        try:
            import markdown
            import html2text
            from markdown.extensions import Extension
        except ImportError:
            log.warning("markdown or html2text not found, can't use Markdown syntax")
            log.info(
                "You can download/install them from https://pythonhosted.org/Markdown/ and https://github.com/Alir3z4/html2text/"
            )
        else:
            # XXX: we disable raw HTML parsing by default, to avoid parsing error
            #      when the user is not aware of markdown and HTML
            class EscapeHTML(Extension):
                def extendMarkdown(self, md):
                    md.preprocessors.deregister('html_block')
                    md.inlinePatterns.deregister('html')

            def _html2text(xhtml, baseurl=""):
                h = html2text.HTML2Text(baseurl=baseurl)
                h.body_width = 0  # do not truncate the lines, it breaks the long URLs
                return h.handle(xhtml)

            self.addSyntax(
                self.SYNTAX_MARKDOWN,
                partial(
                    markdown.markdown,
                    extensions=[
                        EscapeHTML(),
                        'nl2br',
                        'codehilite',
                        'fenced_code',
                        'sane_lists',
                        'tables',
                    ],
                    extension_configs={
                        "codehilite": {
                            "css_class": "highlight",
                        }
                    },
                ),
                _html2text,
                [TextSyntaxes.OPT_DEFAULT],
            )

        host.bridge.addMethod(
            "syntaxConvert",
            ".plugin",
            in_sign="sssbs",
            out_sign="s",
            async_=True,
            method=self.convert,
        )
        host.bridge.addMethod(
            "syntaxGet", ".plugin", in_sign="s", out_sign="s", method=self.getSyntax
        )
        # install our cleaner only if no cleaning method is set yet
        if xml_tools.cleanXHTML is None:
            log.debug("Installing cleaning method")
            xml_tools.cleanXHTML = self.cleanXHTML

    def _updateParamOptions(self):
        """Rebuild the <option/> list of the syntax parameter and push it to memory

        Syntaxes flagged with OPT_HIDDEN are not listed; the others are sorted
        case-insensitively and the current default syntax is marked as selected.
        """
        visible_syntaxes = sorted(
            (
                syntax
                for syntax, data in self.syntaxes.items()
                if TextSyntaxes.OPT_HIDDEN not in data["flags"]
            ),
            key=str.lower,
        )
        default_synt = TextSyntaxes.default_syntax
        options = []

        for syntax in visible_syntaxes:
            selected = 'selected="true"' if syntax == default_synt else ""
            options.append('<option value="%s" %s/>' % (syntax, selected))

        self.params_data["options"] = "\n".join(options)
        self.host.memory.updateParams(self.params % self.params_data)

    def getCurrentSyntax(self, profile):
        """Return the selected syntax for the given profile

        @param profile: %(doc_profile)s
        @return: profile selected syntax
        """
        return self.host.memory.getParamA(NAME, CATEGORY, profile_key=profile)

    def _logError(self, failure, action="converting syntax"):
        """Log a failure and return it, so it can be used as an errback

        @param failure: failure to log
        @param action(unicode): description of what was being done
        @return: the failure, unchanged
        """
        log.error(
            "Error while {action}: {failure}".format(action=action, failure=failure)
        )
        return failure

    def cleanStyle(self, styles):
        """Clean unsafe CSS styles

        Remove styles not in the whitelist, or where the value doesn't match
        the regex. Declarations which can't be split in exactly one
        "key: value" pair, and declarations whose value is "none", are
        dropped too.
        @param styles(unicode): CSS styles (content of a "style" attribute)
        @return (unicode): cleaned styles
        """
        cleaned_styles = []
        for style in styles.split(";"):
            try:
                key, value = style.split(":")
            except ValueError:
                # empty declaration, or more than one ":" in it
                continue
            key = key.lower().strip()
            if key not in STYLES_WHITELIST:
                continue
            value = value.lower().strip()
            if not STYLES_ACCEPTED_VALUE.match(value):
                continue
            if value == "none":
                continue
            cleaned_styles.append((key, value))
        return "; ".join(
            "%s: %s" % (key_, value_) for key_, value_ in cleaned_styles
        )

    def cleanXHTML(self, xhtml):
        """Clean XHTML text by removing potentially dangerous/malicious parts

        @param xhtml(unicode, lxml.etree._Element): raw HTML/XHTML text to clean
        @return (unicode): cleaned XHTML
        @raise etree.ParserError: the text can't be parsed (and is not blank)
        @raise exceptions.DataError: xhtml is neither a string nor an HtmlElement
        """
        if isinstance(xhtml, str):
            try:
                xhtml_elt = html.fromstring(xhtml)
            except etree.ParserError:
                # a blank document can't be parsed, but there is nothing to clean
                if not xhtml.strip():
                    return ""
                log.error("Can't clean XHTML: {xhtml}".format(xhtml=xhtml))
                raise
        elif isinstance(xhtml, html.HtmlElement):
            xhtml_elt = xhtml
        else:
            log.error("Only strings and HtmlElements can be cleaned")
            raise exceptions.DataError
        cleaner = clean.Cleaner(
            style=False, add_nofollow=False, safe_attrs=SAFE_ATTRS
        )
        xhtml_elt = cleaner.clean_html(xhtml_elt)
        for elt in xhtml_elt.xpath("//*[@style]"):
            elt.set("style", self.cleanStyle(elt.get("style")))
        # we remove self-closing elements for non-void elements
        for element in xhtml_elt.iter(tag=etree.Element):
            if not element.text:
                if element.tag in VOID_ELEMENTS:
                    element.text = None
                else:
                    # an empty (non None) text forces an explicit closing tag
                    element.text = ''
        return html.tostring(xhtml_elt, encoding=str, method="xml")

    def convert(self, text, syntax_from, syntax_to=_SYNTAX_XHTML, safe=True,
                profile=None):
        """Convert a text between two syntaxes

        @param text: text to convert
        @param syntax_from: source syntax (e.g. "markdown")
        @param syntax_to: dest syntax (e.g.: "XHTML")
        @param safe: clean resulting XHTML to avoid malicious code if True
        @param profile: needed only when syntax_from or syntax_to is set to
            _SYNTAX_CURRENT
        @return(unicode): converted text
            (a Deferred firing with the converted text is returned)
        @raise exceptions.NotFound: one of the syntaxes is not registered
        """
        # FIXME: convert should be able to handle domish.Element directly
        #        when dealing with XHTML
        # TODO: a way for parser to return parsing errors/warnings

        if syntax_from == _SYNTAX_CURRENT:
            syntax_from = self.getCurrentSyntax(profile)
        else:
            syntax_from = syntax_from.lower().strip()
        if syntax_to == _SYNTAX_CURRENT:
            syntax_to = self.getCurrentSyntax(profile)
        else:
            syntax_to = syntax_to.lower().strip()
        syntaxes = self.syntaxes
        if syntax_from not in syntaxes:
            raise exceptions.NotFound(syntax_from)
        if syntax_to not in syntaxes:
            raise exceptions.NotFound(syntax_to)

        # first step: source syntax => XHTML
        if TextSyntaxes.OPT_NO_THREAD in syntaxes[syntax_from]["flags"]:
            d = defer.maybeDeferred(syntaxes[syntax_from]["to"], text)
        else:
            d = deferToThread(syntaxes[syntax_from]["to"], text)

        # TODO: keep only body element and change it to a div here ?

        if safe:
            d.addCallback(self.cleanXHTML)

        # second step: XHTML => destination syntax
        if TextSyntaxes.OPT_NO_THREAD in syntaxes[syntax_to]["flags"]:
            d.addCallback(syntaxes[syntax_to]["from"])
        else:
            d.addCallback(
                lambda xhtml: deferToThread(syntaxes[syntax_to]["from"], xhtml)
            )

        # converters can add new lines that disturb the microblog change detection
        d.addCallback(lambda text: text.rstrip())
        return d

    def addSyntax(self, name, to_xhtml_cb, from_xhtml_cb, flags=None):
        """Add a new syntax to the manager

        @param name: unique name of the syntax
        @param to_xhtml_cb: callback to convert from syntax to XHTML
        @param from_xhtml_cb: callback to convert from XHTML to syntax
        @param flags: set of optional flags, can be:
            TextSyntaxes.OPT_DEFAULT: use as the default syntax (replace former one)
            TextSyntaxes.OPT_HIDDEN: do not show in parameters
            TextSyntaxes.OPT_NO_THREAD: do not defer to thread when converting
                (the callback may then return a deferred)
            a single flag given as a string is accepted too
        @raise ValueError: OPT_HIDDEN and OPT_DEFAULT are both set
        @raise exceptions.ConflictError: a syntax with this key already exists
        """
        if flags is None:
            flags = []
        elif isinstance(flags, str):
            # a lone flag: wrap it so that "in flags" is a membership test
            # and not a substring test
            flags = [flags]
        if TextSyntaxes.OPT_HIDDEN in flags and TextSyntaxes.OPT_DEFAULT in flags:
            raise ValueError(
                "{} and {} are mutually exclusive".format(
                    TextSyntaxes.OPT_HIDDEN, TextSyntaxes.OPT_DEFAULT
                )
            )

        syntaxes = self.syntaxes
        key = name.lower().strip()
        if key in syntaxes:
            raise exceptions.ConflictError(
                "This syntax key already exists: {}".format(key)
            )
        syntaxes[key] = {
            "name": name,
            "to": to_xhtml_cb,
            "from": from_xhtml_cb,
            "flags": flags,
        }
        if TextSyntaxes.OPT_DEFAULT in flags:
            # the default is stored on the class, not on the instance
            TextSyntaxes.default_syntax = key

        self._updateParamOptions()

    def getSyntax(self, name):
        """get syntax key corresponding to a name

        @param name: name of the syntax (case and surrounding spaces are ignored)
        @return: key of the syntax
        @raise exceptions.NotFound: syntax doesn't exist
        """
        key = name.lower().strip()
        if key in self.syntaxes:
            return key
        raise exceptions.NotFound(name)

    def _removeMarkups(self, xhtml):
        """Remove XHTML markups from the given string.

        @param xhtml: the XHTML string to be cleaned
        @return: the cleaned string
        @raise etree.ParserError: the text can't be parsed (and is not blank)
        """
        try:
            xhtml_elt = html.fromstring(xhtml)
        except etree.ParserError:
            # same handling as in cleanXHTML: a blank document can't be parsed,
            # but it has no markup to remove
            if not xhtml.strip():
                return ""
            raise
        cleaner = clean.Cleaner(kill_tags=["style"])
        cleaned = cleaner.clean_html(xhtml_elt)
        return html.tostring(cleaned, encoding=str, method="text")