diff sat/plugins/plugin_misc_text_syntaxes.py @ 2562:26edcf3a30eb

core, setup: huge cleaning: - moved directories from src and frontends/src to sat and sat_frontends, which is the recommended naming convention - move twisted directory to root - removed all hacks from setup.py, and added missing dependencies, it is now clean - use https URL for website in setup.py - removed "Environment :: X11 Applications :: GTK", as wix is deprecated and removed - renamed sat.sh to sat and fixed its installation - added python_requires to specify Python version needed - replaced glib2reactor which uses deprecated code by gtk3reactor sat can now be installed directly from virtualenv without using --system-site-packages anymore \o/
author Goffi <goffi@goffi.org>
date Mon, 02 Apr 2018 19:44:50 +0200
parents src/plugins/plugin_misc_text_syntaxes.py@0046283a285d
children 56f94936df1e
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/sat/plugins/plugin_misc_text_syntaxes.py	Mon Apr 02 19:44:50 2018 +0200
@@ -0,0 +1,295 @@
+#!/usr/bin/env python2
+# -*- coding: utf-8 -*-
+
+# SAT plugin for managing various text syntaxes
+# Copyright (C) 2009-2018 Jérôme Poisson (goffi@goffi.org)
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Affero General Public License for more details.
+
+# You should have received a copy of the GNU Affero General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+from sat.core.i18n import _, D_
+from sat.core.constants import Const as C
+from sat.core.log import getLogger
+log = getLogger(__name__)
+
+from twisted.internet import defer
+from twisted.internet.threads import deferToThread
+from sat.core import exceptions
+try:
+    from lxml import html
+    from lxml.html import clean
+except ImportError:
+    raise exceptions.MissingModule(u"Missing module lxml, please download/install it from http://lxml.de/")
+from cgi import escape
+import re
+
+
+CATEGORY = D_("Composition")
+NAME = "Syntax"
+_SYNTAX_XHTML = "XHTML"
+_SYNTAX_CURRENT = "@CURRENT@"
+
+# TODO: check/adapt following list
+# list initialy based on feedparser list (http://pythonhosted.org/feedparser/html-sanitization.html)
+STYLES_WHITELIST = ("azimuth", "background-color", "border-bottom-color", "border-collapse", "border-color", "border-left-color", "border-right-color", "border-top-color", "clear", "color", "cursor", "direction", "display", "elevation", "float", "font", "font-family", "font-size", "font-style", "font-variant", "font-weight", "height", "letter-spacing", "line-height", "overflow", "pause", "pause-after", "pause-before", "pitch", "pitch-range", "richness", "speak", "speak-header", "speak-numeral", "speak-punctuation", "speech-rate", "stress", "text-align", "text-decoration", "text-indent", "unicode-bidi", "vertical-align", "voice-family", "volume", "white-space", "width")
+
+SAFE_ATTRS = html.defs.safe_attrs.union(('style', 'poster', 'controls'))
+STYLES_VALUES_REGEX = r'^(' + '|'.join(['([a-z-]+)', # alphabetical names
+                                       '(#[0-9a-f]+)', # hex value
+                                       '(\d+(.\d+)? *(|%|em|ex|px|in|cm|mm|pt|pc))', # values with units (or not)
+                                       'rgb\( *((\d+(.\d+)?), *){2}(\d+(.\d+)?) *\)', # rgb function
+                                       'rgba\( *((\d+(.\d+)?), *){3}(\d+(.\d+)?) *\)', # rgba function
+                                      ]) + ') *(!important)?$' # we accept "!important" at the end
+STYLES_ACCEPTED_VALUE = re.compile(STYLES_VALUES_REGEX)
+
+PLUGIN_INFO = {
+    C.PI_NAME: "Text syntaxes",
+    C.PI_IMPORT_NAME: "TEXT-SYNTAXES",
+    C.PI_TYPE: "MISC",
+    C.PI_PROTOCOLS: [],
+    C.PI_DEPENDENCIES: [],
+    C.PI_MAIN: "TextSyntaxes",
+    C.PI_HANDLER: "no",
+    C.PI_DESCRIPTION: _("""Management of various text syntaxes (XHTML-IM, Markdown, etc)""")
+}
+
+
+class TextSyntaxes(object):
+    """ Text conversion class
+    XHTML utf-8 is used as intermediate language for conversions
+    """
+
+    OPT_DEFAULT = "DEFAULT"
+    OPT_HIDDEN = "HIDDEN"
+    OPT_NO_THREAD = "NO_THREAD"
+    SYNTAX_XHTML = _SYNTAX_XHTML
+    SYNTAX_MARKDOWN = "markdown"
+    SYNTAX_TEXT = "text"
+    syntaxes = {}
+    default_syntax = SYNTAX_XHTML
+
+    params = """
+    <params>
+    <individual>
+    <category name="%(category_name)s" label="%(category_label)s">
+        <param name="%(name)s" label="%(label)s" type="list" security="0">
+            %(options)s
+        </param>
+    </category>
+    </individual>
+    </params>
+    """
+
+    params_data = {
+        'category_name': CATEGORY,
+        'category_label': _(CATEGORY),
+        'name': NAME,
+        'label': _(NAME),
+        'syntaxes': syntaxes,
+        }
+
+    def __init__(self, host):
+        log.info(_("Text syntaxes plugin initialization"))
+        self.host = host
+        self.addSyntax(self.SYNTAX_XHTML, lambda xhtml: defer.succeed(xhtml), lambda xhtml: defer.succeed(xhtml),
+                       TextSyntaxes.OPT_NO_THREAD)
+        # TODO: text => XHTML should add <a/> to url like in frontends
+        #       it's probably best to move sat_frontends.tools.strings to sat.tools.common or similar
+        self.addSyntax(self.SYNTAX_TEXT, lambda text: escape(text), lambda xhtml: self._removeMarkups(xhtml), [TextSyntaxes.OPT_HIDDEN])
+        try:
+            import markdown, html2text
+
+            def _html2text(html, baseurl=''):
+                h = html2text.HTML2Text(baseurl=baseurl)
+                h.body_width = 0  # do not truncate the lines, it breaks the long URLs
+                return h.handle(html)
+            self.addSyntax(self.SYNTAX_MARKDOWN, markdown.markdown, _html2text, [TextSyntaxes.OPT_DEFAULT])
+        except ImportError:
+            log.warning(u"markdown or html2text not found, can't use Markdown syntax")
+            log.info(u"You can download/install them from https://pythonhosted.org/Markdown/ and https://github.com/Alir3z4/html2text/")
+        host.bridge.addMethod("syntaxConvert", ".plugin", in_sign='sssbs', out_sign='s',
+                              async=True, method=self.convert)
+        host.bridge.addMethod("syntaxGet", ".plugin", in_sign='s', out_sign='s',
+                              method=self.getSyntax)
+
+    def _updateParamOptions(self):
+        data_synt = TextSyntaxes.syntaxes
+        default_synt = TextSyntaxes.default_syntax
+        syntaxes = []
+
+        for syntax in data_synt.keys():
+            flags = data_synt[syntax]["flags"]
+            if TextSyntaxes.OPT_HIDDEN not in flags:
+                syntaxes.append(syntax)
+
+        syntaxes.sort(key=lambda synt: synt.lower())
+        options = []
+
+        for syntax in syntaxes:
+            selected = 'selected="true"' if syntax == default_synt else ''
+            options.append(u'<option value="%s" %s/>' % (syntax, selected))
+
+        TextSyntaxes.params_data["options"] = u'\n'.join(options)
+        self.host.memory.updateParams(TextSyntaxes.params % TextSyntaxes.params_data)
+
+    def getCurrentSyntax(self, profile):
+        """ Return the selected syntax for the given profile
+
+        @param profile: %(doc_profile)s
+        @return: profile selected syntax
+        """
+        return self.host.memory.getParamA(NAME, CATEGORY , profile_key=profile)
+
+    def _logError(self, failure, action=u"converting syntax"):
+        log.error(u"Error while {action}: {failure}".format(action=action, failure=failure))
+        return failure
+
+    def cleanXHTML(self, xhtml):
+        """ Clean XHTML text by removing potentially dangerous/malicious parts
+        @param xhtml: raw xhtml text to clean (or lxml's HtmlElement)
+        """
+        def blocking_cleaning(xhtml):
+            """ Clean XHTML and style attributes """
+
+            def clean_style(styles_raw):
+                """" Remove styles not in the whitelist,
+                or where the value doesn't match the regex """
+                styles = styles_raw.split(";")
+                cleaned_styles = []
+                for style in styles:
+                    try:
+                        key, value = style.split(':')
+                    except ValueError:
+                        continue
+                    key = key.lower().strip()
+                    if key not in STYLES_WHITELIST:
+                        continue
+                    value = value.lower().strip()
+                    if not STYLES_ACCEPTED_VALUE.match(value):
+                        continue
+                    if value == "none":
+                        continue
+                    cleaned_styles.append((key, value))
+                return "; ".join(["%s: %s" % (key_, value_) for key_, value_ in cleaned_styles])
+
+            if isinstance(xhtml, basestring):
+                xhtml_elt = html.fromstring(xhtml)
+            elif isinstance(xhtml, html.HtmlElement):
+                xhtml_elt = xhtml
+            else:
+                log.error("Only strings and HtmlElements can be cleaned")
+                raise exceptions.DataError
+            cleaner = clean.Cleaner(style=False,
+                                    add_nofollow=False,
+                                    safe_attrs=SAFE_ATTRS)
+            xhtml_elt = cleaner.clean_html(xhtml_elt)
+            for elt in xhtml_elt.xpath("//*[@style]"):
+                elt.set("style", clean_style(elt.get('style')))
+            return html.tostring(xhtml_elt, encoding=unicode, method='xml')
+
+        d = deferToThread(blocking_cleaning, xhtml)
+        d.addErrback(self._logError, action=u"cleaning syntax")
+        return d
+
+    def convert(self, text, syntax_from, syntax_to=_SYNTAX_XHTML, safe=True, profile=None):
+        """Convert a text between two syntaxes
+
+        @param text: text to convert
+        @param syntax_from: source syntax (e.g. "markdown")
+        @param syntax_to: dest syntax (e.g.: "XHTML")
+        @param safe: clean resulting XHTML to avoid malicious code if True
+        @param profile: needed only when syntax_from or syntax_to is set to _SYNTAX_CURRENT
+        @return(unicode): converted text
+        """
+        # FIXME: convert should be abled to handle domish.Element directly
+        #        when dealing with XHTML
+        # TODO: a way for parser to return parsing errors/warnings
+
+        if syntax_from == _SYNTAX_CURRENT:
+            syntax_from = self.getCurrentSyntax(profile)
+        else:
+            syntax_from = syntax_from.lower().strip()
+        if syntax_to == _SYNTAX_CURRENT:
+            syntax_to = self.getCurrentSyntax(profile)
+        else:
+            syntax_to = syntax_to.lower().strip()
+        syntaxes = TextSyntaxes.syntaxes
+        if syntax_from not in syntaxes:
+            raise exceptions.NotFound(syntax_from)
+        if syntax_to not in syntaxes:
+            raise exceptions.NotFound(syntax_to)
+        d = None
+
+        if TextSyntaxes.OPT_NO_THREAD in syntaxes[syntax_from]["flags"]:
+            d = defer.maybeDeferred(syntaxes[syntax_from]["to"], text)
+        else:
+            d = deferToThread(syntaxes[syntax_from]["to"], text)
+
+        #TODO: keep only body element and change it to a div here ?
+
+        if safe:
+            d.addCallback(self.cleanXHTML)
+
+        if TextSyntaxes.OPT_NO_THREAD in syntaxes[syntax_to]["flags"]:
+            d.addCallback(syntaxes[syntax_to]["from"])
+        else:
+            d.addCallback(lambda xhtml: deferToThread(syntaxes[syntax_to]["from"], xhtml))
+
+        # converters can add new lines that disturb the microblog change detection
+        d.addCallback(lambda text: text.rstrip())
+        return d
+
+    def addSyntax(self, name, to_xhtml_cb, from_xhtml_cb, flags = None):
+        """Add a new syntax to the manager
+
+        @param name: unique name of the syntax
+        @param to_xhtml_cb: callback to convert from syntax to XHTML
+        @param from_xhtml_cb: callback to convert from XHTML to syntax
+        @param flags: set of optional flags, can be:
+            TextSyntaxes.OPT_DEFAULT: use as the default syntax (replace former one)
+            TextSyntaxes.OPT_HIDDEN: do not show in parameters
+            TextSyntaxes.OPT_NO_THREAD: do not defer to thread when converting (the callback may then return a deferred)
+        """
+        flags = flags if flags is not None else []
+        if TextSyntaxes.OPT_HIDDEN in flags and TextSyntaxes.OPT_DEFAULT in flags:
+            raise ValueError(u"{} and {} are mutually exclusive".format(TextSyntaxes.OPT_HIDDEN, TextSyntaxes.OPT_DEFAULT))
+
+        syntaxes = TextSyntaxes.syntaxes
+        key = name.lower().strip()
+        if key in syntaxes:
+            raise exceptions.ConflictError(u"This syntax key already exists: {}".format(key))
+        syntaxes[key] = {"name": name, "to": to_xhtml_cb, "from": from_xhtml_cb, "flags": flags}
+        if TextSyntaxes.OPT_DEFAULT in flags:
+            TextSyntaxes.default_syntaxe = key
+
+        self._updateParamOptions()
+
+    def getSyntax(self, name):
+        """get syntax key corresponding to a name
+
+        @raise exceptions.NotFound: syntax doesn't exist
+        """
+        key = name.lower().strip()
+        if key in self.syntaxes:
+            return key
+        raise exceptions.NotFound
+
+    def _removeMarkups(self, xhtml):
+        """Remove XHTML markups from the given string.
+
+        @param xhtml: the XHTML string to be cleaned
+        @return: the cleaned string
+        """
+        cleaner = clean.Cleaner(kill_tags=['style'])
+        cleaned = cleaner.clean_html(html.fromstring(xhtml))
+        return html.tostring(cleaned, encoding=unicode, method="text")