diff src/plugins/plugin_xep_0071.py @ 668:7bb50096d225

plugin XEP_0071: first draft a new "xhtml" key is added in bridge newMessage's extra data when rich text is found.
author Goffi <goffi@goffi.org>
date Tue, 05 Nov 2013 22:41:45 +0100
parents
children 98b2400e17d6
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/plugins/plugin_xep_0071.py	Tue Nov 05 22:41:45 2013 +0100
@@ -0,0 +1,177 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+
+# SAT plugin for Publish-Subscribe (xep-0071)
+# Copyright (C) 2009, 2010, 2011, 2012, 2013 Jérôme Poisson (goffi@goffi.org)
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Affero General Public License for more details.
+
+# You should have received a copy of the GNU Affero General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+from logging import debug, info, error
+
+from wokkel import disco, pubsub, iwokkel
+from zope.interface import implements
+# from lxml import etree
+from lxml import html
+try:
+    from twisted.words.protocols.xmlstream import XMPPHandler
+except ImportError:
+    from wokkel.subprotocols import XMPPHandler
+
+NS_XHTML_IM = 'http://jabber.org/protocol/xhtml-im'
+NS_XHTML = 'http://www.w3.org/1999/xhtml'
+
+PLUGIN_INFO = {
+    "name": "XHTML-IM Plugin",
+    "import_name": "XEP-0071",
+    "type": "XEP",
+    "protocols": ["XEP-0071"],
+    "dependencies": ["TEXT-SYNTAXES"],
+    "main": "XEP_0071",
+    "handler": "yes",
+    "description": _("""Implementation of XHTML-IM""")
+}
+
+allowed = {
+          "a": set(["href", "style", "type"]),
+          "blockquote": set(["style"]),
+          "body": set(["style"]),
+          "br": set([]),
+          "cite": set(["style"]),
+          "em": set([]),
+          "img": set(["alt", "height", "src", "style", "width"]),
+          "li": set(["style"]),
+          "ol": set(["style"]),
+          "p": set(["style"]),
+          "span": set(["style"]),
+          "strong": set([]),
+          "ul": set(["style"]),
+          }
+
+styles_allowed = ["background-color", "color", "font-family", "font-size", "font-style", "font-weight", "margin-left", "margin-right", "text-align", "text-decoration"]
+
+blacklist = ['script'] # tag that we have to kill (we don't keep content)
+
+
+class XEP_0071(object):
+    SYNTAX_XHTML_IM = "XHTML-IM"
+
+    def __init__(self, host):
+        info(_("XHTML-IM plugin initialization"))
+        self.host = host
+        txt_synt_plg = self.host.plugins["TEXT-SYNTAXES"]
+        txt_synt_plg.addSyntax(self.SYNTAX_XHTML_IM, lambda xhtml: xhtml, self.XHTML2XHTML_IM, [txt_synt_plg.OPT_HIDDEN])
+        host.trigger.add("MessageReceived", self.messageReceivedTrigger)
+
+    def getHandler(self, profile):
+        return XEP_0071_handler(self)
+
+    def _messagePostTreat(self, data, body_elt):
+        """ Callback which manage the post treatment of the message in case of XHTML-IM found
+        @param data: data send by MessageReceived trigger through post_treat deferred
+        @param xhtml_im: XHTML-IM body element found
+        @return: the data with the extra parameter updated
+        """
+        #TODO: check if text only body is empty, then try to convert XHTML-IM to pure text and show a warning message
+        def converted(xhtml):
+            data['extra']['xhtml'] = xhtml
+            return data
+        txt_synt_plg = self.host.plugins["TEXT-SYNTAXES"]
+        d = txt_synt_plg.convert(body_elt.toXml(), self.SYNTAX_XHTML_IM, safe=True)
+        d.addCallback(converted)
+        return d
+
+    def messageReceivedTrigger(self, message, post_treat, profile):
+        """ Check presence of XHTML-IM in message
+        """
+        try:
+            html_elt = message.elements(NS_XHTML_IM, 'html').next()
+            body_elt = html_elt.elements(NS_XHTML, 'body').next()
+            # OK, we have found rich text
+            post_treat.addCallback(self._messagePostTreat, body_elt)
+        except StopIteration:
+            # No XHTML-IM
+            pass
+        return True
+
+    def _purgeStyle(self, styles_raw):
+        """ Remove unauthorised styles according to the XEP-0071
+        @param styles_raw: raw styles (value of the style attribute)
+        """
+        purged = []
+
+        styles = [style.strip().split(':') for style in styles_raw.split(';')]
+
+        for style_tuple in styles:
+            if len(style_tuple) != 2:
+                continue
+            name, value = style_tuple
+            name = name.strip()
+            if name not in styles_allowed:
+                continue
+            purged.append((name, value.strip()))
+
+        return u'; '.join([u"%s: %s" % data for data in purged])
+
+    def XHTML2XHTML_IM(self, xhtml):
+        """ Convert XHTML document to XHTML_IM subset
+        @param xhtml: raw xhtml to convert
+        """
+        # TODO: more clever tag replacement (replace forbidden tags with equivalents when possible)
+
+        parser = html.HTMLParser(remove_comments=True, encoding='utf-8')
+        root = html.fromstring(xhtml, parser=parser)
+        body_elt = root.find('body')
+        if body_elt is None:
+            # we use the whole XML as body if no body element is found
+            body_elt = html.Element('body')
+            body_elt.append(root)
+        else:
+            body_elt.attrib.clear()
+
+        allowed_tags = allowed.keys()
+        to_strip = []
+        for elem in body_elt.iter():
+            if elem.tag not in allowed_tags:
+                to_strip.append(elem)
+            else:
+                # we remove unallowed attributes
+                attrib = elem.attrib
+                att_to_remove = set(attrib).difference(allowed[elem.tag])
+                for att in att_to_remove:
+                    del(attrib[att])
+                if "style" in attrib:
+                    attrib["style"] = self._purgeStyle(attrib["style"])
+
+        for elem in to_strip:
+            if elem.tag in blacklist:
+                #we need to remove the element and all descendants
+                debug(u"removing black listed tag: %s" % (elem.tag))
+                elem.drop_tree()
+            else:
+                elem.drop_tag()
+
+        return html.tostring(body_elt, encoding='unicode', method='xml')
+
+class XEP_0071_handler(XMPPHandler):
+    implements(iwokkel.IDisco)
+
+    def __init__(self, plugin_parent):
+        self.plugin_parent = plugin_parent
+        self.host = plugin_parent.host
+
+    def getDiscoInfo(self, requestor, target, nodeIdentifier=''):
+        return [disco.DiscoFeature(NS_XHTML_IM)]
+
+    def getDiscoItems(self, requestor, target, nodeIdentifier=''):
+        return []