# HG changeset patch # User Goffi # Date 1383687705 -3600 # Node ID 7bb50096d225e0f2dbf659ecefc039b4a2d3ad1a # Parent a79a6843928c995d405e07283dc82a76d37d0ecd plugin XEP_0071: first draft a new "xhtml" key is added in bridge newMessage's extra data when rich text is found. diff -r a79a6843928c -r 7bb50096d225 src/plugins/plugin_xep_0071.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/plugins/plugin_xep_0071.py Tue Nov 05 22:41:45 2013 +0100 @@ -0,0 +1,177 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- + +# SAT plugin for XHTML-IM (xep-0071) +# Copyright (C) 2009, 2010, 2011, 2012, 2013 Jérôme Poisson (goffi@goffi.org) + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>.
"""XHTML-IM (XEP-0071) plugin.

Detects an XHTML-IM payload in received messages and exposes it through the
"xhtml" key of the message's extra data, and registers an "XHTML-IM" syntax
with the TEXT-SYNTAXES plugin so that XHTML can be reduced to the XHTML-IM
subset.
"""

from logging import debug, info, error

from wokkel import disco, pubsub, iwokkel
from zope.interface import implements
# from lxml import etree
from lxml import html
try:
    from twisted.words.protocols.xmlstream import XMPPHandler
except ImportError:
    from wokkel.subprotocols import XMPPHandler

NS_XHTML_IM = 'http://jabber.org/protocol/xhtml-im'
NS_XHTML = 'http://www.w3.org/1999/xhtml'

# NOTE(review): `_` is not defined in this module; presumably it is installed
# as a builtin (gettext) by the host application before plugins are imported.
PLUGIN_INFO = {
    "name": "XHTML-IM Plugin",
    "import_name": "XEP-0071",
    "type": "XEP",
    "protocols": ["XEP-0071"],
    "dependencies": ["TEXT-SYNTAXES"],
    "main": "XEP_0071",
    "handler": "yes",
    "description": _("""Implementation of XHTML-IM""")
}

# Tags kept by XHTML2XHTML_IM, mapped to the attributes kept on each of them.
allowed = {
    "a": set(["href", "style", "type"]),
    "blockquote": set(["style"]),
    "body": set(["style"]),
    "br": set([]),
    "cite": set(["style"]),
    "em": set([]),
    "img": set(["alt", "height", "src", "style", "width"]),
    "li": set(["style"]),
    "ol": set(["style"]),
    "p": set(["style"]),
    "span": set(["style"]),
    "strong": set([]),
    "ul": set(["style"]),
    }

# CSS properties kept by _purgeStyle inside a "style" attribute.
styles_allowed = ["background-color", "color", "font-family", "font-size", "font-style", "font-weight", "margin-left", "margin-right", "text-align", "text-decoration"]

blacklist = ['script']  # tag that we have to kill (we don't keep content)


class XEP_0071(object):
    """Plugin entry point: message trigger and XHTML -> XHTML-IM conversion."""

    SYNTAX_XHTML_IM = "XHTML-IM"

    def __init__(self, host):
        """ Register the XHTML-IM syntax and the MessageReceived trigger
        @param host: host object giving access to plugins and triggers
        """
        info(_("XHTML-IM plugin initialization"))
        self.host = host
        txt_synt_plg = self.host.plugins["TEXT-SYNTAXES"]
        # XHTML-IM is already XHTML, so the first converter is the identity;
        # the second one reduces XHTML to the XHTML-IM subset.
        txt_synt_plg.addSyntax(self.SYNTAX_XHTML_IM, lambda xhtml: xhtml, self.XHTML2XHTML_IM, [txt_synt_plg.OPT_HIDDEN])
        host.trigger.add("MessageReceived", self.messageReceivedTrigger)

    def getHandler(self, profile):
        """ Return the handler advertising XHTML-IM support
        @param profile: profile asking for the handler (not used here)
        @return: a new XEP_0071_handler instance
        """
        return XEP_0071_handler(self)

    def _messagePostTreat(self, data, body_elt):
        """ Callback which manage the post treatment of the message in case of XHTML-IM found
        @param data: data send by MessageReceived trigger through post_treat deferred
        @param body_elt: XHTML-IM body element found
        @return: deferred firing with the data, its extra['xhtml'] key updated
        """
        #TODO: check if text only body is empty, then try to convert XHTML-IM to pure text and show a warning message
        def converted(xhtml):
            data['extra']['xhtml'] = xhtml
            return data

        txt_synt_plg = self.host.plugins["TEXT-SYNTAXES"]
        d = txt_synt_plg.convert(body_elt.toXml(), self.SYNTAX_XHTML_IM, safe=True)
        d.addCallback(converted)
        return d

    def messageReceivedTrigger(self, message, post_treat, profile):
        """ Check presence of XHTML-IM in message
        @param message: received message element
        @param post_treat: deferred on which post treatments are chained
        @param profile: profile receiving the message (not used here)
        @return: always True
        """
        # Only the two lookups may legitimately raise StopIteration; keeping
        # addCallback outside of the try avoids hiding an unrelated error.
        try:
            html_elt = next(message.elements(NS_XHTML_IM, 'html'))
            body_elt = next(html_elt.elements(NS_XHTML, 'body'))
        except StopIteration:
            # No XHTML-IM
            return True
        # OK, we have found rich text
        post_treat.addCallback(self._messagePostTreat, body_elt)
        return True

    def _purgeStyle(self, styles_raw):
        """ Remove unauthorised styles according to the XEP-0071
        @param styles_raw: raw styles (value of the style attribute)
        @return: the kept declarations, formatted as "name: value" and joined
            with "; " (empty string if nothing is kept)
        """
        purged = []
        for declaration in styles_raw.split(';'):
            parts = declaration.split(':')
            if len(parts) != 2:
                # empty chunk, missing value or value containing a colon
                continue
            # CSS property names are case-insensitive: normalise before
            # checking against the (lowercase) white list
            name = parts[0].strip().lower()
            value = parts[1].strip()
            if not value or name not in styles_allowed:
                continue
            purged.append(u"%s: %s" % (name, value))
        return u'; '.join(purged)

    def XHTML2XHTML_IM(self, xhtml):
        """ Convert XHTML document to XHTML_IM subset
        @param xhtml: raw xhtml to convert
        @return: serialised <body/> element containing only allowed tags,
            attributes and styles
        """
        # TODO: more clever tag replacement (replace forbidden tags with equivalents when possible)

        parser = html.HTMLParser(remove_comments=True, encoding='utf-8')
        root = html.fromstring(xhtml, parser=parser)
        body_elt = root.find('body')
        if body_elt is None:
            # we use the whole XML as body if no body element is found
            body_elt = html.Element('body')
            body_elt.append(root)
        else:
            body_elt.attrib.clear()

        # elements are collected first and removed afterwards, so that the
        # tree is not modified while it is being iterated
        to_strip = []
        for elem in body_elt.iter():
            allowed_attrs = allowed.get(elem.tag)
            if allowed_attrs is None:  # "is None": an empty set is a valid entry
                to_strip.append(elem)
                continue
            # we remove unallowed attributes
            attrib = elem.attrib
            for att in set(attrib).difference(allowed_attrs):
                del attrib[att]
            if "style" in attrib:
                purged_style = self._purgeStyle(attrib["style"])
                if purged_style:
                    attrib["style"] = purged_style
                else:
                    # nothing authorised is left: don't keep an empty attribute
                    del attrib["style"]

        for elem in to_strip:
            if elem.tag in blacklist:
                #we need to remove the element and all descendants
                debug(u"removing black listed tag: %s" % (elem.tag))
                elem.drop_tree()
            else:
                # only the tag itself is removed, children and text are kept
                elem.drop_tag()

        return html.tostring(body_elt, encoding='unicode', method='xml')


class XEP_0071_handler(XMPPHandler):
    """Handler advertising the XHTML-IM feature through service discovery."""
    implements(iwokkel.IDisco)

    def __init__(self, plugin_parent):
        """
        @param plugin_parent: XEP_0071 instance which created this handler
        """
        self.plugin_parent = plugin_parent
        self.host = plugin_parent.host

    def getDiscoInfo(self, requestor, target, nodeIdentifier=''):
        """ Return the XHTML-IM feature, whatever the arguments are """
        return [disco.DiscoFeature(NS_XHTML_IM)]

    def getDiscoItems(self, requestor, target, nodeIdentifier=''):
        """ No item is exposed by this plugin """
        return []