comparison src/plugins/plugin_xep_0071.py @ 668:7bb50096d225

plugin XEP_0071: first draft. A new "xhtml" key is added to the extra data of the bridge's newMessage when rich text is found.
author Goffi <goffi@goffi.org>
date Tue, 05 Nov 2013 22:41:45 +0100
parents
children 98b2400e17d6
comparison
equal deleted inserted replaced
667:a79a6843928c 668:7bb50096d225
1 #!/usr/bin/python
2 # -*- coding: utf-8 -*-
3
4 # SAT plugin for XHTML-IM (xep-0071)
5 # Copyright (C) 2009, 2010, 2011, 2012, 2013 Jérôme Poisson (goffi@goffi.org)
6
7 # This program is free software: you can redistribute it and/or modify
8 # it under the terms of the GNU Affero General Public License as published by
9 # the Free Software Foundation, either version 3 of the License, or
10 # (at your option) any later version.
11
12 # This program is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU Affero General Public License for more details.
16
17 # You should have received a copy of the GNU Affero General Public License
18 # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
20 from logging import debug, info, error
21
22 from wokkel import disco, pubsub, iwokkel
23 from zope.interface import implements
24 # from lxml import etree
25 from lxml import html
26 try:
27 from twisted.words.protocols.xmlstream import XMPPHandler
28 except ImportError:
29 from wokkel.subprotocols import XMPPHandler
30
# Namespace of the <html/> wrapper element looked up in incoming messages,
# also advertised as a service-discovery feature
NS_XHTML_IM = 'http://jabber.org/protocol/xhtml-im'
# Namespace of the <body/> element carried inside that wrapper
NS_XHTML = 'http://www.w3.org/1999/xhtml'

# Plugin metadata. "main" names the plugin class defined in this file and
# "dependencies" lists the plugin it fetches from host.plugins.
# NOTE(review): how the host interprets the other keys is not visible here.
PLUGIN_INFO = {
    "name": "XHTML-IM Plugin",
    "import_name": "XEP-0071",
    "type": "XEP",
    "protocols": ["XEP-0071"],
    "dependencies": ["TEXT-SYNTAXES"],
    "main": "XEP_0071",
    "handler": "yes",
    "description": _("Implementation of XHTML-IM"),
}
44
# Tags kept by the XHTML -> XHTML-IM conversion, each mapped to the set of
# attributes that may stay on it (every other attribute is removed)
allowed = {
    "a": set(("href", "style", "type")),
    "blockquote": set(("style",)),
    "body": set(("style",)),
    "br": set(),
    "cite": set(("style",)),
    "em": set(),
    "img": set(("alt", "height", "src", "style", "width")),
    "li": set(("style",)),
    "ol": set(("style",)),
    "p": set(("style",)),
    "span": set(("style",)),
    "strong": set(),
    "ul": set(("style",)),
}

# CSS properties which may remain inside a "style" attribute
styles_allowed = [
    "background-color",
    "color",
    "font-family",
    "font-size",
    "font-style",
    "font-weight",
    "margin-left",
    "margin-right",
    "text-align",
    "text-decoration",
]

# tag that we have to kill (we don't keep content)
blacklist = ['script']
64
65
class XEP_0071(object):
    """ XHTML-IM (XEP-0071) plugin

    Registers an "XHTML-IM" syntax with the TEXT-SYNTAXES plugin and hooks
    the "MessageReceived" trigger so that, when a message carries an XHTML-IM
    body, its converted form is stored under the "xhtml" key of the
    message's extra data.
    """
    SYNTAX_XHTML_IM = "XHTML-IM"

    def __init__(self, host):
        """ Register the XHTML-IM syntax and the MessageReceived trigger
        @param host: host object, must expose "plugins" (holding the
                     "TEXT-SYNTAXES" plugin) and "trigger"
        """
        info(_("XHTML-IM plugin initialization"))
        self.host = host
        txt_synt_plg = self.host.plugins["TEXT-SYNTAXES"]
        # NOTE(review): presumably the two callables are the "to XHTML" and
        # "from XHTML" converters -- confirm against TEXT-SYNTAXES.addSyntax
        txt_synt_plg.addSyntax(self.SYNTAX_XHTML_IM,
                               lambda xhtml: xhtml,
                               self.XHTML2XHTML_IM,
                               [txt_synt_plg.OPT_HIDDEN])
        host.trigger.add("MessageReceived", self.messageReceivedTrigger)

    def getHandler(self, profile):
        """ Build the XMPP handler attached to this plugin
        @param profile: profile asking for the handler (not used here)
        @return: a new XEP_0071_handler instance
        """
        return XEP_0071_handler(self)

    def _messagePostTreat(self, data, body_elt):
        """ Callback which manage the post treatment of the message in case of XHTML-IM found
        @param data: data send by MessageReceived trigger through post_treat deferred
        @param body_elt: XHTML-IM body element found
        @return: the deferred returned by the syntax conversion, firing with
                 the data once its extra parameter has been updated
        """
        #TODO: check if text only body is empty, then try to convert XHTML-IM to pure text and show a warning message
        def converted(xhtml):
            data['extra']['xhtml'] = xhtml
            return data
        txt_synt_plg = self.host.plugins["TEXT-SYNTAXES"]
        d = txt_synt_plg.convert(body_elt.toXml(), self.SYNTAX_XHTML_IM, safe=True)
        d.addCallback(converted)
        return d

    def messageReceivedTrigger(self, message, post_treat, profile):
        """ Check presence of XHTML-IM in message
        @param message: received message element
        @param post_treat: deferred to which the XHTML-IM post treatment is
                           chained when a rich body is found
        @param profile: profile which received the message (not used here)
        @return: always True
        """
        try:
            # builtin next() rather than the Python 2 only .next() method
            html_elt = next(message.elements(NS_XHTML_IM, 'html'))
            body_elt = next(html_elt.elements(NS_XHTML, 'body'))
        except StopIteration:
            # No XHTML-IM
            pass
        else:
            # OK, we have found rich text
            post_treat.addCallback(self._messagePostTreat, body_elt)
        return True

    def _purgeStyle(self, styles_raw):
        """ Remove unauthorised styles according to the XEP-0071
        @param styles_raw: raw styles (value of the style attribute)
        @return: unicode string holding the kept declarations, formatted as
                 "name: value" and joined with "; " (empty if nothing is kept)
        """
        purged = []

        for declaration in styles_raw.split(';'):
            parts = declaration.split(':')
            if len(parts) != 2:
                # empty chunk, missing value, or a value which itself holds a
                # colon: such declarations are dropped
                continue
            # CSS property names are case-insensitive, so they are normalised
            # before being checked against the lower-case white list
            name = parts[0].strip().lower()
            value = parts[1].strip()
            if not value or name not in styles_allowed:
                continue
            purged.append((name, value))

        return u'; '.join([u"%s: %s" % data for data in purged])

    def XHTML2XHTML_IM(self, xhtml):
        """ Convert XHTML document to XHTML_IM subset
        @param xhtml: raw xhtml to convert
        @return: the resulting body element serialised as XML (unicode)
        """
        # TODO: more clever tag replacement (replace forbidden tags with equivalents when possible)
        # NOTE(review): only attribute *names* and style declarations are
        # filtered; the URL scheme of "href"/"src" values is not checked

        parser = html.HTMLParser(remove_comments=True, encoding='utf-8')
        root = html.fromstring(xhtml, parser=parser)
        body_elt = root.find('body')
        if body_elt is None:
            # we use the whole XML as body if no body element is found
            body_elt = html.Element('body')
            body_elt.append(root)
        else:
            body_elt.attrib.clear()

        to_strip = []
        for elem in body_elt.iter():
            if elem.tag not in allowed:
                # stripping is postponed so the tree is not modified while
                # it is being iterated
                to_strip.append(elem)
                continue
            # we remove unallowed attributes
            attrib = elem.attrib
            for att in set(attrib).difference(allowed[elem.tag]):
                del attrib[att]
            if "style" in attrib:
                purged_style = self._purgeStyle(attrib["style"])
                if purged_style:
                    attrib["style"] = purged_style
                else:
                    # nothing authorised is left: an empty style attribute
                    # would be pointless, so it is removed
                    del attrib["style"]

        for elem in to_strip:
            if elem.tag in blacklist:
                #we need to remove the element and all descendants
                debug(u"removing black listed tag: %s" % (elem.tag))
                elem.drop_tree()
            else:
                elem.drop_tag()

        return html.tostring(body_elt, encoding='unicode', method='xml')
165
class XEP_0071_handler(XMPPHandler):
    """ XMPP handler advertising the XHTML-IM feature through service discovery """
    implements(iwokkel.IDisco)

    def __init__(self, plugin_parent):
        """
        @param plugin_parent: plugin instance which created this handler,
                              its "host" attribute is kept as self.host
        """
        # The base initialiser was previously skipped, leaving whatever state
        # it sets up undefined.
        # NOTE(review): in Twisted/wokkel it initialises "parent" and
        # "xmlstream" to None -- confirm for the version in use
        XMPPHandler.__init__(self)
        self.plugin_parent = plugin_parent
        self.host = plugin_parent.host

    def getDiscoInfo(self, requestor, target, nodeIdentifier=''):
        """ Answer a disco#info request
        @return: list holding the single XHTML-IM feature
        """
        return [disco.DiscoFeature(NS_XHTML_IM)]

    def getDiscoItems(self, requestor, target, nodeIdentifier=''):
        """ Answer a disco#items request
        @return: empty list (no item is exposed)
        """
        return []