# HG changeset patch # User souliane # Date 1391614611 -3600 # Node ID c4b22aedb7d7c9a80cccf1c30ddb6238a706fabd # Parent d7f9cd8a08cd28d1b9906da50931574d89e26508 plugin groupblog, XEP-0071, XEP-0277, text_syntaxes: manage raw/rich/xhtml data for content/title: Implementation should follow the following formal specification: "title" and "content" data can be passed in raw, xhtml or rich format. When we receive from a frontend a new/updated microblog item: - keys "title" or "content" have to be escaped (disable HTML tags) - keys "title_rich" or "content_rich" have to be converted from the current syntax to XHTML - keys "title_xhtml" or "content_xhtml" have to be cleaned from unwanted XHTML content Rules to deal with concurrent keys: - existence of both "*_xhtml" and "*_rich" keys must raise an exception - existence of both raw and ("*_xhtml" or "*_rich") is OK As the storage always need raw data, if it is not given by the user it can be extracted from the "*_rich" or "*_xhtml" data (remove the XHTML tags). When a frontend wants to edit a blog post that contains XHTML title or content, the conversion is made from XHTML to the current user-defined syntax. - plugin text_syntaxes: added "text" syntax (using lxml) diff -r d7f9cd8a08cd -r c4b22aedb7d7 src/plugins/plugin_misc_groupblog.py --- a/src/plugins/plugin_misc_groupblog.py Wed Jan 22 17:10:28 2014 +0100 +++ b/src/plugins/plugin_misc_groupblog.py Wed Feb 05 16:36:51 2014 +0100 @@ -276,8 +276,10 @@ """ node_name = self.getNodeName(client.jid) mblog_data = {'content': message} - if 'rich' in extra: - mblog_data['rich'] = extra['rich'] + + for attr in ['content_rich', 'title', 'title_rich']: + if attr in extra and extra[attr]: + mblog_data[attr] = extra[attr] P = self.host.plugins["XEP-0060"] access_model_value = ACCESS_TYPE_MAP[access_type] @@ -285,13 +287,13 @@ # XXX: use the item identifier? 
http://bugs.goffi.org/show_bug.cgi?id=63 comments_node = "%s_%s__%s" % (NS_COMMENT_PREFIX, str(uuid.uuid4()), node_name) mblog_data['comments'] = "xmpp:%(service)s?%(query)s" % {'service': service.userhost(), - 'query': urllib.urlencode([('node',comments_node.encode('utf-8'))])} + 'query': urllib.urlencode([('node', comments_node.encode('utf-8'))])} _options = {P.OPT_ACCESS_MODEL: access_model_value, P.OPT_PERSIST_ITEMS: 1, P.OPT_MAX_ITEMS: -1, P.OPT_DELIVER_PAYLOADS: 1, P.OPT_SEND_ITEM_SUBSCRIBE: 1, - P.OPT_PUBLISH_MODEL: "subscribers", #TODO: should be open if *both* node and item access_model are open (public node and item) + P.OPT_PUBLISH_MODEL: "subscribers", # TODO: should be open if *both* node and item access_model are open (public node and item) } if access_model_value == 'roster': _options[P.OPT_ROSTER_GROUPS_ALLOWED] = list(access_list) @@ -325,7 +327,7 @@ defer_blog.addErrback(self._mblogPublicationFailed) return defer_blog - entry_d = self.host.plugins["XEP-0277"].data2entry(mblog_data, client.profile) + entry_d = self.host.plugins["XEP-0277"].data2entry(mblog_data, client.profile) entry_d.addCallback(itemCreated) return entry_d @@ -412,8 +414,9 @@ def initialised(result): profile, client = result mblog_data = {'content': message} - if 'rich' in extra: - mblog_data['rich'] = extra['rich'] + for attr in ['content_rich', 'title', 'title_rich']: + if attr in extra and extra[attr]: + mblog_data[attr] = extra[attr] service, node, item_id = pub_data if comments: node = self.getNodeName(client.jid) @@ -448,8 +451,9 @@ profile, client = result service, node = self.host.plugins["XEP-0277"].parseCommentUrl(node_url) mblog_data = {'content': message} - if 'rich' in extra: - mblog_data['rich'] = extra['rich'] + for attr in ['content_rich', 'title', 'title_rich']: + if attr in extra and extra[attr]: + mblog_data[attr] = extra[attr] if 'allow_comments' in extra: raise NotImplementedError # TODO entry_d = self.host.plugins["XEP-0277"].data2entry(mblog_data, profile) 
diff -r d7f9cd8a08cd -r c4b22aedb7d7 src/plugins/plugin_misc_text_syntaxes.py --- a/src/plugins/plugin_misc_text_syntaxes.py Wed Jan 22 17:10:28 2014 +0100 +++ b/src/plugins/plugin_misc_text_syntaxes.py Wed Feb 05 16:36:51 2014 +0100 @@ -26,6 +26,7 @@ from sat.core import exceptions from lxml import html from lxml.html import clean +from cgi import escape import re @@ -70,6 +71,7 @@ OPT_NO_THREAD = "NO_THREAD" SYNTAX_XHTML = _SYNTAX_XHTML SYNTAX_MARKDOWN = "markdown" + SYNTAX_TEXT = "text" params = """ @@ -99,6 +101,7 @@ self.syntaxes = {} self.addSyntax(self.SYNTAX_XHTML, lambda xhtml: defer.succeed(xhtml), lambda xhtml: defer.succeed(xhtml), TextSyntaxes.OPT_NO_THREAD) + self.addSyntax(self.SYNTAX_TEXT, lambda text: escape(text), lambda xhtml: self._removeMarkups(xhtml), [TextSyntaxes.OPT_HIDDEN]) try: import markdown, html2text self.addSyntax(self.SYNTAX_MARKDOWN, markdown.markdown, html2text.html2text, [TextSyntaxes.OPT_DEFAULT]) @@ -238,3 +241,12 @@ self._updateParamOptions() + def _removeMarkups(self, xhtml): + """ + Remove XHTML markups from the given string. + @param xhtml: the XHTML string to be cleaned + @return: the cleaned string + """ + cleaner = clean.Cleaner(kill_tags=['style']) + cleaned = cleaner.clean_html(html.fromstring(xhtml)) + return html.tostring(cleaned, method="text") diff -r d7f9cd8a08cd -r c4b22aedb7d7 src/plugins/plugin_xep_0071.py --- a/src/plugins/plugin_xep_0071.py Wed Jan 22 17:10:28 2014 +0100 +++ b/src/plugins/plugin_xep_0071.py Wed Feb 05 16:36:51 2014 +0100 @@ -18,6 +18,7 @@ # along with this program. If not, see . 
from sat.core.i18n import _ +from sat.core import exceptions from logging import debug, info, error from wokkel import disco, pubsub, iwokkel @@ -104,9 +105,15 @@ mess_data['extra']['xhtml'] = xhtml_im return mess_data - rich = mess_data['extra'].pop('rich') syntax = self.synt_plg.getCurrentSyntax(profile) - d = self.synt_plg.convert(rich, syntax, self.SYNTAX_XHTML_IM) + rich = mess_data['extra'].get('rich', '') + xhtml = mess_data['extra'].get('xhtml', '') + if rich: + d = self.synt_plg.convert(rich, syntax, self.SYNTAX_XHTML_IM) + if xhtml: + raise exceptions.DataError(_("Can't have xhtml and rich content at the same time")) + if xhtml: + d = self.synt_plg.clean_xhtml(xhtml) d.addCallback(syntax_converted) return d @@ -126,13 +133,8 @@ def sendMessageTrigger(self, mess_data, treatments, profile): """ Check presence of rich text in extra """ - try: - rich = mess_data['extra']['rich'] - # OK, we have found rich text + if 'rich' in mess_data['extra'] or 'xhtml' in mess_data['extra']: treatments.addCallback(self._sendMessageAddRich, profile) - except KeyError: - # No rich text found - pass return True def _purgeStyle(self, styles_raw): diff -r d7f9cd8a08cd -r c4b22aedb7d7 src/plugins/plugin_xep_0277.py --- a/src/plugins/plugin_xep_0277.py Wed Jan 22 17:10:28 2014 +0100 +++ b/src/plugins/plugin_xep_0277.py Wed Feb 05 16:36:51 2014 +0100 @@ -84,6 +84,17 @@ return (service, node) + def __removeXHTMLMarkups(self, xhtml): + """ + Remove XHTML markups from the given string. 
+ @param xhtml: the XHTML string to be cleaned + @return: a Deferred instance for the cleaned string + """ + return self.host.plugins["TEXT-SYNTAXES"].convert(xhtml, + self.host.plugins["TEXT-SYNTAXES"].SYNTAX_XHTML, + self.host.plugins["TEXT-SYNTAXES"].SYNTAX_TEXT, + False) + @defer.inlineCallbacks def item2mbdata(self, item): """Convert an XML Item to microblog data used in bridge API @@ -96,17 +107,36 @@ raise exceptions.DataError('no entry found') _entry = atom.Entry().import_xml(entry_elt.toXml().encode('utf-8')) microblog_data = {} + + for key in ['title', 'content']: + for type_ in ['', 'xhtml']: + try: + attr = getattr(_entry, "%s_%s" % (key, type_) if type_ else key) + except AttributeError: + continue + if not attr.text: + continue + try: + content_type = attr.attrs['type'].lower() + except KeyError: + content_type = 'text' + if content_type == 'xhtml': + # TODO: proper check of body namespace + microblog_data['%s_xhtml' % key] = yield self.host.plugins["TEXT-SYNTAXES"].clean_xhtml(attr.text) + else: + microblog_data[key] = attr.text + if key not in microblog_data and ('%s_xhtml' % key) in microblog_data: + microblog_data[key] = yield self.__removeXHTMLMarkups(microblog_data['%s_xhtml' % key]) + if 'title' not in microblog_data: + raise exceptions.DataError(_("Atom entry misses a title element")) + if 'content' not in microblog_data: + microblog_data['content'] = microblog_data['title'] + del microblog_data['title'] + if 'title_xhtml' in microblog_data: + microblog_data['content_xhtml'] = microblog_data['title_xhtml'] + del microblog_data['title_xhtml'] + try: - try: - content_type =_entry.title.attrs['type'].lower() - except KeyError: - content_type = 'text' - if content_type == 'xhtml': - # TODO: proper check of body namespace - microblog_data['xhtml'] = yield self.host.plugins["TEXT-SYNTAXES"].clean_xhtml(_entry.title.text) - microblog_data['content'] = _entry.title.text # FIXME: must use text version of the microblog, or convert XHTML to text if not 
available - else: - microblog_data['content'] = _entry.title.text if len(_entry.authors): microblog_data['author'] = _entry.authors[0].name.text microblog_data['updated'] = str(int(_entry.updated.tf)) @@ -126,7 +156,6 @@ except (KeyError, exceptions.DataError, RuntimeError): warning("Can't parse link") continue - except (AttributeError, KeyError): error(_('Error while parsing atom entry for microblogging event')) raise exceptions.DataError @@ -160,20 +189,48 @@ @defer.inlineCallbacks def data2entry(self, data, profile): """Convert a data dict to en entry usable to create an item - @param data: data dict as given by bridge method + @param data: data dict as given by bridge method. @return: deferred which fire domish.Element""" _uuid = unicode(uuid.uuid1()) _entry = atom.Entry() + _entry.title = '' # reset the default value which is not empty - if "rich" in data: - synt = self.host.plugins["TEXT-SYNTAXES"] - converted = yield synt.convert(data['rich'], synt.getCurrentSyntax(profile), "XHTML") - content = u'
<div xmlns="%s">%s</div>
' % (NS_XHTML, converted) - _entry.title.attrs['type'] = 'xhtml' - else: - content = escape(data['content']) - _entry.title.attrs['type'] = 'text' - _entry.title = unicode(content).encode('utf-8') + elems = {'title': atom.Title, 'content': atom.Content} + synt = self.host.plugins["TEXT-SYNTAXES"] + + # loop on ('title', 'title_rich', 'title_xhtml', 'content', 'content_rich', 'content_xhtml') + for key in elems.keys(): + for type_ in ['', 'rich', 'xhtml']: + attr = "%s_%s" % (key, type_) if type_ else key + if attr in data: + if type_: + if type_ == 'rich': # convert input from current syntax to XHTML + converted = yield synt.convert(data[attr], synt.getCurrentSyntax(profile), "XHTML") + else: # clean the XHTML input + converted = yield synt.clean_xhtml(data[attr]) + elem = elems[key](u'
<div xmlns="%s">%s</div>
' % (NS_XHTML, converted)) + elem.attrs['type'] = 'xhtml' + if hasattr(_entry, '%s_xhtml' % key): + raise exceptions.DataError(_("Can't have xhtml and rich content at the same time")) + setattr(_entry, '%s_xhtml' % key, elem) + else: # raw text only needs to be escaped to get HTML-safe sequence + elem = elems[key](escape(data[attr])) + elem.attrs['type'] = 'text' + setattr(_entry, key, elem) + if not getattr(_entry, key).text: + if hasattr(_entry, '%s_xhtml' % key): + text = yield self.__removeXHTMLMarkups(getattr(_entry, '%s_xhtml' % key).text) + setattr(_entry, key, text) + if not _entry.title.text: # eventually move the data from content to title + _entry.title = _entry.content.text + _entry.title.attrs['type'] = _entry.content.attrs['type'] + _entry.content.text = '' + _entry.content.attrs['type'] = '' + if hasattr(_entry, 'content_xhtml'): + _entry.title_xhtml = atom.Title(_entry.content_xhtml.text) + _entry.title_xhtml.attrs['type'] = _entry.content_xhtml.attrs['type'] + _entry.content_xhtml.text = '' + _entry.content_xhtml.attrs['type'] = '' _entry.author = atom.Author() _entry.author.name = data.get('author', self.host.getJidNStream(profile)[0].userhost()).encode('utf-8') diff -r d7f9cd8a08cd -r c4b22aedb7d7 src/test/test_plugin_misc_text_syntaxes.py --- a/src/test/test_plugin_misc_text_syntaxes.py Wed Jan 22 17:10:28 2014 +0100 +++ b/src/test/test_plugin_misc_text_syntaxes.py Wed Feb 05 16:36:51 2014 +0100 @@ -21,16 +21,12 @@ from sat.test import helpers from sat.plugins import plugin_misc_text_syntaxes +import re class SanitisationTest(helpers.SatTestCase): - def setUp(self): - self.host = helpers.FakeSAT() - self.text_syntaxes = plugin_misc_text_syntaxes.TextSyntaxes(self.host) - - def test_xhtml_sanitise(self): - evil_html = """ + EVIL_HTML1 = """ @@ -57,6 +53,13 @@ """ # example from lxml: /usr/share/doc/python-lxml-doc/html/lxmlhtml.html#cleaning-up-html + EVIL_HTML2 = """

test retest
toto

""" + + def setUp(self): + self.host = helpers.FakeSAT() + self.text_syntaxes = plugin_misc_text_syntaxes.TextSyntaxes(self.host) + + def test_xhtml_sanitise(self): expected = """
@@ -72,15 +75,23 @@
""" - d = self.text_syntaxes.clean_xhtml(evil_html) + d = self.text_syntaxes.clean_xhtml(self.EVIL_HTML1) d.addCallback(self.assertEqualXML, expected, ignore_blank=True) return d def test_styles_sanitise(self): - evil_html = """

test retest
toto

""" - expected = """

test retest
toto

""" - d = self.text_syntaxes.clean_xhtml(evil_html) + d = self.text_syntaxes.clean_xhtml(self.EVIL_HTML2) d.addCallback(self.assertEqualXML, expected) return d + + def test_removeXHTMLMarkups(self): + expected = """ a link another link a paragraph secret EVIL! of EVIL! Password: annoying EVIL!spam spam SPAM! """ + result = self.text_syntaxes._removeMarkups(self.EVIL_HTML1) + self.assertEqual(re.sub(r"\s+", " ", result).rstrip(), expected.rstrip()) + + expected = """test retest toto""" + result = self.text_syntaxes._removeMarkups(self.EVIL_HTML2) + self.assertEqual(re.sub(r"\s+", " ", result).rstrip(), expected.rstrip()) +