changeset 832:c4b22aedb7d7

plugin groupblog, XEP-0071, XEP-0277, text_syntaxes: manage raw/rich/xhtml data for content/title: The implementation should follow this formal specification: "title" and "content" data can be passed in raw, xhtml or rich format. When we receive from a frontend a new/updated microblog item: - keys "title" or "content" have to be escaped (disable HTML tags) - keys "title_rich" or "content_rich" have to be converted from the current syntax to XHTML - keys "title_xhtml" or "content_xhtml" have to be cleaned of unwanted XHTML content Rules to deal with concurrent keys: - existence of both "*_xhtml" and "*_rich" keys must raise an exception - existence of both raw and ("*_xhtml" or "*_rich") is OK As the storage always needs raw data, if it is not given by the user it can be extracted from the "*_rich" or "*_xhtml" data (remove the XHTML tags). When a frontend wants to edit a blog post that contains an XHTML title or content, the conversion is made from XHTML to the current user-defined syntax. - plugin text_syntaxes: added "text" syntax (using lxml)
author souliane <souliane@mailoo.org>
date Wed, 05 Feb 2014 16:36:51 +0100
parents d7f9cd8a08cd
children 9bac2fc74968
files src/plugins/plugin_misc_groupblog.py src/plugins/plugin_misc_text_syntaxes.py src/plugins/plugin_xep_0071.py src/plugins/plugin_xep_0277.py src/test/test_plugin_misc_text_syntaxes.py
diffstat 5 files changed, 134 insertions(+), 48 deletions(-) [+]
line wrap: on
line diff
--- a/src/plugins/plugin_misc_groupblog.py	Wed Jan 22 17:10:28 2014 +0100
+++ b/src/plugins/plugin_misc_groupblog.py	Wed Feb 05 16:36:51 2014 +0100
@@ -276,8 +276,10 @@
         """
         node_name = self.getNodeName(client.jid)
         mblog_data = {'content': message}
-        if 'rich' in extra:
-            mblog_data['rich'] = extra['rich']
+
+        for attr in ['content_rich', 'title', 'title_rich']:
+            if attr in extra and extra[attr]:
+                mblog_data[attr] = extra[attr]
         P = self.host.plugins["XEP-0060"]
         access_model_value = ACCESS_TYPE_MAP[access_type]
 
@@ -285,13 +287,13 @@
             # XXX: use the item identifier? http://bugs.goffi.org/show_bug.cgi?id=63
             comments_node = "%s_%s__%s" % (NS_COMMENT_PREFIX, str(uuid.uuid4()), node_name)
             mblog_data['comments'] = "xmpp:%(service)s?%(query)s" % {'service': service.userhost(),
-                                                                     'query': urllib.urlencode([('node',comments_node.encode('utf-8'))])}
+                                                                     'query': urllib.urlencode([('node', comments_node.encode('utf-8'))])}
             _options = {P.OPT_ACCESS_MODEL: access_model_value,
                         P.OPT_PERSIST_ITEMS: 1,
                         P.OPT_MAX_ITEMS: -1,
                         P.OPT_DELIVER_PAYLOADS: 1,
                         P.OPT_SEND_ITEM_SUBSCRIBE: 1,
-                        P.OPT_PUBLISH_MODEL: "subscribers", #TODO: should be open if *both* node and item access_model are open (public node and item)
+                        P.OPT_PUBLISH_MODEL: "subscribers",  # TODO: should be open if *both* node and item access_model are open (public node and item)
                        }
             if access_model_value == 'roster':
                 _options[P.OPT_ROSTER_GROUPS_ALLOWED] = list(access_list)
@@ -325,7 +327,7 @@
             defer_blog.addErrback(self._mblogPublicationFailed)
             return defer_blog
 
-        entry_d =  self.host.plugins["XEP-0277"].data2entry(mblog_data, client.profile)
+        entry_d = self.host.plugins["XEP-0277"].data2entry(mblog_data, client.profile)
         entry_d.addCallback(itemCreated)
         return entry_d
 
@@ -412,8 +414,9 @@
         def initialised(result):
             profile, client = result
             mblog_data = {'content': message}
-            if 'rich' in extra:
-                mblog_data['rich'] = extra['rich']
+            for attr in ['content_rich', 'title', 'title_rich']:
+                if attr in extra and extra[attr]:
+                    mblog_data[attr] = extra[attr]
             service, node, item_id = pub_data
             if comments:
                 node = self.getNodeName(client.jid)
@@ -448,8 +451,9 @@
             profile, client = result
             service, node = self.host.plugins["XEP-0277"].parseCommentUrl(node_url)
             mblog_data = {'content': message}
-            if 'rich' in extra:
-                mblog_data['rich'] = extra['rich']
+            for attr in ['content_rich', 'title', 'title_rich']:
+                if attr in extra and extra[attr]:
+                    mblog_data[attr] = extra[attr]
             if 'allow_comments' in extra:
                 raise NotImplementedError # TODO
             entry_d = self.host.plugins["XEP-0277"].data2entry(mblog_data, profile)
--- a/src/plugins/plugin_misc_text_syntaxes.py	Wed Jan 22 17:10:28 2014 +0100
+++ b/src/plugins/plugin_misc_text_syntaxes.py	Wed Feb 05 16:36:51 2014 +0100
@@ -26,6 +26,7 @@
 from sat.core import exceptions
 from lxml import html
 from lxml.html import clean
+from cgi import escape
 import re
 
 
@@ -70,6 +71,7 @@
     OPT_NO_THREAD = "NO_THREAD"
     SYNTAX_XHTML = _SYNTAX_XHTML
     SYNTAX_MARKDOWN = "markdown"
+    SYNTAX_TEXT = "text"
 
     params = """
     <params>
@@ -99,6 +101,7 @@
         self.syntaxes = {}
         self.addSyntax(self.SYNTAX_XHTML, lambda xhtml: defer.succeed(xhtml), lambda xhtml: defer.succeed(xhtml),
                        TextSyntaxes.OPT_NO_THREAD)
+        self.addSyntax(self.SYNTAX_TEXT, lambda text: escape(text), lambda xhtml: self._removeMarkups(xhtml), [TextSyntaxes.OPT_HIDDEN])
         try:
             import markdown, html2text
             self.addSyntax(self.SYNTAX_MARKDOWN, markdown.markdown, html2text.html2text, [TextSyntaxes.OPT_DEFAULT])
@@ -238,3 +241,12 @@
 
         self._updateParamOptions()
 
+    def _removeMarkups(self, xhtml):
+        """
+        Strip the markup from an XHTML string and keep only its text.
+        @param xhtml: the XHTML string to be cleaned
+        @return: the text serialisation (lxml "text" output method) of the cleaned document
+        """
+        cleaner = clean.Cleaner(kill_tags=['style'])  # <style> elements are killed, i.e. removed with their content
+        cleaned = cleaner.clean_html(html.fromstring(xhtml))  # parse first, then clean the resulting tree
+        return html.tostring(cleaned, method="text")  # "text" output keeps text nodes only, no tags
--- a/src/plugins/plugin_xep_0071.py	Wed Jan 22 17:10:28 2014 +0100
+++ b/src/plugins/plugin_xep_0071.py	Wed Feb 05 16:36:51 2014 +0100
@@ -18,6 +18,7 @@
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
 
 from sat.core.i18n import _
+from sat.core import exceptions
 from logging import debug, info, error
 
 from wokkel import disco, pubsub, iwokkel
@@ -104,9 +105,15 @@
             mess_data['extra']['xhtml'] = xhtml_im
             return mess_data
 
-        rich = mess_data['extra'].pop('rich')
         syntax = self.synt_plg.getCurrentSyntax(profile)
-        d = self.synt_plg.convert(rich, syntax, self.SYNTAX_XHTML_IM)
+        rich = mess_data['extra'].get('rich', '')
+        xhtml = mess_data['extra'].get('xhtml', '')
+        if rich:
+            d = self.synt_plg.convert(rich, syntax, self.SYNTAX_XHTML_IM)
+            if xhtml:
+                raise exceptions.DataError(_("Can't have xhtml and rich content at the same time"))
+        if xhtml:
+            d = self.synt_plg.clean_xhtml(xhtml)
         d.addCallback(syntax_converted)
         return d
 
@@ -126,13 +133,8 @@
     def sendMessageTrigger(self, mess_data, treatments, profile):
         """ Check presence of rich text in extra
         """
-        try:
-            rich = mess_data['extra']['rich']
-            # OK, we have found rich text
+        if 'rich' in mess_data['extra'] or 'xhtml' in mess_data['extra']:
             treatments.addCallback(self._sendMessageAddRich, profile)
-        except KeyError:
-            # No rich text found
-            pass
         return True
 
     def _purgeStyle(self, styles_raw):
--- a/src/plugins/plugin_xep_0277.py	Wed Jan 22 17:10:28 2014 +0100
+++ b/src/plugins/plugin_xep_0277.py	Wed Feb 05 16:36:51 2014 +0100
@@ -84,6 +84,17 @@
 
         return (service, node)
 
+    def __removeXHTMLMarkups(self, xhtml):
+        """
+        Remove XHTML markups by converting from SYNTAX_XHTML to SYNTAX_TEXT with the TEXT-SYNTAXES plugin.
+        @param xhtml: the XHTML string to be cleaned
+        @return: a Deferred instance for the cleaned string
+        """
+        return self.host.plugins["TEXT-SYNTAXES"].convert(xhtml,
+                                                          self.host.plugins["TEXT-SYNTAXES"].SYNTAX_XHTML,
+                                                          self.host.plugins["TEXT-SYNTAXES"].SYNTAX_TEXT,
+                                                          False)  # NOTE(review): meaning of this 4th positional argument is not visible here - confirm against convert()
+
     @defer.inlineCallbacks
     def item2mbdata(self, item):
         """Convert an XML Item to microblog data used in bridge API
@@ -96,17 +107,36 @@
             raise exceptions.DataError('no entry found')
         _entry = atom.Entry().import_xml(entry_elt.toXml().encode('utf-8'))
         microblog_data = {}
+
+        for key in ['title', 'content']:
+            for type_ in ['', 'xhtml']:
+                try:
+                    attr = getattr(_entry, "%s_%s" % (key, type_) if type_ else key)
+                except AttributeError:
+                    continue
+                if not attr.text:
+                    continue
+                try:
+                    content_type = attr.attrs['type'].lower()
+                except KeyError:
+                    content_type = 'text'
+                if content_type == 'xhtml':
+                    # TODO: proper check of body namespace
+                    microblog_data['%s_xhtml' % key] = yield self.host.plugins["TEXT-SYNTAXES"].clean_xhtml(attr.text)
+                else:
+                    microblog_data[key] = attr.text
+            if key not in microblog_data and ('%s_xhtml' % key) in microblog_data:
+                microblog_data[key] = yield self.__removeXHTMLMarkups(microblog_data['%s_xhtml' % key])
+        if 'title' not in microblog_data:
+            raise exceptions.DataError(_("Atom entry misses a title element"))
+        if 'content' not in microblog_data:
+            microblog_data['content'] = microblog_data['title']
+            del microblog_data['title']
+            if 'title_xhtml' in microblog_data:
+                microblog_data['content_xhtml'] = microblog_data['title_xhtml']
+                del microblog_data['title_xhtml']
+
         try:
-            try:
-                content_type =_entry.title.attrs['type'].lower()
-            except KeyError:
-                content_type = 'text'
-            if content_type == 'xhtml':
-                # TODO: proper check of body namespace
-                microblog_data['xhtml'] = yield self.host.plugins["TEXT-SYNTAXES"].clean_xhtml(_entry.title.text)
-                microblog_data['content'] = _entry.title.text # FIXME: must use text version of the microblog, or convert XHTML to text if not available
-            else:
-                microblog_data['content'] = _entry.title.text
             if len(_entry.authors):
                 microblog_data['author'] = _entry.authors[0].name.text
             microblog_data['updated'] = str(int(_entry.updated.tf))
@@ -126,7 +156,6 @@
                 except (KeyError, exceptions.DataError, RuntimeError):
                     warning("Can't parse link")
                     continue
-
         except (AttributeError, KeyError):
             error(_('Error while parsing atom entry for microblogging event'))
             raise exceptions.DataError
@@ -160,20 +189,48 @@
     @defer.inlineCallbacks
     def data2entry(self, data, profile):
         """Convert a data dict to en entry usable to create an item
-        @param data: data dict as given by bridge method
+        @param data: data dict as given by bridge method.
         @return: deferred which fire domish.Element"""
         _uuid = unicode(uuid.uuid1())
         _entry = atom.Entry()
+        _entry.title = ''  # reset the default value which is not empty
 
-        if "rich" in data:
-            synt = self.host.plugins["TEXT-SYNTAXES"]
-            converted = yield synt.convert(data['rich'], synt.getCurrentSyntax(profile), "XHTML")
-            content = u'<div xmlns="%s">%s</div>' % (NS_XHTML, converted)
-            _entry.title.attrs['type'] = 'xhtml'
-        else:
-            content = escape(data['content'])
-            _entry.title.attrs['type'] = 'text'
-        _entry.title = unicode(content).encode('utf-8')
+        elems = {'title': atom.Title, 'content': atom.Content}
+        synt = self.host.plugins["TEXT-SYNTAXES"]
+
+        # loop on ('title', 'title_rich', 'title_xhtml', 'content', 'content_rich', 'content_xhtml')
+        for key in elems.keys():
+            for type_ in ['', 'rich', 'xhtml']:
+                attr = "%s_%s" % (key, type_) if type_ else key
+                if attr in data:
+                    if type_:
+                        if type_ == 'rich':  # convert input from current syntax to XHTML
+                            converted = yield synt.convert(data[attr], synt.getCurrentSyntax(profile), "XHTML")
+                        else:  # clean the XHTML input
+                            converted = yield synt.clean_xhtml(data[attr])
+                        elem = elems[key](u'<div xmlns="%s">%s</div>' % (NS_XHTML, converted))
+                        elem.attrs['type'] = 'xhtml'
+                        if hasattr(_entry, '%s_xhtml' % key):
+                            raise exceptions.DataError(_("Can't have xhtml and rich content at the same time"))
+                        setattr(_entry, '%s_xhtml' % key, elem)
+                    else:  # raw text only needs to be escaped to get HTML-safe sequence
+                        elem = elems[key](escape(data[attr]))
+                        elem.attrs['type'] = 'text'
+                        setattr(_entry, key, elem)
+            if not getattr(_entry, key).text:
+                if hasattr(_entry, '%s_xhtml' % key):
+                    text = yield self.__removeXHTMLMarkups(getattr(_entry, '%s_xhtml' % key).text)
+                    setattr(_entry, key, text)
+        if not _entry.title.text:  # eventually move the data from content to title
+            _entry.title = _entry.content.text
+            _entry.title.attrs['type'] = _entry.content.attrs['type']
+            _entry.content.text = ''
+            _entry.content.attrs['type'] = ''
+            if hasattr(_entry, 'content_xhtml'):
+                _entry.title_xhtml = atom.Title(_entry.content_xhtml.text)
+                _entry.title_xhtml.attrs['type'] = _entry.content_xhtml.attrs['type']
+                _entry.content_xhtml.text = ''
+                _entry.content_xhtml.attrs['type'] = ''
 
         _entry.author = atom.Author()
         _entry.author.name = data.get('author', self.host.getJidNStream(profile)[0].userhost()).encode('utf-8')
--- a/src/test/test_plugin_misc_text_syntaxes.py	Wed Jan 22 17:10:28 2014 +0100
+++ b/src/test/test_plugin_misc_text_syntaxes.py	Wed Feb 05 16:36:51 2014 +0100
@@ -21,16 +21,12 @@
 
 from sat.test import helpers
 from sat.plugins import plugin_misc_text_syntaxes
+import re
 
 
 class SanitisationTest(helpers.SatTestCase):
 
-    def setUp(self):
-        self.host = helpers.FakeSAT()
-        self.text_syntaxes = plugin_misc_text_syntaxes.TextSyntaxes(self.host)
-
-    def test_xhtml_sanitise(self):
-        evil_html = """
+    EVIL_HTML1 = """
    <html>
     <head>
       <script type="text/javascript" src="evil-site"></script>
@@ -57,6 +53,13 @@
     </body>
    </html>"""  # example from lxml: /usr/share/doc/python-lxml-doc/html/lxmlhtml.html#cleaning-up-html
 
+    EVIL_HTML2 = """<p style='display: None; test: blah; background: url(: alert()); color: blue;'>test <strong>retest</strong><br><span style="background-color: (alert('bouh')); titi; color: #cf2828; font-size: 3px; direction: !important; color: red; color: red !important; font-size: 100px       !important; font-size: 100px  ! important; font-size: 100%; font-size: 100ox; font-size: 100px; font-size: 100;;;; font-size: 100 %; color: 100 px 1.7em; color: rgba(0, 0, 0, 0.1); color: rgb(35,79,255); background-color: no-repeat; background-color: :alert(1); color: (alert('XSS')); color: (window.location='http://example.org/'); color: url(:window.location='http://example.org/'); "> toto </span></p>"""
+
+    def setUp(self):
+        self.host = helpers.FakeSAT()
+        self.text_syntaxes = plugin_misc_text_syntaxes.TextSyntaxes(self.host)
+
+    def test_xhtml_sanitise(self):
         expected = """<div>
       <style>/* deleted */</style>
     <body>
@@ -72,15 +75,23 @@
     </img></body>
    </div>"""
 
-        d = self.text_syntaxes.clean_xhtml(evil_html)
+        d = self.text_syntaxes.clean_xhtml(self.EVIL_HTML1)
         d.addCallback(self.assertEqualXML, expected, ignore_blank=True)
         return d
 
     def test_styles_sanitise(self):
-        evil_html = """<p style='display: None; test: blah; background: url(: alert()); color: blue;'>test <strong>retest</strong><br><span style="background-color: (alert('bouh')); titi; color: #cf2828; font-size: 3px; direction: !important; color: red; color: red !important; font-size: 100px       !important; font-size: 100px  ! important; font-size: 100%; font-size: 100ox; font-size: 100px; font-size: 100;;;; font-size: 100 %; color: 100 px 1.7em; color: rgba(0, 0, 0, 0.1); color: rgb(35,79,255); background-color: no-repeat; background-color: :alert(1); color: (alert('XSS')); color: (window.location='http://example.org/'); color: url(:window.location='http://example.org/'); "> toto </span></p>"""
-
         expected = """<p style="color: blue">test <strong>retest</strong><br/><span style="color: #cf2828; font-size: 3px; color: red; color: red !important; font-size: 100px       !important; font-size: 100%; font-size: 100px; font-size: 100; font-size: 100 %; color: rgba(0, 0, 0, 0.1); color: rgb(35,79,255); background-color: no-repeat"> toto </span></p>"""
 
-        d = self.text_syntaxes.clean_xhtml(evil_html)
+        d = self.text_syntaxes.clean_xhtml(self.EVIL_HTML2)
         d.addCallback(self.assertEqualXML, expected)
         return d
+
+    def test_removeXHTMLMarkups(self):  # _removeMarkups must leave only the text of both evil fixtures
+        expected = """ a link another link a paragraph secret EVIL! of EVIL! Password: annoying EVIL!spam spam SPAM! """
+        result = self.text_syntaxes._removeMarkups(self.EVIL_HTML1)
+        self.assertEqual(re.sub(r"\s+", " ", result).rstrip(), expected.rstrip())  # whitespace runs are collapsed to one space before comparing
+
+        expected = """test retest toto"""
+        result = self.text_syntaxes._removeMarkups(self.EVIL_HTML2)
+        self.assertEqual(re.sub(r"\s+", " ", result).rstrip(), expected.rstrip())  # same normalisation as for the first fixture
+