changeset 859:64ec04991d9d

plugin XEP-0277: fix pubsub entry parsing using lxml instead of feed.atom
author souliane <souliane@mailoo.org>
date Sun, 23 Feb 2014 13:54:41 +0100 (2014-02-23)
parents 660b3f5b6c78
children 2b98f5631fba
files src/plugins/plugin_xep_0277.py src/test/helpers.py src/test/test_plugin_xep_0277.py
diffstat 3 files changed, 188 insertions(+), 57 deletions(-) [+]
line wrap: on
line diff
--- a/src/plugins/plugin_xep_0277.py	Fri Feb 14 21:24:31 2014 +0100
+++ b/src/plugins/plugin_xep_0277.py	Sun Feb 23 13:54:41 2014 +0100
@@ -25,7 +25,8 @@
 from sat.tools.xml_tools import ElementParser
 
 from wokkel import pubsub
-from feed import atom
+from feed import atom, date
+from lxml import etree
 import uuid
 from time import time
 import urlparse
@@ -33,6 +34,7 @@
 
 NS_MICROBLOG = 'urn:xmpp:microblog:0'
 NS_XHTML = 'http://www.w3.org/1999/xhtml'
+NS_PUBSUB = 'http://jabber.org/protocol/pubsub'
 
 PLUGIN_INFO = {
     "name": "Microblogging over XMPP Plugin",
@@ -83,7 +85,7 @@
         parsed_queries = dict()
         for query in queries:
             parsed_queries.update(urlparse.parse_qs(query))
-        node = parsed_queries.get('node',[''])[0]
+        node = parsed_queries.get('node', [''])[0]
 
         if not node:
             raise exceptions.DataError('Invalid comments link')
@@ -106,79 +108,106 @@
         """Convert an XML Item to microblog data used in bridge API
         @param item: domish.Element of microblog item
         @return: microblog data (dictionary)"""
+
+        def xpath(elt, path):
+            """Return the XPath result for an entry element or its descendants. Works with both:
+            - no namespace declared, i.e. it is inherited from the parent item node --> NS_PUBSUB
+            - empty namespace
+            XXX: check why the received entries have no namespace when they are retrieved
+            from self.host.plugins["XEP-0060"].getItems and they have an empty namespace
+            when they are received with an event.
+            """
+            result = elt.xpath(path)  # first try the path as given (unqualified tag names)
+            if len(result) > 0:
+                return result
+            return elt.xpath('/'.join(['ns:%s' % tag for tag in path.split('/')]), namespaces={'ns': NS_PUBSUB})  # retry with every step qualified by NS_PUBSUB
+
+        # convert a date string to float without dealing with the date format
+        date2float = lambda elt, path: unicode(date.rfc3339.tf_from_timestamp(xpath(elt, path)[0].text))
+
+        item_elt = etree.fromstring(item.toXml().encode('utf-8'))
         try:
-            entry_elt = [child for child in item.elements() if child.name == "entry"][0]
+            entry_elt = xpath(item_elt, 'entry')[0]
         except IndexError:
-            warning(_('No entry element in microblog item'))
-            raise exceptions.DataError('no entry found')
-        _entry = atom.Entry().import_xml(entry_elt.toXml().encode('utf-8'))
+            raise exceptions.DataError(_('No entry found in the pubsub item %s') % item_elt.get('id', ''))
+
         microblog_data = {}
 
-        for key in ['title', 'content']:
-            for type_ in ['', 'xhtml']:
-                try:
-                    attr = getattr(_entry, "%s_%s" % (key, type_) if type_ else key)
-                except AttributeError:
-                    continue
-                if not attr.text:
-                    continue
-                try:
-                    content_type = attr.attrs['type'].lower()
-                except KeyError:
-                    content_type = 'text'
+        for key in ['title', 'content']:  # process the textual elements
+            for attr_elt in xpath(entry_elt, key):
+                attr_content = self.__getLXMLInnerContent(attr_elt)
+                if not attr_content.strip():
+                    continue  # element with empty value
+                content_type = attr_elt.get('type', 'text').lower()
                 if content_type == 'xhtml':
-                    text = self.__decapsulateExtraNS(attr.text)
+                    text = self.__decapsulateExtraNS(attr_content)
                     microblog_data['%s_xhtml' % key] = yield self.host.plugins["TEXT-SYNTAXES"].clean_xhtml(text)
                 else:
-                    microblog_data[key] = attr.text
+                    microblog_data[key] = attr_content
             if key not in microblog_data and ('%s_xhtml' % key) in microblog_data:
                 microblog_data[key] = yield self.__removeXHTMLMarkups(microblog_data['%s_xhtml' % key])
-        if 'title' not in microblog_data:
-            raise exceptions.DataError(_("Atom entry misses a title element"))
-        if 'content' not in microblog_data:
+
+        try:  # check for mandatory elements
+            microblog_data['id'] = xpath(entry_elt, 'id')[0].text
+            microblog_data['updated'] = date2float(entry_elt, 'updated')
+            assert('title' in microblog_data)  # has been processed already
+        except IndexError:
+            error(_("Atom entry %s misses a required element") % item_elt.get('id', ''))
+            raise exceptions.DataError
+
+        if 'content' not in microblog_data:  # use the atom title data as the microblog body content
             microblog_data['content'] = microblog_data['title']
             del microblog_data['title']
             if 'title_xhtml' in microblog_data:
                 microblog_data['content_xhtml'] = microblog_data['title_xhtml']
                 del microblog_data['title_xhtml']
 
+        # recommended and optional elements with a fallback value
         try:
-            if len(_entry.authors):
-                microblog_data['author'] = _entry.authors[0].name.text
-            microblog_data['updated'] = str(int(_entry.updated.tf))
+            microblog_data['published'] = date2float(entry_elt, 'published')
+        except IndexError:
+            microblog_data['published'] = microblog_data['updated']
+
+        # other recommended and optional elements
+        try:
+            link_elt = xpath(entry_elt, "link")[0]
             try:
-                microblog_data['published'] = str(int(_entry.published.tf))
-            except (KeyError, TypeError):
-                microblog_data['published'] = microblog_data['updated']
-            microblog_data['id'] = item['id']
-            for link in _entry.links:
-                try:
-                    if link.attrs["title"] == "comments":
-                        microblog_data['comments'] = link.attrs["href"]
-                        service, node = self.parseCommentUrl(microblog_data["comments"])
-                        microblog_data['comments_service'] = service.full()
-                        microblog_data['comments_node'] = node
-                        break
-                except (KeyError, exceptions.DataError, RuntimeError):
-                    warning(_("Can't parse the link element of pubsub entry %s") % item['id'])
-                    continue
-        except (AttributeError, KeyError):
-            error(_('Error while parsing atom entry for microblogging event'))
-            raise exceptions.DataError
-
-        ##XXX: workaround for Jappix behaviour
-        if not 'author' in microblog_data:
-            from xe import NestElement
-            try:
-                author = NestElement('author')
-                author.import_xml(str(_entry))
-                microblog_data['author'] = author.nick.text
-            except:
-                error(_('Cannot find author'))
-        ##end workaround Jappix
+                assert(link_elt.attrib['title'] == "comments")
+                microblog_data['comments'] = link_elt.attrib['href']
+                service, node = self.parseCommentUrl(microblog_data["comments"])
+                microblog_data['comments_service'] = service.full()
+                microblog_data['comments_node'] = node
+            except (exceptions.DataError, RuntimeError, KeyError):
+                warning(_("Can't parse the link element of pubsub entry %s") % microblog_data['id'])
+        except:
+            pass
+        try:
+            microblog_data['author'] = xpath(entry_elt, 'author/name')[0].text
+        except IndexError:
+            try:  # XXX: workaround for Jappix behaviour
+                microblog_data['author'] = xpath(entry_elt, 'author/nick')[0].text
+            except IndexError:
+                warning(_("Can't find author element in pubsub entry %s") % microblog_data['id'])
 
         defer.returnValue(microblog_data)
 
+    def __getLXMLInnerContent(self, elt):
+        """Return the inner content of an lxml.etree.Element. It is not
+        trivial because the lxml tostring method would return the full
+        content including elt's tag and attributes, and elt.getchildren()
+        would skip a text value which is not within an element..."""
+        return self.__getDomishInnerContent(ElementParser()(etree.tostring(elt)))  # serialize, re-parse with ElementParser, then reuse the domish helper
+
+    def __getDomishInnerContent(self, elt):
+        """Return the inner content of a domish.Element: its children concatenated, without elt's own tag."""
+        result = ''
+        for child in elt.children:
+            try:
+                result += child.toXml()  # child is a domish.Element
+            except AttributeError:
+                result += child  # child is unicode
+        return result
+
     def __decapsulateExtraNS(self, text):
         """Check for XHTML namespace and decapsulate the content so the user
         who wants to modify an entry will see the text that he entered. Also
@@ -187,7 +216,7 @@
         elt = ElementParser()(text)
         if elt.uri != NS_XHTML:
             raise exceptions.DataError(_('Content of type XHTML must declare its namespace!'))
-        return elt.firstChildElement().toXml()
+        return self.__getDomishInnerContent(elt)
 
     def microblogCB(self, itemsEvent, profile):
         d = defer.Deferred()
--- a/src/test/helpers.py	Fri Feb 14 21:24:31 2014 +0100
+++ b/src/test/helpers.py	Sun Feb 23 13:54:41 2014 +0100
@@ -179,7 +179,7 @@
 
         setattr(self, name, checkCall)
 
-    def addMethod(self, name, int_suffix, in_sign, out_sign, method, async=False):
+    def addMethod(self, name, int_suffix, in_sign, out_sign, method, async=False, doc=None):
         pass
 
     def addSignal(self, name, int_suffix, signature):
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/test/test_plugin_xep_0277.py	Sun Feb 23 13:54:41 2014 +0100
@@ -0,0 +1,102 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+
+# SAT: a jabber client
+# Copyright (C) 2009, 2010, 2011, 2012, 2013, 2014 Jérôme Poisson (goffi@goffi.org)
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Affero General Public License for more details.
+
+# You should have received a copy of the GNU Affero General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+""" Plugin XEP-0277 tests """
+
+from sat.test import helpers
+from sat.plugins import plugin_xep_0277
+from sat.plugins import plugin_misc_text_syntaxes
+from sat.tools.xml_tools import ElementParser
+import re
+
+
+class XEP_0277Test(helpers.SatTestCase):
+
+    PUBSUB_ENTRY_1 = """
+    <item id="c745a688-9b02-11e3-a1a3-c0143dd4fe51" xmlns="%s">
+        <entry>
+            <title type="text">&lt;span&gt;titre&lt;/span&gt;</title>
+            <id>c745a688-9b02-11e3-a1a3-c0143dd4fe51</id>
+            <updated>2014-02-21T16:16:39+02:00</updated>
+            <published>2014-02-21T16:16:38+02:00</published>
+            <content type="text">&lt;p&gt;contenu&lt;/p&gt;texte sans balise&lt;p&gt;autre contenu&lt;/p&gt;</content>
+            <content type="xhtml"><div xmlns="http://www.w3.org/1999/xhtml"><p>contenu</p>texte sans balise<p>autre contenu</p></div></content>
+        <author>
+            <name>test1@souliane.org</name>
+        </author>
+    </entry>
+    </item>
+    """ % plugin_xep_0277.NS_PUBSUB
+
+    PUBSUB_ENTRY_2 = """
+    <item id="c745a688-9b02-11e3-a1a3-c0143dd4fe51" xmlns="%s">
+        <entry xmlns=''>
+            <title type="text">&lt;div&gt;titre&lt;/div&gt;</title>
+            <title type="xhtml"><div xmlns="http://www.w3.org/1999/xhtml"><div style="background-image: url('xxx');">titre</div></div></title>
+            <id>c745a688-9b02-11e3-a1a3-c0143dd4fe51</id>
+            <updated>2014-02-21T16:16:39+02:00</updated>
+            <published>2014-02-21T16:16:38+02:00</published>
+            <content type="text">&lt;div&gt;&lt;p&gt;contenu&lt;/p&gt;texte dans balise&lt;p&gt;autre contenu&lt;/p&gt;&lt;/div&gt;</content>
+            <content type="xhtml"><div xmlns="http://www.w3.org/1999/xhtml"><div><p>contenu</p>texte dans balise<p>autre contenu</p></div></div></content>
+        <author>
+            <nick>test1@souliane.org</nick>
+        </author>
+    </entry>
+    </item>
+    """ % plugin_xep_0277.NS_PUBSUB
+
+    def setUp(self):
+        self.host = helpers.FakeSAT()
+
+        class XEP_0163(object):
+            def __init__(self, host):
+                pass
+
+            def addPEPEvent(self, *args):
+                pass
+        self.host.plugins["XEP-0163"] = XEP_0163(self.host)
+        self.host.plugins["TEXT-SYNTAXES"] = plugin_misc_text_syntaxes.TextSyntaxes(self.host)
+        self.plugin = plugin_xep_0277.XEP_0277(self.host)
+
+    def test_item2mbdata_1(self):
+        expected = {'id': 'c745a688-9b02-11e3-a1a3-c0143dd4fe51',
+                    'title': '<span>titre</span>',
+                    'updated': '1392992199.0',
+                    'published': '1392992198.0',
+                    'content': '<p>contenu</p>texte sans balise<p>autre contenu</p>',
+                    'content_xhtml': '<div><p>contenu</p>texte sans balise<p>autre contenu</p></div>',
+                    'author': 'test1@souliane.org'
+                    }
+        d = self.plugin.item2mbdata(ElementParser()(self.PUBSUB_ENTRY_1))
+        d.addCallback(self.assertEqual, expected)
+        return d
+
+    def test_item2mbdata_2(self):
+        expected = {'id': 'c745a688-9b02-11e3-a1a3-c0143dd4fe51',
+                    'title': '<div>titre</div>',
+                    'title_xhtml': '<div style="">titre</div>',
+                    'updated': '1392992199.0',
+                    'published': '1392992198.0',
+                    'content': '<div><p>contenu</p>texte dans balise<p>autre contenu</p></div>',
+                    'content_xhtml': '<div><p>contenu</p>texte dans balise<p>autre contenu</p></div>',
+                    'author': 'test1@souliane.org'
+                    }
+        d = self.plugin.item2mbdata(ElementParser()(self.PUBSUB_ENTRY_2))
+        d.addCallback(self.assertEqual, expected)
+        return d